├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ └── python-publish.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── data_preparation.py ├── data_transformations.py ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ ├── data_transformations.rst │ ├── define_multi_task_model.rst │ ├── examples.rst │ ├── index.rst │ ├── infering.rst │ ├── license.rst │ ├── multi_task.png │ ├── quickstart.rst │ ├── shared_encoder.rst │ ├── task_formats.rst │ └── training.rst ├── examples ├── answerability_detection │ ├── answerability_detection_msmarco.ipynb │ ├── tasks_file_answerability.yml │ └── transform_file_answerability.yml ├── entailment_detection │ ├── entailment_snli.ipynb │ ├── tasks_file_snli.yml │ └── transform_file_snli.yml ├── intent_ner_fragment │ ├── intent_ner_fragment.ipynb │ ├── snips_data │ │ ├── snips_dev.txt │ │ ├── snips_test.txt │ │ └── snips_train.txt │ ├── tasks_file_snips.yml │ └── transform_file_snips.yml ├── ner_pos_tagging │ ├── coNLL_data │ │ ├── coNLL_testa.txt │ │ ├── coNLL_testb.txt │ │ └── coNLL_train.txt │ ├── ner_pos_tagging_conll.ipynb │ ├── tasks_file_conll.yml │ └── transform_file_conll.yml ├── query_correctness │ ├── query_correctness.ipynb │ ├── query_correctness_data │ │ ├── dev.tsv │ │ ├── test.tsv │ │ └── train.tsv │ ├── tasks_file_query_correctness.yml │ └── transform_file_query_correctness.yml ├── query_pair_similarity │ ├── query_similarity_qqp.ipynb │ ├── tasks_file_qqp.yml │ └── transform_file_qqp.yml ├── query_type_detection │ ├── query_type_detection.ipynb │ ├── tasks_file_querytype.yml │ └── transform_file_querytype.yml └── sentiment_analysis │ ├── IMDb_sentiment_analysis.ipynb │ ├── tasks_file_imdb.yml │ └── transform_file_imdb.yml ├── infer_pipeline.py ├── logger_.py ├── models ├── data_manager.py ├── dropout.py ├── eval.py ├── loss.py └── model.py ├── requirements.txt ├── run_inference.py ├── train.py └── utils ├── data_utils.py ├── eval_metrics.py ├── task_utils.py ├── tranform_functions.py └── transform_utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 
39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # macOS Files 2 | .DS_Store 3 | 4 | #jupyter lab checkpoints 5 | .ipynb_checkpoints 6 | 7 | #docs 8 | docs/source/_build 9 | docs/build 10 | 11 | #data 12 | data 13 | 14 | #vs code 15 | .vscode 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | pip-wheel-metadata/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | db.sqlite3-journal 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 111 | __pypackages__/ 112 | 113 | # Celery stuff 114 | celerybeat-schedule 115 | celerybeat.pid 116 | 117 | # SageMath parsed files 118 | *.sage.py 119 | 120 | # Environments 121 | .env 122 | .venv 123 | env/ 124 | venv/ 125 | ENV/ 126 | env.bak/ 127 | venv.bak/ 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at swapan@haptik.co. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /data_transformations.py: -------------------------------------------------------------------------------- 1 | ''' 2 | For transforming raw data in different formats into the standard tsv format 3 | consumed for multi-task training 4 | ''' 5 | import argparse 6 | import os 7 | from utils.transform_utils import TransformParams 8 | from utils.data_utils import TRANSFORM_FUNCS 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--transform_file', type=str, required=True, 13 | default='transform_file.yml', help="path to the yml transform file") 14 | args = parser.parse_args() 15 | # making transform params 16 | transformParams = TransformParams(args.transform_file) 17 | 18 | for transformName, transformFn in transformParams.transformFnMap.items(): 19 | transformParameters = transformParams.transformParamsMap[transformName] 20 | dataDir = transformParams.readDirMap[transformName] 21 | assert os.path.exists(dataDir), "{} doesn't exist".format(dataDir) 22 | saveDir = transformParams.saveDirMap[transformName] 23 | if not os.path.exists(saveDir): 24 | os.makedirs(saveDir) 25 | isTrain = True 26 | for file in transformParams.readFileNamesMap[transformName]: 27 | # calling the respective transform function on the file 28 | TRANSFORM_FUNCS[transformFn](dataDir = dataDir, readFile=file, 29 | wrtDir=saveDir, transParamDict=transformParameters, 30 | isTrainFile=isTrain) 31 | # only the first file will be considered as train file for making label map 32 | isTrain = False 33 | 34 | 35 | if __name__ == "__main__": 36 | main() -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | sys.path.insert(0, os.path.abspath('../../')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | import sphinx_rtd_theme 23 | project = 'multi-task-NLP' 24 | copyright = '2020, Jio Haptik Technologies Limited' 25 | author = 'saransh mehta' 26 | 27 | # The short X.Y version 28 | version = '' 29 | # The full version, including alpha/beta/rc tags 30 | release = '0.0.1' 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # If your documentation needs a minimal Sphinx version, state it here. 36 | # 37 | # needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.napoleon', 45 | 'sphinx.ext.todo', 46 | 'sphinx.ext.mathjax', 47 | 'sphinx.ext.viewcode', 48 | 'sphinx.ext.githubpages', 49 | 'sphinx_rtd_theme', 50 | 'sphinx.ext.autosectionlabel' 51 | ] 52 | 53 | # Add any paths that contain templates here, relative to this directory. 54 | templates_path = ['_templates'] 55 | 56 | # The suffix(es) of source filenames. 57 | # You can specify multiple suffix as a list of string: 58 | # 59 | source_suffix = ['.rst', '.md'] 60 | #source_suffix = '.rst' 61 | 62 | # The master toctree document. 63 | master_doc = 'index' 64 | 65 | # The language for content autogenerated by Sphinx. 
Refer to documentation 66 | # for a list of supported languages. 67 | # 68 | # This is also used if you do content translation via gettext catalogs. 69 | # Usually you set "language" from the command line for these cases. 70 | language = None 71 | 72 | # List of patterns, relative to source directory, that match files and 73 | # directories to ignore when looking for source files. 74 | # This pattern also affects html_static_path and html_extra_path. 75 | exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] 76 | 77 | # The name of the Pygments (syntax highlighting) style to use. 78 | pygments_style = None 79 | 80 | 81 | # -- Options for HTML output ------------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | 87 | html_theme = "sphinx_rtd_theme" 88 | 89 | # Theme options are theme-specific and customize the look and feel of a theme 90 | # further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | # Custom sidebar templates, must be a dictionary that maps document names 101 | # to template names. 102 | # 103 | # The default sidebars (for documents that don't match any pattern) are 104 | # defined by theme itself. Builtin themes are using these templates by 105 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 106 | # 'searchbox.html']``. 107 | # 108 | # html_sidebars = {} 109 | 110 | 111 | # -- Options for HTMLHelp output --------------------------------------------- 112 | 113 | # Output file base name for HTML help builder. 114 | htmlhelp_basename = 'multi_task_NLPdoc' 115 | 116 | 117 | # -- Options for LaTeX output ------------------------------------------------ 118 | 119 | latex_elements = { 120 | # The paper size ('letterpaper' or 'a4paper'). 121 | # 122 | # 'papersize': 'letterpaper', 123 | 124 | # The font size ('10pt', '11pt' or '12pt'). 125 | # 126 | # 'pointsize': '10pt', 127 | 128 | # Additional stuff for the LaTeX preamble. 129 | # 130 | # 'preamble': '', 131 | 132 | # Latex figure (float) alignment 133 | # 134 | # 'figure_align': 'htbp', 135 | } 136 | 137 | # Grouping the document tree into LaTeX files. List of tuples 138 | # (source start file, target name, title, 139 | # author, documentclass [howto, manual, or own class]). 140 | latex_documents = [ 141 | (master_doc, 'multi_task_NLP.tex', 'multi\\_task\\_NLP Documentation', 142 | 'saransh mehta', 'manual'), 143 | ] 144 | 145 | 146 | # -- Options for manual page output ------------------------------------------ 147 | 148 | # One entry per manual page. List of tuples 149 | # (source start file, name, description, authors, manual section). 150 | man_pages = [ 151 | (master_doc, 'multi_task_nlp', 'multi_task_NLP Documentation', 152 | [author], 1) 153 | ] 154 | 155 | 156 | # -- Options for Texinfo output ---------------------------------------------- 157 | 158 | # Grouping the document tree into Texinfo files. 
List of tuples 159 | # (source start file, target name, title, author, 160 | # dir menu entry, description, category) 161 | texinfo_documents = [ 162 | (master_doc, 'multi_task_NLP', 'multi_task_NLP Documentation', 163 | author, 'multi_task_NLP', 'One line description of project.', 164 | 'Miscellaneous'), 165 | ] 166 | 167 | 168 | # -- Options for Epub output ------------------------------------------------- 169 | 170 | # Bibliographic Dublin Core info. 171 | epub_title = project 172 | 173 | # The unique identifier of the text. This can be an ISBN number 174 | # or the project homepage. 175 | # 176 | # epub_identifier = '' 177 | 178 | # A unique identification for the text. 179 | # 180 | # epub_uid = '' 181 | 182 | # A list of files that should not be packed into the epub file. 183 | epub_exclude_files = ['search.html'] 184 | 185 | 186 | # -- Extension configuration ------------------------------------------------- 187 | 188 | # -- Options for todo extension ---------------------------------------------- 189 | 190 | # If true, `todo` and `todoList` produce output, else they produce nothing. 191 | todo_include_todos = True 192 | -------------------------------------------------------------------------------- /docs/source/data_transformations.rst: -------------------------------------------------------------------------------- 1 | Data transformations 2 | ==================== 3 | 4 | It is very likely that the data you have is not in the format required by the library. 5 | Hence, data transformations provide a way to convert raw data into the required standard tsv format. 6 | 7 | Transform functions 8 | ------------------- 9 | 10 | Transform functions are predefined functions that perform these transformations. 11 | Each function is defined to take raw data in a certain format, perform the defined transformation steps and 12 | create the respective ``tsv`` file. 13 | 14 | Sample transform functions 15 | ^^^^^^^^^^^^^^^^^^^^^^^^^^ 16 | .. automodule:: utils.tranform_functions 17 | :members: snips_intent_ner_to_tsv, snli_entailment_to_tsv, create_fragment_detection_tsv, 18 | msmarco_answerability_detection_to_tsv, msmarco_query_type_to_tsv, bio_ner_to_tsv, coNLL_ner_pos_to_tsv, qqp_query_similarity_to_tsv, 19 | query_correctness_to_tsv, imdb_sentiment_data_to_tsv 20 | 21 | Your own transform function 22 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 23 | In case you need to convert data in some custom format into the standard tsv format, you can do so 24 | by writing your own transform function. Keep the following points in mind while writing your function: 25 | 26 | - The function must take the standard input arguments like the :ref:`sample transform functions`. Any extra function-specific parameters can be passed through the ``transParamDict`` argument. 27 | 28 | - You should add the function to the ``utils/tranform_functions.py`` file. 29 | 30 | - You should add a name map for the function in the ``utils/data_utils.py`` file under the ``TRANSFORM_FUNCS`` map. This 31 | step is required for the transform file to recognize your function. 32 | 33 | - You can then use your function in the :ref:`transform file`; a minimal sketch of a custom transform function follows below. 34 |
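For illustration, here is a minimal sketch of what a custom transform function could look like. The signature mirrors the standard arguments with which ``data_transformations.py`` invokes every entry of the ``TRANSFORM_FUNCS`` map (``dataDir``, ``readFile``, ``wrtDir``, ``transParamDict``, ``isTrainFile``); the function name, the assumed raw input format and the output tsv layout are purely hypothetical and should be adapted to your data.

.. code-block:: python

    import os

    def my_custom_to_tsv(dataDir, readFile, wrtDir, transParamDict, isTrainFile=False):
        # 'col_sep' is an assumed extra parameter supplied via transform_params
        sep = transParamDict.get('col_sep', ',')
        wrtFile = os.path.join(wrtDir, '{}.tsv'.format(readFile.split('.')[0]))
        with open(os.path.join(dataDir, readFile)) as inFile, open(wrtFile, 'w') as outFile:
            for uid, line in enumerate(inFile):
                # assumed raw format: <label><sep><sentence> per line
                label, sentence = line.strip().split(sep, 1)
                outFile.write('{}\t{}\t{}\n'.format(uid, label, sentence))
        # when isTrainFile is True, this is also the place to dump a label map
        # file if your task needs one (cf. data_transformations.py)

Once registered under a name in the ``TRANSFORM_FUNCS`` map, a transform file can refer to this function via its ``transform_func`` parameter.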
35 | Transform File 36 | -------------- 37 | 38 | You can easily use the sample transformation functions or your own transformation function 39 | by defining a YAML format ``transform_file``. Say you want to perform these transformations - 40 | **sample_transform1**, **sample_transform2**, ..., **sample_transform5**. 41 | 42 | Following is an example of the transform file, 43 | :: 44 | 45 | sample_transform1: 46 | transform_func: snips_intent_ner_to_tsv 47 | read_file_names: 48 | - snips_train.txt 49 | - snips_dev.txt 50 | - snips_test.txt 51 | read_dir: snips_data 52 | save_dir: demo_transform 53 | 54 | 55 | sample_transform2: 56 | transform_func: snli_entailment_to_tsv 57 | read_file_names: 58 | - snli_train.jsonl 59 | - snli_dev.jsonl 60 | - snli_test.jsonl 61 | read_dir : snli_data 62 | save_dir: demo_transform 63 | 64 | sample_transform3: 65 | transform_func: bio_ner_to_tsv 66 | transform_params: 67 | save_prefix : sample 68 | tag_col : 1 69 | col_sep : " " 70 | sen_sep : "\n" 71 | read_file_names: 72 | - coNLL_train.txt 73 | - coNLL_testa.txt 74 | - coNLL_testb.txt 75 | 76 | read_dir: coNLL_data 77 | save_dir: demo_transform 78 | 79 | sample_transform4: 80 | transform_func: fragment_detection_to_tsv 81 | transform_params: 82 | data_frac : 0.2 83 | seq_len_right : 3 84 | seq_len_left : 2 85 | sep : "\t" 86 | query_col : 2 87 | read_file_names: 88 | - int_snips_train.tsv 89 | - int_snips_dev.tsv 90 | - int_snips_test.tsv 91 | 92 | read_dir: data 93 | save_dir: demo_transform 94 | 95 | sample_transform5: 96 | transform_func: msmarco_query_type_to_tsv 97 | transform_params: 98 | data_frac : 0.2 99 | read_file_names: 100 | - train_v2.1.json 101 | - dev_v2.1.json 102 | - eval_v2.1_public.json 103 | 104 | read_dir: msmarco_qna_data 105 | save_dir: demo_transform 106 | 107 | 108 | NOTE:- The transform names (sample_transform1, sample_transform2, ...) are unique identifiers for the transforms, hence they must always be distinct. 109 | 110 | Transform file parameters 111 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 112 | 113 | Detailed description of the parameters available in the transform file. 114 | 115 | - ``transform_func`` `(required)` : Name of the :ref:`transform function` to use. 116 | - ``transform_params`` `(optional)` : Dictionary of function-specific parameters which will go into the ``transParamDict`` parameter of the function. 117 | - ``read_file_names`` `(required)` : List of raw data files for transformations. The first file will be considered as the **train file** and will be used to create the label 118 | map file when required. 119 | - ``read_dir`` `(required)` : Directory containing the input files. 120 | - ``save_dir`` `(required)` : Directory to save the transformed tsv/label map files. 121 | 122 | 123 | Running data transformations 124 | ---------------------------- 125 | 126 | Once you have made the :ref:`transform file` with all the transform operations, 127 | you can run data transformations with the following terminal command. 128 | 129 | .. code-block:: console 130 | 131 | $ python data_transformations.py \ 132 | --transform_file 'transform_file.yml' 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /docs/source/define_multi_task_model.rst: -------------------------------------------------------------------------------- 1 | How to define your multi-task model? 2 | ==================================== 3 | 4 | Let’s consider you have three tasks - **TaskA**, **TaskB** and **TaskC** - to train together. TaskA is of single sentence classification type, 5 | TaskB is of NER type and TaskC is of sentence pair classification type. 6 | You can define a task file mentioning the required details about the tasks in the following YAML format.
7 | :: 8 | 9 | TaskA: 10 | model_type: BERT 11 | config_name: bert-base-uncased 12 | dropout_prob: 0.05 13 | label_map_or_file: 14 | - label1 15 | - label2 16 | - label3 17 | metrics: 18 | - accuracy 19 | loss_type: CrossEntropyLoss 20 | task_type: SingleSenClassification 21 | file_names: 22 | - taskA_train.tsv 23 | - taskA_dev.tsv 24 | - taskA_test.tsv 25 | 26 | TaskB: 27 | model_type: BERT 28 | config_name: bert-base-uncased 29 | dropout_prob: 0.3 30 | label_map_or_file: data/taskB_train_label_map.joblib 31 | metrics: 32 | - seq_f1 33 | - seq_precision 34 | - seq_recall 35 | loss_type: NERLoss 36 | task_type: NER 37 | file_names: 38 | - taskB_train.tsv 39 | - taskB_dev.tsv 40 | - taskB_test.tsv 41 | 42 | TaskC: 43 | model_type: BERT 44 | config_name: bert-base-uncased 45 | dropout_prob: 0.05 46 | metrics: 47 | - accuracy 48 | loss_type: CrossEntropyLoss 49 | class_num: 2 50 | task_type: SentencePairClassification 51 | file_names: 52 | - taskC_train.tsv 53 | - taskC_dev.tsv 54 | - taskC_test.tsv 55 | 56 | A few points to keep in mind while making the task file: 57 | 58 | - This file should contain all the tasks for which you want to train a single model. 59 | - The file can have either a single task or multiple tasks. In case only a single task is mentioned, the model will act like a single-task model. 60 | - The task names (TaskA, TaskB and TaskC) are unique identifiers for the tasks, hence they must always be distinct. 61 | - The model type for all the tasks mentioned in the file must be the same, as the library uses a single shared encoder model for all these tasks. 62 | 63 | Task file parameters 64 | -------------------- 65 | 66 | Detailed description of the parameters available in the task file. 67 | 68 | - ``task_type`` `(required)` : Format of the task as described in :ref:`Task types`. 69 | 70 | - ``file_names`` `(required)` : List of standard data tsv file names required for the task. The first file is considered as the **train** file, the second file as the **dev** file and the third file as the **test** file. 71 | 72 | - ``model_type`` `(required)` : Type of shared encoder model to use. The model type for all the tasks mentioned in the file must be the same. You can refer to :ref:`Model type` for selecting the model type. 73 | 74 | - ``config_name`` `(optional)` : Config of the encoder model. You can refer to :ref:`Model type` for selecting the model type config. In case this parameter is not present, the default config will be used. 75 | 76 | - ``class_num`` `(required/optional)` : Number of classes present for classification. This parameter is optional if label_map_or_file is provided, required otherwise. 77 | 78 | - ``label_map_or_file`` `(required/optional)` : 79 | 80 | - In case labels are strings, this is the list of unique labels. 81 | - You can also give a joblib dumped dictionary map file like {‘label1’:0, ‘label2’:1, ..}; a sketch for creating such a file follows this list. 82 | - If you’re using :ref:`Data Transformations` to create the data files, give the path to the label_map file created along with the transformed files here. 83 | 84 | - ``loss_type`` `(required)` : Type of loss for training as defined in :ref:`Losses`. 85 | 86 | - ``dropout_prob`` `(optional)`: Dropout probability to use between encoder hidden outputs and task-specific headers. 87 | 88 | - ``metrics`` `(optional)` : List of metrics to use during evaluation as defined in :ref:`Metrics`. 89 | 90 | - ``loss_weight`` `(optional)`: Loss weight value (between 0 and 1) for the individual task.
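To illustrate the ``label_map_or_file`` parameter above, here is a minimal sketch of creating a joblib dumped label map file. The label names and path are hypothetical (mirroring the TaskB example); only ``joblib.dump``/``joblib.load`` are assumed.

.. code-block:: python

    import joblib

    # hypothetical label map; the values become the class indices
    label_map = {'label1': 0, 'label2': 1, 'label3': 2}
    joblib.dump(label_map, 'data/taskA_label_map.joblib')

    # the task file can now point to this path under label_map_or_file
    # instead of listing the labels inline
    assert joblib.load('data/taskA_label_map.joblib') == label_map

Note that when a label map file is given this way, ``class_num`` can be omitted, since the number of classes is implied by the map.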
91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /docs/source/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | =========== 3 | Here you can find various NLP (especially conversational AI) tasks as examples, which you can train in either a multi-task or a single-task manner by following the simple steps mentioned in the notebooks. 4 | 5 | Example-1 Intent detection, NER, Fragment detection 6 | --------------------------------------------------- 7 | 8 | **Tasks Description** 9 | 10 | ``Intent Detection`` :- This is a single sentence classification task where an `intent` specifies which class the data sample belongs to. 11 | 12 | ``NER`` :- This is a Named Entity Recognition/ Sequence Labelling/ Slot filling task where individual words of the sentence are tagged with the entity label they belong to. The words which don't belong to any entity label are simply labeled as "O". 13 | 14 | ``Fragment Detection`` :- This is modeled as a single sentence classification task which detects whether a sentence is incomplete (fragment) or not (non-fragment). 15 | 16 | **Conversational Utility** :- Intent detection is one of the fundamental components of a conversational system as it gives a broad understanding of the category/domain the sentence/query belongs to. 17 | 18 | NER helps in extracting values for required entities (eg. location, date-time) from the query. 19 | 20 | Fragment detection is a very useful piece in a conversational system as knowing if a query/sentence is incomplete can aid in discarding bad queries beforehand. 21 | 22 | **Intent Detection** 23 | 24 | Query: I need a reservation for a bar in bangladesh on feb the 11th 2032 25 | 26 | Intent: BookRestaurant 27 | 28 | **NER** 29 | 30 | 31 | Query: ['book', 'a', 'spot', 'for', 'ten', 'at', 'a', 'top-rated', 'caucasian', 'restaurant', 'not', 'far', 'from', 'selmer'] 32 | 33 | NER tags: ['O', 'O', 'O', 'O', 'B-party_size_number', 'O', 'O', 'B-sort', 'B-cuisine', 'B-restaurant_type', 'B-spatial_relation', 'I-spatial_relation', 'O', 'B-city'] 34 | 35 | 36 | **Fragment Detection** 37 | 38 | 39 | Query: a reservation for 40 | 41 | Label: fragment 42 | 43 | 44 | **Notebook** :- `intent_ner_fragment `_ 45 | 46 | **Transform file** :- `transform_file_snips `_ 47 | 48 | **Tasks file** :- `tasks_file_snips `_ 49 | 50 | Example-2 Recognising Textual Entailment 51 | ---------------------------------------- 52 | 53 | **Tasks Description** 54 | 55 | ``Entailment`` :- This is a sentence pair classification task which determines whether the second sentence in a sample can be inferred from the first. 56 | 57 | **Conversational Utility** :- In a conversational AI context, this task can be seen as determining whether the second sentence is similar to the first or not. Additionally, the probability score can also be used as a similarity score between the sentences. 58 | 59 | Query1: An old man with a package poses in front of an advertisement. 60 | 61 | Query2: A man poses in front of an ad. 62 | 63 | Label: entailment 64 | 65 | Query1: An old man with a package poses in front of an advertisement. 66 | 67 | Query2: A man poses in front of an ad for beer.
68 | 69 | Label: non-entailment 70 | 71 | 72 | 73 | **Notebook** :- `entailment_snli `_ 74 | 75 | **Transform file** :- `transform_file_snli `_ 76 | 77 | **Tasks file** :- `tasks_file_snli `_ 78 | 79 | 80 | 81 | Example-3 Answerability detection 82 | --------------------------------- 83 | **Tasks Description** 84 | 85 | ``answerability`` :- This is modeled as a sentence pair classification task where the first sentence is a query and the second sentence is a context passage. The objective of this task is to determine whether the query can be answered from the context passage or not. 86 | 87 | **Conversational Utility** :- This can be a useful component for building a question-answering/ machine comprehension based system. In such cases, it becomes very important to determine whether the given query can be answered with the given context passage or not before extracting/abstracting an answer from it. Performing question-answering for a query which is not answerable from the context could lead to incorrect answer extraction. 88 | 89 | Query: how much money did evander holyfield make 90 | 91 | Context: Evander Holyfield Net Worth. How much is Evander Holyfield Worth? Evander Holyfield Net Worth: Evander Holyfield is a retired American professional boxer who has a net worth of $500 thousand. A professional boxer, Evander Holyfield has fought at the Heavyweight, Cruiserweight, and Light-Heavyweight Divisions, and won a Bronze medal a the 1984 Olympic Games. 92 | 93 | Label: answerable 94 | 95 | **Notebook** :- `answerability_detection_msmarco `_ 96 | 97 | **Transform file** :- `transform_file_answerability `_ 98 | 99 | **Tasks file** :- `tasks_file_answerability `_ 100 | 101 | Example-4 Query type detection 102 | ------------------------------ 103 | 104 | **Tasks Description** 105 | 106 | ``querytype`` :- This is a single sentence classification task to determine what type (category) of answer is expected for the given query. The queries are divided into 5 major classes according to the answer expected for them. 107 | 108 | **Conversational Utility** :- While returning a response for a query, knowing what kind of answer is expected for the query can help in both curating and cross-verifying an answer according to the type. 109 | 110 | Query: what's the distance between destin florida and birmingham alabama? 111 | 112 | Label: NUMERIC 113 | 114 | Query: who is suing scott wolter 115 | 116 | Label: PERSON 117 | 118 | 119 | 120 | **Notebook** :- `query_type_detection `_ 121 | 122 | **Transform file** :- `transform_file_querytype `_ 123 | 124 | **Tasks file** :- `tasks_file_querytype `_ 125 | 126 | Example-5 POS tagging, NER tagging 127 | ---------------------------------- 128 | 129 | **Tasks Description** 130 | 131 | ``NER`` :- This is a Named Entity Recognition task where individual words of the sentence are tagged with the entity label they belong to. The words which don't belong to any entity label are simply labeled as "O". 132 | 133 | ``POS`` :- This is a Part of Speech tagging task. A part of speech is a category of words that have similar grammatical properties. Each word of the sentence is tagged with the part of speech label it belongs to. The words which don't belong to any part of speech label are simply labeled as "O". 134 | 135 | **Conversational Utility** :- In a conversational AI context, determining the syntactic parts of the sentence can help in extracting noun-phrases or important keyphrases from the sentence.
136 | 137 | Query: ['Despite', 'winning', 'the', 'Asian', 'Games', 'title', 'two', 'years', 'ago', ',', 'Uzbekistan', 'are', 'in', 'the', 'finals', 'as', 'outsiders', '.'] 138 | 139 | NER tags: ['O', 'O', 'O', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] 140 | 141 | POS tags: ['I-PP', 'I-VP', 'I-NP', 'I-NP', 'I-NP', 'I-NP', 'B-NP', 'I-NP', 'I-ADVP', 'O', 'I-NP', 'I-VP', 'I-PP', 'I-NP', 'I-NP', 'I-SBAR', 'I-NP', 'O'] 142 | 143 | 144 | 145 | **Notebook** :- `ner_pos_tagging_conll `_ 146 | 147 | **Transform file** :- `transform_file_conll `_ 148 | 149 | **Tasks file** :- `tasks_file_conll `_ 150 | 151 | Example-6 Query correctness 152 | --------------------------- 153 | 154 | **Tasks Description** 155 | 156 | ``querycorrectness`` :- This is modeled as a single sentence classification task identifying whether or not a query is structurally well formed. 157 | 158 | **Conversational Utility** :- Determining whether a query is well structured would help in enhancing query understanding and improve the reliability of tasks which depend on query structure to extract information. 159 | 160 | Query: What places have the oligarchy government ? 161 | 162 | Label: well-formed 163 | 164 | Query: What day of Diwali in 1980 ? 165 | 166 | Label: not well-formed 167 | 168 | 169 | 170 | **Notebook** :- `query_correctness `_ 171 | 172 | **Transform file** :- `transform_file_query_correctness `_ 173 | 174 | **Tasks file** :- `tasks_file_query_correctness `_ 175 | 176 | 177 | Example-7 Query similarity 178 | -------------------------- 179 | 180 | **Tasks Description** 181 | 182 | ``Query similarity`` :- This is a sentence pair classification task which determines whether the two queries in a sample are similar or not. 183 | 184 | **Conversational Utility** :- In a conversational AI context, this task can be seen as determining whether the second query is similar to the first or not. Additionally, the probability score can also be used as a similarity score between the queries. 185 | 186 | 187 | Query1: What is the most used word in Malayalam? 188 | 189 | Query2: What is meaning of the Malayalam word ""thumbatthu""? 190 | 191 | Label: not similar 192 | 193 | Query1: Which is the best compliment you have ever received? 194 | 195 | Query2: What's the best compliment you've got? 196 | 197 | Label: similar 198 | 199 | 200 | **Notebook** :- `query_similarity `_ 201 | 202 | **Transform file** :- `transform_file_qqp `_ 203 | 204 | **Tasks file** :- `tasks_file_qqp `_ 205 | 206 | Example-8 Sentiment Analysis 207 | ---------------------------- 208 | 209 | **Tasks Description** 210 | 211 | ``sentiment`` :- This is modeled as a single sentence classification task to determine whether a piece of text conveys a positive or negative sentiment. 212 | 213 | **Conversational Utility** :- To determine whether a review is positive or negative. 214 | 215 | Review: What I enjoyed most in this film was the scenery of Corfu, being Greek I adore my country and I liked the flattering director's point of view. Based on a true story during the years when Greece was struggling to stand on her own two feet through war, Nazis and hardship. 216 | An Italian soldier and a Greek girl fall in love but the times are hard and they have a lot of sacrifices to make. Nicholas Cage looking great in a uniform gives a passionate account of this unfulfilled (in the beginning) love.
I adored Christian Bale playing Mandras 217 | the heroine's husband-to-be, he looks very very good as a Greek, his personality matched the one of the Greek patriot! A true fighter in there, or what! One of the movies I would like to buy and keep it in my collection...for ever! 218 | 219 | Label: positive 220 | 221 | 222 | 223 | **Notebook** :- `IMDb_sentiment_analysis `_ 224 | 225 | **Transform file** :- `transform_file_imdb `_ 226 | 227 | **Tasks file** :- `tasks_file_imdb `_ 228 | 229 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | 2 | ============== 3 | multi-task-NLP 4 | ============== 5 | 6 | multi_task_NLP is a utility toolkit enabling NLP developers to easily train and infer a single model for multiple tasks. 7 | We support various data formats for the majority of NLI tasks and multiple transformer-based encoders (eg. BERT, Distil-BERT, ALBERT, RoBERTa, XLNET etc.) 8 | 9 | .. image:: multi_task.png 10 | :scale: 75% 11 | :align: center 12 | 13 | What is multi_task_NLP about? 14 | ----------------------------- 15 | 16 | Any conversational AI system involves building multiple components to perform various tasks and a pipeline to stitch all components together. 17 | Given the recent effectiveness of transformer-based models in NLP, it’s very common to build a transformer-based model to solve your use case. 18 | But having multiple such models running together for a conversational AI system can lead to expensive resource consumption and increased prediction latencies, and can make the system difficult to manage. 19 | This poses a real challenge for anyone who wants to build a conversational AI system in a simple way. 20 | 21 | multi_task_NLP gives you the capability to define multiple tasks together and train a single model which simultaneously learns on all defined tasks. 22 | This means one can perform multiple tasks with latency and resource consumption equivalent to a single task. 23 | 24 | Installation 25 | ------------ 26 | 27 | To use multi-task-NLP, you can clone the repository into the desired location on your system 28 | with the following terminal command. 29 | 30 | .. code-block:: console 31 | 32 | $ cd /desired/location/ 33 | $ git clone https://github.com/hellohaptik/multi-task-NLP.git 34 | $ cd multi-task-NLP 35 | $ pip install -r requirements.txt 36 | 37 | NOTE:- The library is built and tested using ``Python 3.7.3``. It is recommended to install the requirements in a virtual environment. 38 | 39 | Quickstart Guide 40 | ---------------- 41 | A quick guide to show how a single model can be trained for multiple NLI tasks in just 3 simple steps 42 | and with **no requirement to code!!** 43 | 44 | .. toctree:: 45 | quickstart 46 | 47 | Examples Guide 48 | -------------- 49 | We provide exemplar notebooks to demonstrate some conversational AI tasks which can be performed using our library. 50 | You can follow along with the `notebooks `_ to understand and train a multi-task model for the tasks. 51 | 52 | .. toctree:: 53 | :maxdepth: 2 54 | 55 | examples 56 | 57 | Step by Step Guide 58 | ------------------ 59 | A complete guide explaining all the components of multi-task-NLP in sequential order. 60 | 61 | ..
toctree:: 62 | :maxdepth: 2 63 | 64 | task_formats 65 | data_transformations 66 | shared_encoder 67 | define_multi_task_model 68 | training 69 | infering 70 | license 71 | 72 | -------------------------------------------------------------------------------- /docs/source/infering.rst: -------------------------------------------------------------------------------- 1 | How to Infer? 2 | ============= 3 | 4 | Once you have a multi-task model trained on your tasks, we provide a convenient and easy way to use it for getting 5 | predictions on samples through the **inference pipeline**. 6 | 7 | .. autoclass:: infer_pipeline.inferPipeline 8 | :members: -------------------------------------------------------------------------------- /docs/source/license.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | Apache License 5 | Version 2.0, January 2004 6 | http://www.apache.org/licenses/ 7 | 8 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, 13 | and distribution as defined by Sections 1 through 9 of this document. 14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by 16 | the copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all 19 | other entities that control, are controlled by, or are under common 20 | control with that entity. For the purposes of this definition, 21 | "control" means (i) the power, direct or indirect, to cause the 22 | direction or management of such entity, whether by contract or 23 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 24 | outstanding shares, or (iii) beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity 27 | exercising permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation 31 | source, and configuration files. 32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but 35 | not limited to compiled object code, generated documentation, 36 | and conversions to other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or 39 | Object form, made available under the License, as indicated by a 40 | copyright notice that is included in or attached to the work 41 | (an example is provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object 44 | form, that is based on (or derived from) the Work and for which the 45 | editorial revisions, annotations, elaborations, or other modifications 46 | represent, as a whole, an original work of authorship. For the purposes 47 | of this License, Derivative Works shall not include works that remain 48 | separable from, or merely link (or bind by name) to the interfaces of, 49 | the Work and Derivative Works thereof. 50 | 51 | "Contribution" shall mean any work of authorship, including 52 | the original version of the Work and any modifications or additions 53 | to that Work or Derivative Works thereof, that is intentionally 54 | submitted to Licensor for inclusion in the Work by the copyright owner 55 | or by an individual or Legal Entity authorized to submit on behalf of 56 | the copyright owner. 
For the purposes of this definition, "submitted" 57 | means any form of electronic, verbal, or written communication sent 58 | to the Licensor or its representatives, including but not limited to 59 | communication on electronic mailing lists, source code control systems, 60 | and issue tracking systems that are managed by, or on behalf of, the 61 | Licensor for the purpose of discussing and improving the Work, but 62 | excluding communication that is conspicuously marked or otherwise 63 | designated in writing by the copyright owner as "Not a Contribution." 64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity 66 | on behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of 70 | this License, each Contributor hereby grants to You a perpetual, 71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 72 | copyright license to reproduce, prepare Derivative Works of, 73 | publicly display, publicly perform, sublicense, and distribute the 74 | Work and such Derivative Works in Source or Object form. 75 | 76 | 3. Grant of Patent License. Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | (except as stated in this section) patent license to make, have made, 80 | use, offer to sell, sell, import, and otherwise transfer the Work, 81 | where such license applies only to those patent claims licensable 82 | by such Contributor that are necessarily infringed by their 83 | Contribution(s) alone or by combination of their Contribution(s) 84 | with the Work to which such Contribution(s) was submitted. If You 85 | institute patent litigation against any entity (including a 86 | cross-claim or counterclaim in a lawsuit) alleging that the Work 87 | or a Contribution incorporated within the Work constitutes direct 88 | or contributory patent infringement, then any patent licenses 89 | granted to You under this License for that Work shall terminate 90 | as of the date such litigation is filed. 91 | 92 | 4. Redistribution. 
You may reproduce and distribute copies of the 93 | Work or Derivative Works thereof in any medium, with or without 94 | modifications, and in Source or Object form, provided that You 95 | meet the following conditions: 96 | 97 | (a) You must give any other recipients of the Work or 98 | Derivative Works a copy of this License; and 99 | 100 | (b) You must cause any modified files to carry prominent notices 101 | stating that You changed the files; and 102 | 103 | (c) You must retain, in the Source form of any Derivative Works 104 | that You distribute, all copyright, patent, trademark, and 105 | attribution notices from the Source form of the Work, 106 | excluding those notices that do not pertain to any part of 107 | the Derivative Works; and 108 | 109 | (d) If the Work includes a "NOTICE" text file as part of its 110 | distribution, then any Derivative Works that You distribute must 111 | include a readable copy of the attribution notices contained 112 | within such NOTICE file, excluding those notices that do not 113 | pertain to any part of the Derivative Works, in at least one 114 | of the following places: within a NOTICE text file distributed 115 | as part of the Derivative Works; within the Source form or 116 | documentation, if provided along with the Derivative Works; or, 117 | within a display generated by the Derivative Works, if and 118 | wherever such third-party notices normally appear. The contents 119 | of the NOTICE file are for informational purposes only and 120 | do not modify the License. You may add Your own attribution 121 | notices within Derivative Works that You distribute, alongside 122 | or as an addendum to the NOTICE text from the Work, provided 123 | that such additional attribution notices cannot be construed 124 | as modifying the License. 125 | 126 | You may add Your own copyright statement to Your modifications and 127 | may provide additional or different license terms and conditions 128 | for use, reproduction, or distribution of Your modifications, or 129 | for any such Derivative Works as a whole, provided Your use, 130 | reproduction, and distribution of the Work otherwise complies with 131 | the conditions stated in this License. 132 | 133 | 5. Submission of Contributions. Unless You explicitly state otherwise, 134 | any Contribution intentionally submitted for inclusion in the Work 135 | by You to the Licensor shall be under the terms and conditions of 136 | this License, without any additional terms or conditions. 137 | Notwithstanding the above, nothing herein shall supersede or modify 138 | the terms of any separate license agreement you may have executed 139 | with Licensor regarding such Contributions. 140 | 141 | 6. Trademarks. This License does not grant permission to use the trade 142 | names, trademarks, service marks, or product names of the Licensor, 143 | except as required for reasonable and customary use in describing the 144 | origin of the Work and reproducing the content of the NOTICE file. 145 | 146 | 7. Disclaimer of Warranty. Unless required by applicable law or 147 | agreed to in writing, Licensor provides the Work (and each 148 | Contributor provides its Contributions) on an "AS IS" BASIS, 149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 150 | implied, including, without limitation, any warranties or conditions 151 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 152 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 153 | appropriateness of using or redistributing the Work and assume any 154 | risks associated with Your exercise of permissions under this License. 155 | 156 | 8. Limitation of Liability. In no event and under no legal theory, 157 | whether in tort (including negligence), contract, or otherwise, 158 | unless required by applicable law (such as deliberate and grossly 159 | negligent acts) or agreed to in writing, shall any Contributor be 160 | liable to You for damages, including any direct, indirect, special, 161 | incidental, or consequential damages of any character arising as a 162 | result of this License or out of the use or inability to use the 163 | Work (including but not limited to damages for loss of goodwill, 164 | work stoppage, computer failure or malfunction, or any and all 165 | other commercial damages or losses), even if such Contributor 166 | has been advised of the possibility of such damages. 167 | 168 | 9. Accepting Warranty or Additional Liability. While redistributing 169 | the Work or Derivative Works thereof, You may choose to offer, 170 | and charge a fee for, acceptance of support, warranty, indemnity, 171 | or other liability obligations and/or rights consistent with this 172 | License. However, in accepting such obligations, You may act only 173 | on Your own behalf and on Your sole responsibility, not on behalf 174 | of any other Contributor, and only if You agree to indemnify, 175 | defend, and hold each Contributor harmless for any liability 176 | incurred by, or claims asserted against, such Contributor by reason 177 | of your accepting any such warranty or additional liability. 178 | 179 | END OF TERMS AND CONDITIONS 180 | 181 | APPENDIX: How to apply the Apache License to your work. 182 | 183 | To apply the Apache License to your work, attach the following 184 | boilerplate notice, with the fields enclosed by brackets "[]" 185 | replaced with your own identifying information. (Don't include 186 | the brackets!) The text should be enclosed in the appropriate 187 | comment syntax for the file format. We also recommend that a 188 | file or class name and description of purpose be included on the 189 | same "printed page" as the copyright notice for easier 190 | identification within third-party archives. 191 | 192 | Copyright [yyyy] [name of copyright owner] 193 | 194 | Licensed under the Apache License, Version 2.0 (the "License"); 195 | you may not use this file except in compliance with the License. 196 | You may obtain a copy of the License at 197 | 198 | http://www.apache.org/licenses/LICENSE-2.0 199 | 200 | Unless required by applicable law or agreed to in writing, software 201 | distributed under the License is distributed on an "AS IS" BASIS, 202 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 203 | See the License for the specific language governing permissions and 204 | limitations under the License. 
205 |
--------------------------------------------------------------------------------
/docs/source/multi_task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellohaptik/multi-task-NLP/b8ae9c051437213245b51b9b1a5bea10565c38e8/docs/source/multi_task.png
--------------------------------------------------------------------------------
/docs/source/quickstart.rst:
--------------------------------------------------------------------------------
1 | Quickstart
2 | ===========
3 | Follow these 3 simple steps to train your multi-task model!
4 |
5 | Step 1 - Define your task file
6 | ------------------------------
7 |
8 | The task file is a YAML format file where you can add all the tasks for which you want to train a multi-task model.
9 |
10 | ::
11 |
12 | TaskA:
13 | model_type: BERT
14 | config_name: bert-base-uncased
15 | dropout_prob: 0.05
16 | label_map_or_file:
17 | - label1
18 | - label2
19 | - label3
20 | metrics:
21 | - accuracy
22 | loss_type: CrossEntropyLoss
23 | task_type: SingleSenClassification
24 | file_names:
25 | - taskA_train.tsv
26 | - taskA_dev.tsv
27 | - taskA_test.tsv
28 |
29 | TaskB:
30 | model_type: BERT
31 | config_name: bert-base-uncased
32 | dropout_prob: 0.3
33 | label_map_or_file: data/taskB_train_label_map.joblib
34 | metrics:
35 | - seq_f1
36 | - seq_precision
37 | - seq_recall
38 | loss_type: NERLoss
39 | task_type: NER
40 | file_names:
41 | - taskB_train.tsv
42 | - taskB_dev.tsv
43 | - taskB_test.tsv
44 |
45 | To learn about the task file parameters needed to make your own task file, refer :ref:`here`.
46 |
47 | Step 2 - Run data preparation
48 | -----------------------------
49 |
50 | After defining the task file in :ref:`Step 1`, run the following command to prepare the data.
51 |
52 | .. code-block:: console
53 |
54 | $ python data_preparation.py \
55 | --task_file 'sample_task_file.yml' \
56 | --data_dir 'data' \
57 | --max_seq_len 50
58 |
59 | To learn about the ``data_preparation.py`` script and its arguments, refer :ref:`here`.
60 |
61 | Step 3 - Run train
62 | ------------------
63 |
64 | Finally, you can start your training using the following command.
65 |
66 | .. code-block:: console
67 |
68 | $ python train.py \
69 | --data_dir 'data/bert-base-uncased_prepared_data' \
70 | --task_file 'sample_task_file.yml' \
71 | --out_dir 'sample_out' \
72 | --epochs 5 \
73 | --train_batch_size 4 \
74 | --eval_batch_size 8 \
75 | --grad_accumulation_steps 2 \
76 | --log_per_updates 25 \
77 | --save_per_updates 1000 \
78 | --eval_while_train True \
79 | --test_while_train True \
80 | --max_seq_len 50 \
81 | --silent True
82 |
83 | To learn about the ``train.py`` script and its arguments, refer :ref:`here`.
84 |
85 |
--------------------------------------------------------------------------------
/docs/source/shared_encoder.rst:
--------------------------------------------------------------------------------
1 | Shared Encoder
2 | ==============
3 |
4 | What is a shared encoder?
5 | -------------------------
6 |
7 | The concept of this library is to provide a single model for multiple tasks.
8 | To achieve this, we place a transformer-based encoder at the centre. Data for all tasks goes through this central encoder.
9 | This encoder is called shared as it is responsible for the majority of the learning across all the tasks.
10 | Further, task specific headers are built on top of the shared encoder.
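A minimal sketch of this arrangement is shown below. It is only a simplified illustration (not the library's actual ``models/model.py``), assuming one linear classification header per task on top of a ``transformers`` encoder; the ``task_num_labels`` mapping is hypothetical.

.. code-block:: python

    import torch.nn as nn
    from transformers import AutoModel

    class SharedEncoderSketch(nn.Module):
        """One shared encoder, one lightweight header per task."""

        def __init__(self, config_name, task_num_labels):
            super().__init__()
            # shared transformer encoder, e.g. config_name = 'bert-base-uncased'
            self.encoder = AutoModel.from_pretrained(config_name)
            hidden_size = self.encoder.config.hidden_size
            # hypothetical mapping, e.g. task_num_labels = {'intent': 7, 'ner': 21}
            self.headers = nn.ModuleDict(
                {task: nn.Linear(hidden_size, n) for task, n in task_num_labels.items()}
            )

        def forward(self, task_name, input_ids, attention_mask):
            # data for every task passes through the same shared encoder ...
            hidden_states = self.encoder(input_ids, attention_mask=attention_mask)[0]
            # ... and then only through the header of the task the batch belongs to
            return self.headers[task_name](hidden_states)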
11 |
12 | Task specific headers
13 | ---------------------
14 |
15 | The encoder hidden states are consumed by task specific layers defined to output logits in the format required by the task.
16 | A forward pass for a data batch belonging to, say, taskA occurs through the shared encoder and the header for taskA.
17 | The computed loss (called the 'task loss') is back-propagated through the same path.
18 |
19 | Choice of shared encoder
20 | ------------------------
21 |
22 | We support multiple transformer-based encoder models.
23 | For ease of use, we've integrated the encoders from the `transformers `_ library.
24 | The available encoders with their config names are mentioned below.
25 |
26 | +------------------+---------------------------+---------------------------+
27 | | Model type       | Config name               | Default config            |
28 | +==================+===========================+===========================+
29 | |                  | distilbert-base-uncased   |                           |
30 | | DISTILBERT       +---------------------------+ distilbert-base-uncased   |
31 | |                  | distilbert-base-cased     |                           |
32 | +------------------+---------------------------+---------------------------+
33 | |                  | bert-base-uncased         |                           |
34 | |                  +---------------------------+                           |
35 | |                  | bert-base-cased           |                           |
36 | | BERT             +---------------------------+ bert-base-uncased         |
37 | |                  | bert-large-uncased        |                           |
38 | |                  +---------------------------+                           |
39 | |                  | bert-large-cased          |                           |
40 | +------------------+---------------------------+---------------------------+
41 | |                  | roberta-base              |                           |
42 | | ROBERTA          +---------------------------+ roberta-base              |
43 | |                  | roberta-large             |                           |
44 | +------------------+---------------------------+---------------------------+
45 | |                  | albert-base-v1            |                           |
46 | |                  +---------------------------+                           |
47 | |                  | albert-large-v1           |                           |
48 | |                  +---------------------------+                           |
49 | |                  | albert-xlarge-v1          |                           |
50 | |                  +---------------------------+                           |
51 | |                  | albert-xxlarge-v1         |                           |
52 | | ALBERT           +---------------------------+ albert-base-v1            |
53 | |                  | albert-base-v2            |                           |
54 | |                  +---------------------------+                           |
55 | |                  | albert-large-v2           |                           |
56 | |                  +---------------------------+                           |
57 | |                  | albert-xlarge-v2          |                           |
58 | |                  +---------------------------+                           |
59 | |                  | albert-xxlarge-v2         |                           |
60 | +------------------+---------------------------+---------------------------+
61 | |                  | xlnet-base-cased          |                           |
62 | | XLNET            +---------------------------+ xlnet-base-cased          |
63 | |                  | xlnet-large-cased         |                           |
64 | +------------------+---------------------------+---------------------------+
65 |
66 | Losses
67 | ------
68 | We support the following two types of loss functions.
69 |
70 | .. autoclass:: models.loss.CrossEntropyLoss
71 | :members: forward
72 |
73 | .. autoclass:: models.loss.NERLoss
74 | :members: forward
75 |
76 | Metrics
77 | -------
78 | For evaluating the performance on the dev and test sets during training, we provide the following standard metrics.
79 |
80 | .. automodule:: utils.eval_metrics
81 | :members: classification_accuracy, classification_f1_score, seqeval_f1_score,
82 | seqeval_precision, seqeval_recall, snips_f1_score, snips_precision, snips_recall, classification_recall_score
83 |
84 |
--------------------------------------------------------------------------------
/docs/source/task_formats.rst:
--------------------------------------------------------------------------------
1 | Task Formats
2 | ============
3 |
4 | - To standardize the data input files, all the tasks require ``tsv`` format files as input data files.
5 | - The tsv data files shouldn't contain any headers.
Detailed tsv formats required for specific task types are mentioned in the following subsection.
6 |
7 | Task types
8 | ----------
9 | Input data formats for different NLI tasks can vary from task to task. We support the following three task types.
10 | The majority of NLI tasks can be modeled using one of these task types.
11 |
12 | - ``SingleSenClassification``: This task type is to be used for classification of single sentences. The data files need to have the following columns separated by **"\\t"**
13 | in the order mentioned below.
14 |
15 | 1. **Unique id** :- an id to uniquely identify each row/sample.
16 | 2. **Label** :- label for the sentence. Labels can be numeric or strings. In case labels are strings, a label mapping needs to be provided.
17 | 3. **Sentence** :- The sentence which needs to be classified.
18 |
19 | - ``SentencePairClassification``: This task type is to be used for classification of sentence pairs (two sentences). The data files need to have the following columns separated by **"\\t"**
20 | in the order mentioned below.
21 |
22 | 1. **Unique id** :- an id to uniquely identify each row/sample.
23 | 2. **Label** :- label for the sentence pair. Labels can be numeric or strings. In case labels are strings, a label mapping needs to be provided.
24 | 3. **SentenceA** :- First sentence of the sentence pair.
25 | 4. **SentenceB** :- Second sentence of the sentence pair.
26 |
27 | - ``NER`` : This task type is to be used for sequence labelling tasks like Named Entity Recognition, entity mention detection, keyphrase extraction etc. The data files need to have the following columns separated by **"\\t"** in the order mentioned below.
28 |
29 | 1. **Unique id** :- an id to uniquely identify each row/sample.
30 | 2. **Label** :- List of tags for the words in the sentence.
31 | 3. **Sentence** :- List of words in the sentence.
32 |
33 |
34 |
35 | NOTE:- The tsv data files must not have header names.
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/docs/source/training.rst:
--------------------------------------------------------------------------------
1 | How to train?
2 | =============
3 |
4 | Once you have made the task file with the tasks you want to train for,
5 | the next step is to run ``data_preparation.py`` and ``train.py``.
6 |
7 | Running data preparation
8 | ------------------------
9 |
10 | - The job of this script is to convert the given tsv data files to model inputs such as **Token Ids**, **Attention Masks** and **Token Type Ids** based on the shared encoder type.
11 |
12 | - The script uses **multi-processing** which effectively reduces the data preparation time for large data files.
13 |
14 | - It stores the prepared data in json files under the directory name **prepared_data** prefixed with the shared encoder config name.
15 |
16 | The script takes the following arguments,
17 |
18 | - ``task_file`` `(required)` :- Path to the created task file for which you want to train.
19 |
20 | - ``data_dir`` `(required)` :- Path to the directory where the data files mentioned in the task file are present.
21 |
22 | - ``do_lower_case`` `(optional, default True)` :- Set this to False in case you are using a `cased` config for model type.
23 |
24 | - ``max_seq_len`` `(optional, default 128)` :- Maximum sequence length for inputs. Truncating or padding will occur accordingly.
25 |
26 | You can use the following terminal command with your own argument values to run.
27 |
28 | ..
code-block:: console
29 |
30 | $ python data_preparation.py \
31 | --task_file 'sample_task_file.yml' \
32 | --data_dir 'data' \
33 | --max_seq_len 50
34 |
35 | Running train
36 | -------------
37 |
38 | After ``data_preparation.py`` has finished running, it will store the respective prepared files
39 | under the directory name 'prepared_data' prefixed with the shared encoder config name.
40 | ``train.py`` can then be run from the terminal to start the training. The following arguments are
41 | available
42 |
43 | - ``data_dir`` `(required)` :- Path to the directory where the prepared data is stored. (e.g. bert-base-uncased_prepared_data)
44 | - ``task_file`` `(required)` :- Path to the task file for training.
45 | - ``out_dir`` `(required)` :- Path to save the multi-task model checkpoints.
46 | - ``epochs`` `(required)` :- Number of epochs to train.
47 | - ``train_batch_size`` `(optional, default 8)` :- Batch size for training.
48 | - ``eval_batch_size`` `(optional, default 32)` :- Batch size for evaluation.
49 | - ``grad_accumulation_steps`` `(optional, default 1)` :- Number of batches to accumulate before an update.
50 | - ``log_per_updates`` `(optional, default 10)` :- Number of updates after which to log the loss.
51 | - ``silent`` `(optional, default True)` :- Set to False for logs to be shown on the terminal output as well.
52 | - ``max_seq_len`` `(optional, default 128)` :- Maximum sequence length which was used during data preparation.
53 | - ``save_per_updates`` `(optional, default 0)` :- Number of update steps after which a model checkpoint is saved. The model is always saved at the end of every epoch.
54 | - ``load_saved_model`` `(optional, default None)` :- Path to a saved model in case of loading.
55 | - ``resume_train`` `(optional, default False)` :- Set to True to resume training from the saved model. Training will resume from the step at which the loaded model was saved.
56 |
57 | You can use the following terminal command with your own argument values to run.
58 |
59 | .. code-block:: console
60 |
61 | $ python train.py \
62 | --data_dir 'data/bert-base-uncased_prepared_data' \
63 | --task_file 'sample_task_file.yml' \
64 | --out_dir 'sample_out' \
65 | --epochs 5 \
66 | --train_batch_size 4 \
67 | --eval_batch_size 8 \
68 | --grad_accumulation_steps 2 \
69 | --max_seq_len 50 \
70 | --log_per_updates 25 \
71 | --save_per_updates 1000 \
72 | --eval_while_train \
73 | --test_while_train \
74 | --silent
75 |
76 | Logs and tensorboard
77 | --------------------
78 |
79 | - Logs for the training are saved in a time-stamp named directory (e.g. 05_05-17_30).
80 | - The tensorboard logs are also present in the same directory and tensorboard can be started with the following command
81 |
82 | .. code-block:: console
83 |
84 | $ tensorboard --logdir 05_05-17_30/tb_logs
85 |
86 |
87 |
--------------------------------------------------------------------------------
/examples/answerability_detection/answerability_detection_msmarco.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## EXAMPLE - 3\n",
8 | "\n",
9 | "**Tasks :- Answerability detection**\n",
10 | "\n",
11 | "**Tasks Description**\n",
12 | "\n",
13 | "``answerability`` :- This is modeled as a sentence pair classification task where the first sentence is a query and the second sentence is a context passage.
The objective of this task is to determine whether the query can be answered from the context passage or not.\n",
14 | "\n",
15 | "**Conversational Utility** :- This can be a useful component for building a question-answering / machine comprehension based system. In such cases, it becomes very important to determine whether the given query can be answered from the given context passage or not before extracting/abstracting an answer from it. Performing question-answering for a query which is not answerable from the context could lead to incorrect answer extraction.\n",
16 | "\n",
17 | "**Data** :- In this example, we are using the MSMARCO triples data which contains sentence pairs and labels.\n",
18 | "The data contains triplets where the first entry is the query, the second is a context passage from which the query can be answered (positive passage), while the third entry is a context passage from which the query cannot be answered (negative passage).\n",
19 | "\n",
20 | "Data is transformed into sentence pair classification format, with the query-positive context pair labeled as 1 (answerable) and the query-negative context pair labeled as 0 (non-answerable).\n",
21 | "\n",
22 | "The data can be downloaded using the following ``wget`` command and extracted using the ``tar`` command. The data is fairly large to download (7.4GB). "
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "!wget https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz -P msmarco_data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "!tar -xvzf msmarco_data/triples.train.small.tar.gz -C msmarco_data/"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "!rm msmarco_data/triples.train.small.tar.gz"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "# Step - 1: Transforming data\n",
57 | "\n",
58 | "The data is present in *TSV* format where each row is a triplet: the query, a positive context passage and a negative context passage.\n",
59 | "\n",
60 | "We already provide a sample transformation function ``msmarco_answerability_detection_to_tsv`` to convert this data to the required tsv format. Data is transformed into sentence pair classification format, with the query-positive context pair labeled as 1 (answerable) and the query-negative context pair labeled as 0 (non-answerable).\n",
61 | "\n",
62 | "Running data transformations will save the required train, dev and test tsv data files under the ``data`` directory in the root of the library. For more details on the data transformation process, refer to data transformations in the documentation.\n",
63 | "\n",
64 | "The transformation file should have the following details and is already created at ``transform_file_answerability.yml``.\n",
65 | "\n",
66 | "```\n",
67 | "transform1:\n",
68 | " transform_func: msmarco_answerability_detection_to_tsv\n",
69 | " transform_params:\n",
70 | " data_frac : 0.02\n",
71 | " read_file_names:\n",
72 | " - triples.train.small.tsv\n",
73 | " read_dir : msmarco_data\n",
74 | " save_dir: ../../data\n",
75 | " \n",
76 | " ```\n",
77 | " The following command can be used to run the data transformation for the task."
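,
" \n",
" *(The code cell for this step is analogous to the other example notebooks; the command below is a sketch based on that pattern, using the ``transform_file_answerability.yml`` shown above.)*\n",
" \n",
" ```\n",
" !python ../../data_transformations.py \\\n",
"     --transform_file 'transform_file_answerability.yml'\n",
" ```"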
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "# Step - 2 Data Preparation\n",
85 | "\n",
86 | "For more details on the data preparation process, refer to data preparation in the documentation.\n",
87 | "\n",
88 | "Defining the tasks file for training a single model for the answerability task. The file is already created at ``tasks_file_answerability.yml``\n",
89 | "```\n",
90 | "answerability:\n",
91 | " model_type: BERT\n",
92 | " config_name: bert-base-uncased\n",
93 | " dropout_prob: 0.2\n",
94 | " class_num: 2\n",
95 | " metrics:\n",
96 | " - classification_accuracy\n",
97 | " loss_type: CrossEntropyLoss\n",
98 | " task_type: SentencePairClassification\n",
99 | " file_names:\n",
100 | " - msmarco_answerability_train.tsv\n",
101 | " - msmarco_answerability_dev.tsv\n",
102 | " - msmarco_answerability_test.tsv\n",
103 | "```"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "!python ../../data_preparation.py \\\n",
113 | " --task_file 'tasks_file_answerability.yml' \\\n",
114 | " --data_dir '../../data' \\\n",
115 | " --max_seq_len 324"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "# Step - 3 Running train\n",
123 | "\n",
124 | "The following command will start the training for the task. The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory.\n",
125 | "\n",
126 | "For more details about the training process, refer to running training in the documentation."
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "!python ../../train.py \\\n",
136 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
137 | " --task_file 'tasks_file_answerability.yml' \\\n",
138 | " --out_dir 'msmarco_answerability_bert_base' \\\n",
139 | " --epochs 3 \\\n",
140 | " --train_batch_size 8 \\\n",
141 | " --eval_batch_size 16 \\\n",
142 | " --grad_accumulation_steps 2 \\\n",
143 | " --log_per_updates 250 \\\n",
144 | " --max_seq_len 324 \\\n",
145 | " --save_per_updates 16000 \\\n",
146 | " --eval_while_train \\\n",
147 | " --test_while_train \\\n",
148 | " --silent"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "# Step - 4 Infering\n",
156 | "\n",
157 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n",
158 | "The trained model and maximum sequence length to be used need to be specified.\n",
159 | "\n",
160 | "For more details about infering, refer to the infer pipeline in the documentation."
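,
"\n",
"A minimal usage sketch is shown below. The checkpoint path is hypothetical, and the ``modelPath``/``maxSeqLen`` argument names and the ``infer`` call are assumptions based on the inference pipeline documentation:\n",
"\n",
"```python\n",
"# hypothetical checkpoint path inside the output directory\n",
"pipe = inferPipeline(modelPath='msmarco_answerability_bert_base/multi_task_model.pt', maxSeqLen=324)\n",
"# each sample is a [query, context passage] pair for this task\n",
"samples = [['when was the eiffel tower built', 'The Eiffel Tower was constructed from 1887 to 1889.']]\n",
"pipe.infer(samples, ['answerability'])\n",
"```"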
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "import sys\n",
170 | "sys.path.insert(1, '../../')\n",
171 | "from infer_pipeline import inferPipeline"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": []
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": []
187 | }
188 | ],
189 | "metadata": {
190 | "kernelspec": {
191 | "display_name": "Python 3",
192 | "language": "python",
193 | "name": "python3"
194 | },
195 | "language_info": {
196 | "codemirror_mode": {
197 | "name": "ipython",
198 | "version": 3
199 | },
200 | "file_extension": ".py",
201 | "mimetype": "text/x-python",
202 | "name": "python",
203 | "nbconvert_exporter": "python",
204 | "pygments_lexer": "ipython3",
205 | "version": "3.7.3"
206 | }
207 | },
208 | "nbformat": 4,
209 | "nbformat_minor": 4
210 | }
211 |
--------------------------------------------------------------------------------
/examples/answerability_detection/tasks_file_answerability.yml:
--------------------------------------------------------------------------------
1 | answerability:
2 | model_type: BERT
3 | config_name: bert-base-uncased
4 | dropout_prob: 0.2
5 | class_num: 2
6 | metrics:
7 | - classification_accuracy
8 | loss_type: CrossEntropyLoss
9 | task_type: SentencePairClassification
10 | file_names:
11 | - msmarco_answerability_train.tsv
12 | - msmarco_answerability_dev.tsv
13 | - msmarco_answerability_test.tsv
--------------------------------------------------------------------------------
/examples/answerability_detection/transform_file_answerability.yml:
--------------------------------------------------------------------------------
1 | transform1:
2 | transform_func: msmarco_answerability_detection_to_tsv
3 | transform_params:
4 | data_frac : 0.02
5 | read_file_names:
6 | - triples.train.small.tsv
7 | read_dir : msmarco_data
8 | save_dir: ../../data
--------------------------------------------------------------------------------
/examples/entailment_detection/entailment_snli.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# EXAMPLE - 2\n",
8 | "\n",
9 | "**Tasks :- Entailment detection**\n",
10 | "\n",
11 | "**Tasks Description**\n",
12 | "\n",
13 | "``Entailment`` :- This is a sentence pair classification task which determines whether the second sentence in a sample can be inferred from the first.\n",
14 | "\n",
15 | "**Conversational Utility** :- In a conversational AI context, this task can be seen as determining whether the second sentence is similar to the first or not. Additionally, the probability score can also be used as a similarity score between the sentences. \n",
16 | "\n",
17 | "**Data** :- In this example, we are using the SNLI data which contains sentence pairs and labels.\n",
18 | "\n",
19 | "The data can be downloaded using the following ``wget`` command and unzipped using the ``unzip`` command."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "!unzip snli_1.0.zip"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "# Step - 1: Transforming data\n",
45 | "\n",
46 | "The data is present in *JSONL* format where each object contains a sample having the two sentences as ``sentence1`` and ``sentence2``. We consider the ``gold_label`` field as the label, which can have the values: entailment, contradiction or neutral.\n",
47 | "\n",
48 | "We already provide a sample transformation function ``snli_entailment_to_tsv`` to convert this data to the required tsv format. The contradiction and neutral labels are mapped to 0, representing the non-entailment scenario. Only the entailment label is mapped to 1.\n",
49 | "\n",
50 | "Running data transformations will save the required train, dev and test tsv data files under the ``data`` directory in the root of the library. For more details on the data transformation process, refer to data transformations in the documentation.\n",
51 | "\n",
52 | "The transformation file should have the following details and is already created at ``transform_file_snli.yml``.\n",
53 | "\n",
54 | "```\n",
55 | "transform1:\n",
56 | " transform_func: snli_entailment_to_tsv\n",
57 | " read_file_names:\n",
58 | " - snli_1.0_train.jsonl\n",
59 | " - snli_1.0_dev.jsonl\n",
60 | " - snli_1.0_test.jsonl\n",
61 | " read_dir : snli_1.0\n",
62 | " save_dir: ../../data\n",
63 | " \n",
64 | " ```\n",
65 | " The following command can be used to run the data transformation for the task."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "!python ../../data_transformations.py \\\n",
75 | " --transform_file 'transform_file_snli.yml'"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "# Step - 2 Data Preparation\n",
83 | "\n",
84 | "For more details on the data preparation process, refer to data preparation in the documentation.\n",
85 | "\n",
86 | "Defining the tasks file for training a single model for the entailment task. The file is already created at ``tasks_file_snli.yml``\n",
87 | "```\n",
88 | "entailmentsnli:\n",
89 | " model_type: BERT\n",
90 | " config_name: bert-base-uncased\n",
91 | " dropout_prob: 0.2\n",
92 | " metrics:\n",
93 | " - classification_accuracy\n",
94 | " loss_type: CrossEntropyLoss\n",
95 | " class_num: 2\n",
96 | " task_type: SentencePairClassification\n",
97 | " file_names:\n",
98 | " - entailment_snli_1.0_train.tsv\n",
99 | " - entailment_snli_1.0_dev.tsv\n",
100 | " - entailment_snli_1.0_test.tsv\n",
101 | "```"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "!python ../../data_preparation.py \\\n",
111 | " --task_file 'tasks_file_snli.yml' \\\n",
112 | " --data_dir '../../data' \\\n",
113 | " --max_seq_len 128"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "# Step - 3 Running train\n",
121 | "\n",
122 | "The following command will start the training for the task. 
The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory.\n",
123 | "\n",
124 | "For more details about the training process, refer to running training in the documentation."
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "!python ../../train.py \\\n",
134 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
135 | " --task_file 'tasks_file_snli.yml' \\\n",
136 | " --out_dir 'snli_entailment_bert_base' \\\n",
137 | " --epochs 3 \\\n",
138 | " --train_batch_size 64 \\\n",
139 | " --eval_batch_size 64 \\\n",
140 | " --grad_accumulation_steps 1 \\\n",
141 | " --log_per_updates 100 \\\n",
142 | " --max_seq_len 128 \\\n",
143 | " --eval_while_train \\\n",
144 | " --test_while_train \\\n",
145 | " --silent"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "# Step - 4 Infering\n",
153 | "\n",
154 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n",
155 | "The trained model and maximum sequence length to be used need to be specified.\n",
156 | "\n",
157 | "For more details about infering, refer to the infer pipeline in the documentation."
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "import sys\n",
167 | "sys.path.insert(1, '../../')\n",
168 | "from infer_pipeline import inferPipeline"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": []
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": []
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": []
191 | }
192 | ],
193 | "metadata": {
194 | "kernelspec": {
195 | "display_name": "Python 3",
196 | "language": "python",
197 | "name": "python3"
198 | },
199 | "language_info": {
200 | "codemirror_mode": {
201 | "name": "ipython",
202 | "version": 3
203 | },
204 | "file_extension": ".py",
205 | "mimetype": "text/x-python",
206 | "name": "python",
207 | "nbconvert_exporter": "python",
208 | "pygments_lexer": "ipython3",
209 | "version": "3.7.3"
210 | }
211 | },
212 | "nbformat": 4,
213 | "nbformat_minor": 4
214 | }
215 |
--------------------------------------------------------------------------------
/examples/entailment_detection/tasks_file_snli.yml:
--------------------------------------------------------------------------------
1 | entailmentsnli:
2 | model_type: BERT
3 | config_name: bert-base-uncased
4 | dropout_prob: 0.2
5 | metrics:
6 | - classification_accuracy
7 | loss_type: CrossEntropyLoss
8 | class_num: 2
9 | task_type: SentencePairClassification
10 | file_names:
11 | - entailment_snli_1.0_train.tsv
12 | - entailment_snli_1.0_dev.tsv
13 | - entailment_snli_1.0_test.tsv
--------------------------------------------------------------------------------
/examples/entailment_detection/transform_file_snli.yml:
--------------------------------------------------------------------------------
1 | transform1:
2 | transform_func: snli_entailment_to_tsv
3 | read_file_names:
4 | - snli_1.0_train.jsonl
5 | - snli_1.0_dev.jsonl
6 | - snli_1.0_test.jsonl
7 | read_dir : snli_1.0
8 | save_dir: ../../data
--------------------------------------------------------------------------------
/examples/intent_ner_fragment/intent_ner_fragment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# EXAMPLE - 1\n",
8 | "\n",
9 | "**Tasks :- Intent Detection, NER, Fragment Detection**\n",
10 | "\n",
11 | "**Tasks Description**\n",
12 | "\n",
13 | "``Intent Detection`` :- This is a single sentence classification task where an `intent` specifies which class the data sample belongs to. \n",
14 | "\n",
15 | "``NER`` :- This is a Named Entity Recognition/ Sequence Labelling/ Slot filling task where individual words of the sentence are tagged with the entity label they belong to. The words which don't belong to any entity label are simply labeled as \"O\". \n",
16 | "\n",
17 | "``Fragment Detection`` :- This is modeled as a single sentence classification task which detects whether a sentence is incomplete (fragment) or not (non-fragment).\n",
18 | "\n",
19 | "**Conversational Utility** :- Intent detection is one of the fundamental components of a conversational system as it gives a broad understanding of the category/domain the sentence/query belongs to.\n",
20 | "\n",
21 | "NER helps in extracting values for required entities (e.g. location, date-time) from the query.\n",
22 | "\n",
23 | "Fragment detection is a very useful piece in a conversational system as knowing if a query/sentence is incomplete can aid in discarding bad queries beforehand.\n",
24 | "\n",
25 | "\n",
26 | "**Data** :- In this example, we are using the SNIPS data for intent and entity detection. For the sake of simplicity, we provide \n",
27 | "the data in a simpler form under the ``snips_data`` directory, taken from here.\n"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "# Step - 1: Transforming data\n",
35 | "\n",
36 | "The data is present in *BIO* format where each word in a sentence is tagged with the corresponding entity. \n",
37 | "Sentences are separated by \" \" and at the end of each sentence, the intent class to which the sentence belongs is mentioned. We already provide a sample transformation function ``snips_intent_ner_to_tsv`` to convert this data to the required tsv data files.\n",
38 | "Fragment detection data is generated from the intent detection data created using the transform function\n",
39 | "``create_fragment_detection_tsv``. \n",
40 | "\n",
41 | "Running data transformations will save the required train, dev and test tsv data files under the ``data`` directory in the root of the library. 
For more details on the data transformation process, refer to data transformations in the documentation.\n",
42 | "\n",
43 | "The transformation file should have the following details and is already created at ``transform_file_snips.yml``.\n",
44 | "\n",
45 | "```\n",
46 | "transform1:\n",
47 | " transform_func: snips_intent_ner_to_tsv\n",
48 | " read_file_names:\n",
49 | " - snips_train.txt\n",
50 | " - snips_dev.txt\n",
51 | " - snips_test.txt\n",
52 | " read_dir: snips_data\n",
53 | " save_dir: ../../data\n",
54 | " \n",
55 | "transform2:\n",
56 | " transform_func: create_fragment_detection_tsv\n",
57 | " read_file_names:\n",
58 | " - intent_snips_train.tsv\n",
59 | " - intent_snips_dev.tsv\n",
60 | " - intent_snips_test.tsv\n",
61 | " read_dir: ../../data\n",
62 | " save_dir: ../../data\n",
63 | " \n",
64 | " ```\n",
65 | " The following command can be used to run the data transformation for the tasks."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "!python ../../data_transformations.py \\\n",
75 | " --transform_file 'transform_file_snips.yml'"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "# Step - 2 Data Preparation\n",
83 | "\n",
84 | "Here we are training the three tasks together for demonstration. This means we will have a single\n",
85 | "multi-task model capable of performing all the three tasks. You can also train the tasks separately \n",
86 | "by mentioning a single task in the task file.\n",
87 | "\n",
88 | "For more details on the data preparation process, refer to data preparation in the documentation.\n",
89 | "\n",
90 | "Defining the tasks file for training a single model for multiple tasks - intent detection, NER and fragment detection. The file is already created at ``tasks_file_snips.yml``\n",
91 | "\n",
92 | "```\n",
93 | "ner:\n",
94 | " model_type: BERT\n",
95 | " config_name: bert-base-uncased\n",
96 | " dropout_prob: 0.3\n",
97 | " label_map_or_file: ../../data/ner_snips_train_label_map.joblib\n",
98 | " metrics:\n",
99 | " - snips_f1_score\n",
100 | " - snips_precision\n",
101 | " - snips_recall\n",
102 | " loss_type: NERLoss\n",
103 | " task_type: NER\n",
104 | " file_names:\n",
105 | " - ner_snips_train.tsv\n",
106 | " - ner_snips_dev.tsv\n",
107 | " - ner_snips_test.tsv\n",
108 | "\n",
109 | "intent:\n",
110 | " model_type: BERT\n",
111 | " config_name: bert-base-uncased\n",
112 | " dropout_prob: 0.3\n",
113 | " label_map_or_file: ../../data/int_snips_train_label_map.joblib\n",
114 | " metrics:\n",
115 | " - classification_accuracy\n",
116 | " loss_type: CrossEntropyLoss\n",
117 | " task_type: SingleSenClassification\n",
118 | " file_names:\n",
119 | " - intent_snips_train.tsv\n",
120 | " - intent_snips_dev.tsv\n",
121 | " - intent_snips_test.tsv\n",
122 | "\n",
123 | " \n",
124 | "fragdetect:\n",
125 | " model_type: BERT\n",
126 | " config_name: bert-base-uncased\n",
127 | " dropout_prob: 0.2\n",
128 | " class_num: 2\n",
129 | " metrics:\n",
130 | " - classification_accuracy\n",
131 | " loss_type: CrossEntropyLoss\n",
132 | " task_type: SingleSenClassification\n",
133 | " file_names:\n",
134 | " - fragment_snips_train.tsv\n",
135 | " - fragment_snips_dev.tsv\n",
136 | " - fragment_snips_test.tsv\n",
137 | "```\n",
138 | "\n",
139 | "The following command can be used to run the data preparation for the tasks."
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "!python ../../data_preparation.py \\\n",
149 | " --task_file 'tasks_file_snips.yml' \\\n",
150 | " --data_dir '../../data' \\\n",
151 | " --max_seq_len 50"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "# Step - 3 Running train\n",
159 | "\n",
160 | "The following command will start the training for the tasks. The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory. For demonstration, we've put up sample logs under the ``train_logs`` directory.\n",
161 | "\n",
162 | "For more details about the training process, refer to running training in the documentation."
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "!python ../../train.py \\\n",
172 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
173 | " --task_file 'tasks_file_snips.yml' \\\n",
174 | " --out_dir 'snips_intent_ner_fragment_bert_base' \\\n",
175 | " --epochs 3 \\\n",
176 | " --train_batch_size 16 \\\n",
177 | " --eval_batch_size 32 \\\n",
178 | " --grad_accumulation_steps 2 \\\n",
179 | " --log_per_updates 50 \\\n",
180 | " --max_seq_len 50 \\\n",
181 | " --eval_while_train \\\n",
182 | " --test_while_train \\\n",
183 | " --silent "
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "# Step - 4 Infering\n",
191 | "\n",
192 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n",
193 | "The trained model and maximum sequence length to be used need to be specified.\n",
194 | "\n",
195 | "For more details about infering, refer to the infer pipeline in the documentation."
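,
"\n",
"A minimal usage sketch for the multi-task case is shown below. The checkpoint path is hypothetical, and the argument names are assumptions based on the inference pipeline documentation:\n",
"\n",
"```python\n",
"# hypothetical checkpoint path inside the output directory\n",
"pipe = inferPipeline(modelPath='snips_intent_ner_fragment_bert_base/multi_task_model.pt', maxSeqLen=50)\n",
"samples = [['book a table for two at an italian restaurant']]\n",
"# a single forward pass through the shared encoder can serve all three tasks\n",
"pipe.infer(samples, ['intent', 'ner', 'fragdetect'])\n",
"```"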
196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "import sys\n", 205 | "sys.path.insert(1, '../../')\n", 206 | "from infer_pipeline import inferPipeline" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [] 229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "Python 3", 234 | "language": "python", 235 | "name": "python3" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.7.3" 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 4 252 | } 253 | -------------------------------------------------------------------------------- /examples/intent_ner_fragment/tasks_file_snips.yml: -------------------------------------------------------------------------------- 1 | ner: 2 | model_type: BERT 3 | config_name: bert-base-uncased 4 | dropout_prob: 0.3 5 | label_map_or_file: ../../data/ner_snips_train_label_map.joblib 6 | metrics: 7 | - snips_f1_score 8 | - snips_precision 9 | - snips_recall 10 | loss_type: NERLoss 11 | task_type: NER 12 | file_names: 13 | - ner_snips_train.tsv 14 | - ner_snips_dev.tsv 15 | - ner_snips_test.tsv 16 | 17 | intent: 18 | model_type: BERT 19 | config_name: bert-base-uncased 20 | dropout_prob: 0.3 21 | label_map_or_file: ../../data/int_snips_train_label_map.joblib 22 | metrics: 23 | - classification_accuracy 24 | loss_type: CrossEntropyLoss 25 | task_type: SingleSenClassification 26 | file_names: 27 | - intent_snips_train.tsv 28 | - intent_snips_dev.tsv 29 | - intent_snips_test.tsv 30 | 31 | 32 | fragdetect: 33 | model_type: BERT 34 | config_name: bert-base-uncased 35 | dropout_prob: 0.2 36 | class_num: 2 37 | metrics: 38 | - classification_accuracy 39 | loss_type: CrossEntropyLoss 40 | task_type: SingleSenClassification 41 | file_names: 42 | - fragment_snips_train.tsv 43 | - fragment_snips_dev.tsv 44 | - fragment_snips_test.tsv -------------------------------------------------------------------------------- /examples/intent_ner_fragment/transform_file_snips.yml: -------------------------------------------------------------------------------- 1 | transform1: 2 | transform_func: snips_intent_ner_to_tsv 3 | read_file_names: 4 | - snips_train.txt 5 | - snips_dev.txt 6 | - snips_test.txt 7 | read_dir: snips_data 8 | save_dir: ../../data 9 | 10 | transform2: 11 | transform_func: create_fragment_detection_tsv 12 | read_file_names: 13 | - intent_snips_train.tsv 14 | - intent_snips_dev.tsv 15 | - intent_snips_test.tsv 16 | read_dir: ../../data 17 | save_dir: ../../data -------------------------------------------------------------------------------- /examples/ner_pos_tagging/ner_pos_tagging_conll.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EXAMPLE - 5\n", 8 | "\n", 9 | "**Tasks :- NER tagging, POS 
tagging**\n", 10 | "\n", 11 | "**Tasks Description**\n", 12 | "\n", 13 | "``NER`` :-This is a Named Entity Recognition task where individual words of the sentence are tagged with an entity label it belongs to. The words which don't belong to any entity label are simply labeled as \"O\".\n", 14 | "\n", 15 | "``POS`` :- This is a Part of Speech tagging task. A part of speech is a category of words that have similar grammatical properties. Each word of the sentence is tagged with the part of speech label it belongs to. The words which don't belong to any part of speech label are simply labeled as \"O\".\n", 16 | "\n", 17 | "**Conversational Utility** :- In conversational AI context, determining the syntactic parts of the sentence can help in extracting noun-phrases or important keyphrases from the sentence.\n", 18 | "\n", 19 | "**Data** :- In this example, we are using the coNLL 2003 data which is BIO tagged format with the POS and NER tags separated by space.\n", 20 | "\n", 21 | "The data is already present in ``coNLL_data`` directory." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Step - 1: Transforming data\n", 29 | "\n", 30 | "Raw data is in BIO tagged format with the POS and NER tags separated by space.\n", 31 | "\n", 32 | "We already provide a sample transformation function ``coNLL_ner_pos_to_tsv`` to convert this data to required tsv format. \n", 33 | "\n", 34 | "Running data transformations will save the required train, dev and test tsv data files under ``data`` directory in root of library. For more details on the data transformation process, refer to data transformations in documentation.\n", 35 | "\n", 36 | "The transformation file should have the following details which is already created ``transform_file_conll.yml``.\n", 37 | "\n", 38 | "```\n", 39 | "transform1:\n", 40 | " transform_func: coNLL_ner_pos_to_tsv\n", 41 | " read_file_names:\n", 42 | " - coNLL_train.txt\n", 43 | " - coNLL_testa.txt\n", 44 | " - coNLL_testb.txt\n", 45 | " read_dir: coNLL_data\n", 46 | " save_dir: ../../data\n", 47 | " ```\n", 48 | " Following command can be used to run the data transformation for the tasks." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "!python ../../data_transformations.py \\\n", 58 | " --transform_file 'transform_file_conll.yml'" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "# Step -2 Data Preparation\n", 66 | "\n", 67 | "For more details on the data preparation process, refer to data preparation in documentation.\n", 68 | "\n", 69 | "Defining tasks file for training single model for entailment task. 
The file is already created at ``tasks_file_conll.yml``\n",
70 | "```\n",
71 | "conllner:\n",
72 | " model_type: BERT\n",
73 | " config_name: bert-base-uncased\n",
74 | " dropout_prob: 0.2\n",
75 | " label_map_or_file: ../../data/ner_coNLL_train_label_map.joblib\n",
76 | " metrics:\n",
77 | " - seqeval_f1_score\n",
78 | " - seqeval_precision\n",
79 | " - seqeval_recall\n",
80 | " loss_type: NERLoss\n",
81 | " task_type: NER\n",
82 | " file_names:\n",
83 | " - ner_coNLL_train.tsv\n",
84 | " - ner_coNLL_testa.tsv\n",
85 | " - ner_coNLL_testb.tsv\n",
86 | "\n",
87 | "conllpos:\n",
88 | " model_type: BERT\n",
89 | " config_name: bert-base-uncased\n",
90 | " dropout_prob: 0.2\n",
91 | " label_map_or_file: ../../data/pos_coNLL_train_label_map.joblib\n",
92 | " metrics:\n",
93 | " - seqeval_f1_score\n",
94 | " - seqeval_precision\n",
95 | " - seqeval_recall\n",
96 | " loss_type: NERLoss\n",
97 | " task_type: NER\n",
98 | " file_names:\n",
99 | " - pos_coNLL_train.tsv\n",
100 | " - pos_coNLL_testa.tsv\n",
101 | " - pos_coNLL_testb.tsv\n",
102 | "```"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "!python ../../data_preparation.py \\\n",
112 | " --task_file 'tasks_file_conll.yml' \\\n",
113 | " --data_dir '../../data' \\\n",
114 | " --max_seq_len 50"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "# Step - 3 Running train"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "!python ../../train.py \\\n",
131 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
132 | " --task_file 'tasks_file_conll.yml' \\\n",
133 | " --out_dir 'conll_ner_pos_bert_base' \\\n",
134 | " --epochs 10 \\\n",
135 | " --train_batch_size 32 \\\n",
136 | " --eval_batch_size 32 \\\n",
137 | " --grad_accumulation_steps 1 \\\n",
138 | " --log_per_updates 50 \\\n",
139 | " --max_seq_len 50 \\\n",
140 | " --eval_while_train \\\n",
141 | " --test_while_train \\\n",
142 | " --silent"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "# Step - 4 Infering\n",
150 | "\n",
151 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n",
152 | "The trained model and maximum sequence length to be used need to be specified.\n",
153 | "\n",
154 | "For more details about infering, refer to the infer pipeline in the documentation."
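,
"\n",
"A minimal usage sketch (hypothetical checkpoint path; argument names assumed from the inference pipeline documentation):\n",
"\n",
"```python\n",
"pipe = inferPipeline(modelPath='conll_ner_pos_bert_base/multi_task_model.pt', maxSeqLen=50)\n",
"samples = [['John lives in Berlin']]\n",
"# one pass yields both the NER tags and the POS tags for each word\n",
"pipe.infer(samples, ['conllner', 'conllpos'])\n",
"```"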
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "import sys\n",
164 | "sys.path.insert(1, '../../')\n",
165 | "from infer_pipeline import inferPipeline"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": []
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": []
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": []
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": []
195 | }
196 | ],
197 | "metadata": {
198 | "kernelspec": {
199 | "display_name": "Python 3",
200 | "language": "python",
201 | "name": "python3"
202 | },
203 | "language_info": {
204 | "codemirror_mode": {
205 | "name": "ipython",
206 | "version": 3
207 | },
208 | "file_extension": ".py",
209 | "mimetype": "text/x-python",
210 | "name": "python",
211 | "nbconvert_exporter": "python",
212 | "pygments_lexer": "ipython3",
213 | "version": "3.7.3"
214 | }
215 | },
216 | "nbformat": 4,
217 | "nbformat_minor": 4
218 | }
219 |
--------------------------------------------------------------------------------
/examples/ner_pos_tagging/tasks_file_conll.yml:
--------------------------------------------------------------------------------
1 | conllner:
2 | model_type: BERT
3 | config_name: bert-base-uncased
4 | dropout_prob: 0.2
5 | label_map_or_file: ../../data/ner_coNLL_train_label_map.joblib
6 | metrics:
7 | - seqeval_f1_score
8 | - seqeval_precision
9 | - seqeval_recall
10 | loss_type: NERLoss
11 | task_type: NER
12 | file_names:
13 | - ner_coNLL_train.tsv
14 | - ner_coNLL_testa.tsv
15 | - ner_coNLL_testb.tsv
16 |
17 | conllpos:
18 | model_type: BERT
19 | config_name: bert-base-uncased
20 | dropout_prob: 0.2
21 | label_map_or_file: ../../data/pos_coNLL_train_label_map.joblib
22 | metrics:
23 | - seqeval_f1_score
24 | - seqeval_precision
25 | - seqeval_recall
26 | loss_type: NERLoss
27 | task_type: NER
28 | file_names:
29 | - pos_coNLL_train.tsv
30 | - pos_coNLL_testa.tsv
31 | - pos_coNLL_testb.tsv
--------------------------------------------------------------------------------
/examples/ner_pos_tagging/transform_file_conll.yml:
--------------------------------------------------------------------------------
1 | transform1:
2 | transform_func: coNLL_ner_pos_to_tsv
3 | read_file_names:
4 | - coNLL_train.txt
5 | - coNLL_testa.txt
6 | - coNLL_testb.txt
7 | read_dir: coNLL_data
8 | save_dir: ../../data
--------------------------------------------------------------------------------
/examples/query_correctness/query_correctness.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# EXAMPLE - 6\n",
8 | "\n",
9 | "**Tasks :- query correctness**\n",
10 | "\n",
11 | "**Tasks Description**\n",
12 | "\n",
13 | "``querycorrectness`` :- This is modeled as a single sentence classification task identifying whether or not a query is structurally well formed. 
Knowing whether a query is well formed can enhance query understanding.\n",
 14 | "\n",
 15 | "**Conversational Utility** :- Determining how well-structured a query is would help in enhancing query understanding and improve the reliability of tasks which depend on query structure to extract information.\n",
 16 | "\n",
 17 | "**Data** :- In this example, we are using the Query-wellformedness data where every query was annotated by five raters each with a 1/0 rating of whether or not the query is well-formed.\n",
 18 | "\n",
 19 | "The data is already present under the directory ``query_correctness_data``"
 20 | ]
 21 | },
 22 | {
 23 | "cell_type": "markdown",
 24 | "metadata": {},
 25 | "source": [
 26 | "# Step - 1: Transforming data\n",
 27 | "\n",
 28 | "```\n",
 29 | "transform1:\n",
 30 | " transform_func: query_correctness_to_tsv\n",
 31 | " read_file_names:\n",
 32 | " - train.tsv\n",
 33 | " - dev.tsv\n",
 34 | " - test.tsv\n",
 35 | "\n",
 36 | " read_dir: query_correctness_data\n",
 37 | " save_dir: ../../data\n",
 38 | "```"
 39 | ]
 40 | },
 41 | {
 42 | "cell_type": "code",
 43 | "execution_count": null,
 44 | "metadata": {},
 45 | "outputs": [],
 46 | "source": [
 47 | "!python ../../data_transformations.py \\\n",
 48 | " --transform_file 'transform_file_query_correctness.yml'"
 49 | ]
 50 | },
 51 | {
 52 | "cell_type": "markdown",
 53 | "metadata": {},
 54 | "source": [
 55 | "# Step - 2 Data Preparation\n",
 56 | "\n",
 57 | "```\n",
 58 | "querycorrectness:\n",
 59 | " model_type: BERT\n",
 60 | " config_name: bert-base-uncased\n",
 61 | " dropout_prob: 0.2\n",
 62 | " class_num : 2\n",
 63 | " metrics:\n",
 64 | " - classification_accuracy\n",
 65 | " loss_type: CrossEntropyLoss\n",
 66 | " task_type: SingleSenClassification\n",
 67 | " file_names:\n",
 68 | " - query_correctness_train.tsv\n",
 69 | " - query_correctness_dev.tsv\n",
 70 | " - query_correctness_test.tsv\n",
 71 | "```"
 72 | ]
 73 | },
 74 | {
 75 | "cell_type": "code",
 76 | "execution_count": null,
 77 | "metadata": {},
 78 | "outputs": [],
 79 | "source": [
 80 | "!python ../../data_preparation.py \\\n",
 81 | " --task_file 'tasks_file_query_correctness.yml' \\\n",
 82 | " --data_dir '../../data' \\\n",
 83 | " --max_seq_len 50"
 84 | ]
 85 | },
 86 | {
 87 | "cell_type": "markdown",
 88 | "metadata": {},
 89 | "source": [
 90 | "# Step - 3 Running train"
 91 | ]
 92 | },
 93 | {
 94 | "cell_type": "code",
 95 | "execution_count": null,
 96 | "metadata": {},
 97 | "outputs": [],
 98 | "source": [
 99 | "!python ../../train.py \\\n",
 100 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
 101 | " --task_file 'tasks_file_query_correctness.yml' \\\n",
 102 | " --out_dir 'query_correctness_bert_base' \\\n",
 103 | " --epochs 10 \\\n",
 104 | " --train_batch_size 16 \\\n",
 105 | " --eval_batch_size 32 \\\n",
 106 | " --grad_accumulation_steps 1 \\\n",
 107 | " --log_per_updates 20 \\\n",
 108 | " --max_seq_len 50 \\\n",
 109 | " --eval_while_train \\\n",
 110 | " --test_while_train \\\n",
 111 | " --silent"
 112 | ]
 113 | },
 114 | {
 115 | "cell_type": "markdown",
 116 | "metadata": {},
 117 | "source": [
 118 | "# Step - 4 Infering"
 119 | ]
 120 | },
 121 | {
 122 | "cell_type": "code",
 123 | "execution_count": null,
 124 | "metadata": {},
 125 | "outputs": [],
 126 | "source": [
 127 | "import sys\n",
 128 | "sys.path.insert(1, '../../')\n",
 129 | "from infer_pipeline import inferPipeline"
 130 | ]
 131 | },
 132 | {
 133 | "cell_type": "code",
 134 | "execution_count": null,
 135 | "metadata": {},
 136 | "outputs": [],
 137 | "source": []
 138 | }
 139 | ],
 140 | "metadata": {
 141 | "kernelspec": {
 142 | "display_name": "Python 3",
 143 | 
"language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.7.3" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 4 161 | } 162 | -------------------------------------------------------------------------------- /examples/query_correctness/tasks_file_query_correctness.yml: -------------------------------------------------------------------------------- 1 | querycorrectness: 2 | model_type: BERT 3 | config_name: bert-base-uncased 4 | dropout_prob: 0.2 5 | class_num : 2 6 | metrics: 7 | - classification_accuracy 8 | loss_type: CrossEntropyLoss 9 | task_type: SingleSenClassification 10 | file_names: 11 | - query_correctness_train.tsv 12 | - query_correctness_dev.tsv 13 | - query_correctness_test.tsv -------------------------------------------------------------------------------- /examples/query_correctness/transform_file_query_correctness.yml: -------------------------------------------------------------------------------- 1 | transform1: 2 | transform_func: query_correctness_to_tsv 3 | read_file_names: 4 | - train.tsv 5 | - dev.tsv 6 | - test.tsv 7 | 8 | read_dir: query_correctness_data 9 | save_dir: ../../data -------------------------------------------------------------------------------- /examples/query_pair_similarity/query_similarity_qqp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EXAMPLE - 7\n", 8 | "\n", 9 | "**Tasks :- Query similarity**\n", 10 | "\n", 11 | "**Tasks Description**\n", 12 | "\n", 13 | "``Query similarity`` :- This is a sentence pair classification task which determines whether the second sentence in a sample can be inferred from the first.\n", 14 | "\n", 15 | "**Conversational Utility** :- In conversational AI context, this task can be seen as determining whether the second sentence is similar to first or not. Additionally, the probability score can also be used as a similarity score between the sentences. \n", 16 | "\n", 17 | "**Data** :- In this example, we are using the SNLI data which is having sentence pairs and labels.\n", 18 | "\n", 19 | "The data can be downloaded using the following ``wget`` command and unzipped using ``unzip`` command." 
20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "!wget qim.fs.quoracdn.net/quora_duplicate_questions.tsv -P qqp_data/" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Step -1 Data Transformations\n", 36 | "\n", 37 | "Defining transform file\n", 38 | "\n", 39 | "```\n", 40 | "sample_transform:\n", 41 | " transform_func: qqp_query_similarity_to_tsv\n", 42 | " read_file_names:\n", 43 | " - quora_duplicate_questions.tsv\n", 44 | " read_dir : qqp_data\n", 45 | " save_dir: ../../data\n", 46 | "```" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "!python ../../data_transformations.py \\\n", 56 | " --transform_file 'transform_file_qqp.yml'" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "# Step -2 Data Preparation\n", 64 | "\n", 65 | "Defining task file for query similarity detection with QQP data\n", 66 | "\n", 67 | "```\n", 68 | "querysimilarity:\n", 69 | " model_type: BERT\n", 70 | " config_name: bert-base-uncased\n", 71 | " dropout_prob: 0.2\n", 72 | " metrics:\n", 73 | " - classification_accuracy\n", 74 | " loss_type: CrossEntropyLoss\n", 75 | " class_num: 2\n", 76 | " task_type: SentencePairClassification\n", 77 | " file_names:\n", 78 | " - qqp_query_similarity_train.tsv\n", 79 | " - qqp_query_similarity_dev.tsv\n", 80 | " - qqp_query_similarity_test.tsv\n", 81 | "```" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "!python ../../data_preparation.py \\\n", 91 | " --task_file 'tasks_file_qqp.yml' \\\n", 92 | " --data_dir '../../data' \\\n", 93 | " --max_seq_len 200" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "# Step - 3 Running train\n", 101 | "\n", 102 | "Following command will start the training for the tasks. The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory.\n", 103 | "\n", 104 | "For knowing more details about the train process, refer to running training in documentation." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "!python ../../train.py \\\n", 114 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n", 115 | " --task_file 'tasks_file_qqp.yml' \\\n", 116 | " --out_dir 'qqp_query_similarity_bert_base' \\\n", 117 | " --epochs 3 \\\n", 118 | " --train_batch_size 32 \\\n", 119 | " --eval_batch_size 32 \\\n", 120 | " --grad_accumulation_steps 2 \\\n", 121 | " --log_per_updates 100 \\\n", 122 | " --save_per_updates 3000 \\\n", 123 | " --limit_save 6 \\\n", 124 | " --max_seq_len 200 \\\n", 125 | " --eval_while_train \\\n", 126 | " --test_while_train \\\n", 127 | " --silent" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "# Step - 4 Infering\n", 135 | "\n", 136 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n", 137 | "The trained model and maximum sequence length to be used needs to be specified.\n", 138 | "\n", 139 | "For knowing more details about infering, refer to infer pipeline in documentation." 
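,
    "\n",
    "A minimal usage sketch is given below, assuming a checkpoint saved under ``qqp_query_similarity_bert_base``. The file name and the question pair are illustrative assumptions; use the model file saved by your training run.\n",
    "\n",
    "```\n",
    "pipe = inferPipeline(modelPath = 'qqp_query_similarity_bert_base/multi_task_model.pt', maxSeqLen = 200)\n",
    "samples = [['How do I learn python?', 'What is the best way to learn python?']]\n",
    "print(pipe.infer(samples, ['querysimilarity']))\n",
    "```"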
140 | ]
 141 | },
 142 | {
 143 | "cell_type": "code",
 144 | "execution_count": null,
 145 | "metadata": {},
 146 | "outputs": [],
 147 | "source": [
 148 | "import sys\n",
 149 | "sys.path.insert(1, '../../')\n",
 150 | "from infer_pipeline import inferPipeline"
 151 | ]
 152 | },
 153 | {
 154 | "cell_type": "code",
 155 | "execution_count": null,
 156 | "metadata": {},
 157 | "outputs": [],
 158 | "source": []
 159 | }
 160 | ],
 161 | "metadata": {
 162 | "kernelspec": {
 163 | "display_name": "Python 3",
 164 | "language": "python",
 165 | "name": "python3"
 166 | },
 167 | "language_info": {
 168 | "codemirror_mode": {
 169 | "name": "ipython",
 170 | "version": 3
 171 | },
 172 | "file_extension": ".py",
 173 | "mimetype": "text/x-python",
 174 | "name": "python",
 175 | "nbconvert_exporter": "python",
 176 | "pygments_lexer": "ipython3",
 177 | "version": "3.7.3"
 178 | }
 179 | },
 180 | "nbformat": 4,
 181 | "nbformat_minor": 4
 182 | }
 183 | 
-------------------------------------------------------------------------------- /examples/query_pair_similarity/tasks_file_qqp.yml: --------------------------------------------------------------------------------
 1 | querysimilarity:
 2 | model_type: BERT
 3 | config_name: bert-base-uncased
 4 | dropout_prob: 0.2
 5 | metrics:
 6 | - classification_accuracy
 7 | loss_type: CrossEntropyLoss
 8 | class_num: 2
 9 | task_type: SentencePairClassification
 10 | file_names:
 11 | - qqp_query_similarity_train.tsv
 12 | - qqp_query_similarity_dev.tsv
 13 | - qqp_query_similarity_test.tsv
-------------------------------------------------------------------------------- /examples/query_pair_similarity/transform_file_qqp.yml: --------------------------------------------------------------------------------
 1 | sample_transform:
 2 | transform_func: qqp_query_similarity_to_tsv
 3 | read_file_names:
 4 | - quora_duplicate_questions.tsv
 5 | read_dir : qqp_data
 6 | save_dir: ../../data
-------------------------------------------------------------------------------- /examples/query_type_detection/query_type_detection.ipynb: --------------------------------------------------------------------------------
 1 | {
 2 | "cells": [
 3 | {
 4 | "cell_type": "markdown",
 5 | "metadata": {},
 6 | "source": [
 7 | "# Example - 4\n",
 8 | "\n",
 9 | "**Tasks :- Query type detection**\n",
 10 | "\n",
 11 | "**Tasks Description**\n",
 12 | "\n",
 13 | "``querytype`` :- This is a single sentence classification task to determine what type (category) of answer is expected for the given query. The queries are divided into 5 major classes according to the answer expected for them.\n",
 14 | "\n",
 15 | "**Conversational Utility** :- While returning a response for a query, knowing what kind of answer is expected for the query can help in both curating and cross-verifying an answer according to the type.\n",
 16 | "\n",
 17 | "**Data** :- In this example, we are using the MSMARCO QnA data. Queries are divided into 5 query types - NUMERIC, LOCATION, ENTITY, DESCRIPTION, PERSON.\n",
 18 | "\n",
 19 | "The data can be downloaded using the following ``wget`` commands and decompressed using the ``gunzip`` command."
20 | ]
 21 | },
 22 | {
 23 | "cell_type": "code",
 24 | "execution_count": null,
 25 | "metadata": {},
 26 | "outputs": [],
 27 | "source": [
 28 | "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz -P msmarco_qna_data\n",
 29 | "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz -P msmarco_qna_data\n",
 30 | "!wget https://msmarco.blob.core.windows.net/msmarco/eval_v2.1_public.json.gz -P msmarco_qna_data"
 31 | ]
 32 | },
 33 | {
 34 | "cell_type": "code",
 35 | "execution_count": null,
 36 | "metadata": {},
 37 | "outputs": [],
 38 | "source": [
 39 | "!gunzip msmarco_qna_data/train_v2.1.json.gz\n",
 40 | "!gunzip msmarco_qna_data/dev_v2.1.json.gz\n",
 41 | "!gunzip msmarco_qna_data/eval_v2.1_public.json.gz"
 42 | ]
 43 | },
 44 | {
 45 | "cell_type": "markdown",
 46 | "metadata": {},
 47 | "source": [
 48 | "# Step - 1: Transforming data\n",
 49 | "\n",
 50 | "The data is present in *JSON* format containing various data fields for each sample. We only consider the ``query`` and ``query_type`` in this example. The data is fairly large, hence we set ``data_frac`` to 0.2 by default. You can change this in case you want to consider more data.\n",
 51 | "\n",
 52 | "We already provide a sample transformation function ``msmarco_query_type_to_tsv`` to convert this data to the required tsv format. \n",
 53 | "\n",
 54 | "Running data transformations will save the required train, dev and test tsv data files under the ``data`` directory in the root of the library. For more details on the data transformation process, refer to data transformations in documentation.\n",
 55 | "\n",
 56 | "The transformation file should have the following details; it is already created at ``transform_file_querytype.yml``.\n",
 57 | "\n",
 58 | "```\n",
 59 | "transform1:\n",
 60 | " transform_func: msmarco_query_type_to_tsv\n",
 61 | " transform_params:\n",
 62 | " data_frac : 0.2\n",
 63 | " read_file_names:\n",
 64 | " - train_v2.1.json\n",
 65 | " - dev_v2.1.json\n",
 66 | " - eval_v2.1_public.json\n",
 67 | "\n",
 68 | " read_dir: msmarco_qna_data\n",
 69 | " save_dir: ../../data\n",
 70 | " ```\n",
 71 | " The following command can be used to run the data transformation for the tasks."
 72 | ]
 73 | },
 74 | {
 75 | "cell_type": "code",
 76 | "execution_count": null,
 77 | "metadata": {},
 78 | "outputs": [],
 79 | "source": [
 80 | "!python ../../data_transformations.py \\\n",
 81 | " --transform_file 'transform_file_querytype.yml'"
 82 | ]
 83 | },
 84 | {
 85 | "cell_type": "markdown",
 86 | "metadata": {},
 87 | "source": [
 88 | "# Step - 2 Data Preparation\n",
 89 | "\n",
 90 | "For more details on the data preparation process, refer to data preparation in documentation.\n",
 91 | "\n",
 92 | "Defining the tasks file for training a single model for the query type detection task. 
The file is already created at ``tasks_file_querytype.yml``\n", 93 | "```\n", 94 | "querytype:\n", 95 | " model_type: BERT\n", 96 | " config_name: bert-base-uncased\n", 97 | " dropout_prob: 0.2\n", 98 | " label_map_or_file:\n", 99 | " - DESCRIPTION\n", 100 | " - ENTITY\n", 101 | " - LOCATION\n", 102 | " - NUMERIC\n", 103 | " - PERSON\n", 104 | " metrics:\n", 105 | " - classification_accuracy\n", 106 | " loss_type: CrossEntropyLoss\n", 107 | " task_type: SingleSenClassification\n", 108 | " file_names:\n", 109 | " - querytype_train_v2.1.tsv\n", 110 | " - querytype_dev_v2.1.tsv\n", 111 | " - querytype_eval_v2.1_public.tsv\n", 112 | "```" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "!python ../../data_preparation.py \\\n", 122 | " --task_file 'tasks_file_querytype.yml' \\\n", 123 | " --data_dir '../../data' \\\n", 124 | " --max_seq_len 60" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "# Step - 3 Running train\n", 132 | "\n", 133 | "Following command will start the training for the tasks. The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory.\n", 134 | "\n", 135 | "For knowing more details about the train process, refer to running training in documentation." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "!python ../../train.py \\\n", 145 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n", 146 | " --task_file 'tasks_file_querytype.yml' \\\n", 147 | " --out_dir 'msmarco_querytype_bert_base' \\\n", 148 | " --epochs 4 \\\n", 149 | " --train_batch_size 64 \\\n", 150 | " --eval_batch_size 64 \\\n", 151 | " --grad_accumulation_steps 1 \\\n", 152 | " --log_per_updates 100 \\\n", 153 | " --max_seq_len 60 \\\n", 154 | " --eval_while_train \\\n", 155 | " --test_while_train \\\n", 156 | " --silent" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "# Step - 4 Infering\n", 164 | "\n", 165 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n", 166 | "The trained model and maximum sequence length to be used needs to be specified.\n", 167 | "\n", 168 | "For knowing more details about infering, refer to infer pipeline in documentation." 
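,
    "\n",
    "A minimal usage sketch is given below, assuming a checkpoint saved under ``msmarco_querytype_bert_base``. The file name and the query are illustrative assumptions; use the model file saved by your training run.\n",
    "\n",
    "```\n",
    "pipe = inferPipeline(modelPath = 'msmarco_querytype_bert_base/multi_task_model.pt', maxSeqLen = 60)\n",
    "samples = [['where is mount everest located']]\n",
    "print(pipe.infer(samples, ['querytype']))\n",
    "```"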
169 | ]
 170 | },
 171 | {
 172 | "cell_type": "code",
 173 | "execution_count": 1,
 174 | "metadata": {},
 175 | "outputs": [
 176 | {
 177 | "name": "stderr",
 178 | "output_type": "stream",
 179 | "text": [
 180 | "Using TensorFlow backend.\n"
 181 | ]
 182 | }
 183 | ],
 184 | "source": [
 185 | "import sys\n",
 186 | "sys.path.insert(1, '../../')\n",
 187 | "from infer_pipeline import inferPipeline"
 188 | ]
 189 | },
 190 | {
 191 | "cell_type": "code",
 192 | "execution_count": null,
 193 | "metadata": {},
 194 | "outputs": [],
 195 | "source": []
 196 | },
 197 | {
 198 | "cell_type": "code",
 199 | "execution_count": null,
 200 | "metadata": {},
 201 | "outputs": [],
 202 | "source": []
 203 | },
 204 | {
 205 | "cell_type": "code",
 206 | "execution_count": null,
 207 | "metadata": {},
 208 | "outputs": [],
 209 | "source": []
 210 | }
 211 | ],
 212 | "metadata": {
 213 | "kernelspec": {
 214 | "display_name": "Python 3",
 215 | "language": "python",
 216 | "name": "python3"
 217 | },
 218 | "language_info": {
 219 | "codemirror_mode": {
 220 | "name": "ipython",
 221 | "version": 3
 222 | },
 223 | "file_extension": ".py",
 224 | "mimetype": "text/x-python",
 225 | "name": "python",
 226 | "nbconvert_exporter": "python",
 227 | "pygments_lexer": "ipython3",
 228 | "version": "3.7.3"
 229 | }
 230 | },
 231 | "nbformat": 4,
 232 | "nbformat_minor": 4
 233 | }
 234 | 
-------------------------------------------------------------------------------- /examples/query_type_detection/tasks_file_querytype.yml: --------------------------------------------------------------------------------
 1 | querytype:
 2 | model_type: BERT
 3 | config_name: bert-base-uncased
 4 | dropout_prob: 0.2
 5 | label_map_or_file:
 6 | - DESCRIPTION
 7 | - ENTITY
 8 | - LOCATION
 9 | - NUMERIC
 10 | - PERSON
 11 | metrics:
 12 | - classification_accuracy
 13 | loss_type: CrossEntropyLoss
 14 | task_type: SingleSenClassification
 15 | file_names:
 16 | - querytype_train_v2.1.tsv
 17 | - querytype_dev_v2.1.tsv
 18 | - querytype_eval_v2.1_public.tsv
-------------------------------------------------------------------------------- /examples/query_type_detection/transform_file_querytype.yml: --------------------------------------------------------------------------------
 1 | transform1:
 2 | transform_func: msmarco_query_type_to_tsv
 3 | transform_params:
 4 | data_frac : 0.2
 5 | read_file_names:
 6 | - train_v2.1.json
 7 | - dev_v2.1.json
 8 | - eval_v2.1_public.json
 9 | 
 10 | read_dir: msmarco_qna_data
 11 | save_dir: ../../data
-------------------------------------------------------------------------------- /examples/sentiment_analysis/IMDb_sentiment_analysis.ipynb: --------------------------------------------------------------------------------
 1 | {
 2 | "cells": [
 3 | {
 4 | "cell_type": "markdown",
 5 | "metadata": {},
 6 | "source": [
 7 | "# EXAMPLE - 8\n",
 8 | "\n",
 9 | "**Tasks :- Sentiment analysis**\n",
 10 | "\n",
 11 | "**Tasks Description**\n",
 12 | "\n",
 13 | "``sentiment`` :- This is modeled as a single sentence classification task to determine whether a piece of text conveys a positive or negative sentiment.\n",
 14 | "\n",
 15 | "**Conversational Utility** :- To determine whether a review is positive or negative.\n",
 16 | "\n",
 17 | "**Data** :- In this example, we are using the IMDB data which can be downloaded after accepting the terms and saved under `imdb_data` directory. 
The data has a total of 50k samples labeled as positive or negative.\n"
 18 | ]
 19 | },
 20 | {
 21 | "cell_type": "code",
 22 | "execution_count": null,
 23 | "metadata": {},
 24 | "outputs": [],
 25 | "source": [
 26 | "!unzip imdb_data/134715_320111_bundle_archive.zip -d imdb_data/"
 27 | ]
 28 | },
 29 | {
 30 | "cell_type": "code",
 31 | "execution_count": null,
 32 | "metadata": {},
 33 | "outputs": [],
 34 | "source": [
 35 | "!mv imdb_data/IMDB\\ Dataset.csv imdb_data/imdb_sentiment_data.csv"
 36 | ]
 37 | },
 38 | {
 39 | "cell_type": "markdown",
 40 | "metadata": {},
 41 | "source": [
 42 | "# Step - 1: Transforming data\n",
 43 | "The data file `imdb_sentiment_data.csv` has 50k samples with two columns - review and sentiment. Sentiment is the label which can be positive or negative.\n",
 44 | "We already provide a sample transformation function ``imdb_sentiment_data_to_tsv`` to convert this data to the required tsv format.\n",
 45 | "Running data transformations will save the required train and test tsv data files under the ``data`` directory in the root of the library. For more details on the data transformation process, refer to data transformations in documentation.\n",
 46 | "\n",
 47 | "The transformation file should have the following details; it is already created at ``transform_file_imdb.yml``.\n",
 48 | "\n",
 49 | "```\n",
 50 | "transform1:\n",
 51 | " transform_func: imdb_sentiment_data_to_tsv\n",
 52 | " read_file_names:\n",
 53 | " - imdb_sentiment_data.csv\n",
 54 | " read_dir: imdb_data\n",
 55 | " save_dir: ../../data\n",
 56 | "```"
 57 | ]
 58 | },
 59 | {
 60 | "cell_type": "code",
 61 | "execution_count": null,
 62 | "metadata": {},
 63 | "outputs": [],
 64 | "source": [
 65 | "!python ../../data_transformations.py \\\n",
 66 | " --transform_file 'transform_file_imdb.yml'"
 67 | ]
 68 | },
 69 | {
 70 | "cell_type": "markdown",
 71 | "metadata": {},
 72 | "source": [
 73 | "# Step - 2 Data Preparation\n",
 74 | "\n",
 75 | "For more details on the data preparation process, refer to data preparation in documentation.\n",
 76 | "\n",
 77 | "Defining the tasks file for training a single model for the sentiment task. The file is already created at ``tasks_file_imdb.yml``\n",
 78 | "\n",
 79 | "```\n",
 80 | "sentiment:\n",
 81 | " model_type: BERT\n",
 82 | " config_name: bert-base-uncased\n",
 83 | " dropout_prob: 0.2\n",
 84 | " label_map_or_file:\n",
 85 | " - negative\n",
 86 | " - positive\n",
 87 | " class_num: 2\n",
 88 | " metrics:\n",
 89 | " - classification_accuracy\n",
 90 | " loss_type: CrossEntropyLoss\n",
 91 | " task_type: SingleSenClassification\n",
 92 | " file_names:\n",
 93 | " - imdb_sentiment_train.tsv\n",
 94 | " - imdb_sentiment_test.tsv\n",
 95 | "```"
 96 | ]
 97 | },
 98 | {
 99 | "cell_type": "code",
 100 | "execution_count": null,
 101 | "metadata": {},
 102 | "outputs": [],
 103 | "source": [
 104 | "!python ../../data_preparation.py \\\n",
 105 | " --task_file 'tasks_file_imdb.yml' \\\n",
 106 | " --data_dir '../../data' \\\n",
 107 | " --max_seq_len 200"
 108 | ]
 109 | },
 110 | {
 111 | "cell_type": "markdown",
 112 | "metadata": {},
 113 | "source": [
 114 | "# Step - 3 Running train\n",
 115 | "\n",
 116 | "Following command will start the training for the tasks. The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory.\n",
 117 | "\n",
 118 | "For knowing more details about the train process, refer to running training in documentation."
119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "!python ../../train.py \\\n", 128 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n", 129 | " --task_file 'tasks_file_imdb.yml' \\\n", 130 | " --out_dir 'imdb_sentiment_bert_base' \\\n", 131 | " --epochs 8 \\\n", 132 | " --train_batch_size 32 \\\n", 133 | " --eval_batch_size 32 \\\n", 134 | " --max_seq_len 200 \\\n", 135 | " --grad_accumulation_steps 1 \\\n", 136 | " --log_per_updates 50 \\\n", 137 | " --eval_while_train \\\n", 138 | " --silent" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# Step - 4 Infering\n", 146 | "\n", 147 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n", 148 | "The trained model and maximum sequence length to be used needs to be specified.\n", 149 | "\n", 150 | "For knowing more details about infering, refer to infer pipeline in documentation." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "import sys\n", 160 | "sys.path.insert(1, '../../')\n", 161 | "from infer_pipeline import inferPipeline" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.7.3" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 4 193 | } 194 | -------------------------------------------------------------------------------- /examples/sentiment_analysis/tasks_file_imdb.yml: -------------------------------------------------------------------------------- 1 | sentiment: 2 | model_type: BERT 3 | config_name: bert-base-uncased 4 | dropout_prob: 0.2 5 | label_map_or_file: 6 | - negative 7 | - positive 8 | class_num: 2 9 | metrics: 10 | - classification_accuracy 11 | loss_type: CrossEntropyLoss 12 | task_type: SingleSenClassification 13 | file_names: 14 | - imdb_sentiment_train.tsv 15 | - imdb_sentiment_test.tsv -------------------------------------------------------------------------------- /examples/sentiment_analysis/transform_file_imdb.yml: -------------------------------------------------------------------------------- 1 | transform1: 2 | transform_func: imdb_sentiment_data_to_tsv 3 | read_file_names: 4 | - imdb_sentiment_data.csv 5 | read_dir: imdb_data 6 | save_dir: ../../data -------------------------------------------------------------------------------- /infer_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline for inference on batch for multi-task 3 | """ 4 | from utils.task_utils import TasksParam 5 | from utils.data_utils import TaskType, ModelType, NLP_MODELS 6 | from models.eval import evaluate 7 | from models.model import multiTaskModel 8 | from data_preparation import * 9 | from models.data_manager import allTasksDataset, Batcher, batchUtils 10 | from torch.utils.data import Dataset, DataLoader, BatchSampler 11 | import 
argparse 12 | import os 13 | import torch 14 | import logging 15 | logger = logging.getLogger("multi_task") 16 | 17 | class inferPipeline: 18 | 19 | """ 20 | For running inference on samples using a trained model for say TaskA, TaskB and TaskC, 21 | you can import this class and load the corresponding multi-task model by making an 22 | object of this class with the following arguments 23 | 24 | Args: 25 | modelPath (:obj:`str`) : Path to the trained multi-task model for required tasks. 26 | maxSeqLen (:obj:`int`, defaults to :obj:`128`) : maximum sequence length to be considered for samples. 27 | Truncating and padding will happen accordingly. 28 | 29 | Example:: 30 | 31 | >>> from infer_pipeline import inferPipeline 32 | >>> pipe = inferPipeline(modelPath = 'sample_out_dir/multi_task_model.pt', maxSeqLen = 50) 33 | 34 | """ 35 | 36 | def __init__(self, modelPath, maxSeqLen = 128): 37 | 38 | device = torch.device('cpu') 39 | if torch.cuda.is_available(): 40 | device = torch.device('cuda') 41 | 42 | self.maxSeqLen = maxSeqLen 43 | self.modelPath = modelPath 44 | assert os.path.exists(self.modelPath), "saved model not present at {}".format(self.modelPath) 45 | 46 | loadedDict = torch.load(self.modelPath, map_location=device) 47 | self.taskParams = loadedDict['task_params'] 48 | logger.info('Task Params loaded from saved model.') 49 | 50 | modelName = self.taskParams.modelType.name.lower() 51 | _, _ , tokenizerClass, defaultName = NLP_MODELS[modelName] 52 | configName = self.taskParams.modelConfig 53 | if configName is None: 54 | configName = defaultName 55 | #making tokenizer for model 56 | self.tokenizer = tokenizerClass.from_pretrained(configName) 57 | logger.info('{} model tokenizer loaded for config {}'.format(modelName, configName)) 58 | 59 | allParams = {} 60 | allParams['task_params'] = self.taskParams 61 | allParams['gpu'] = torch.cuda.is_available() 62 | # dummy values 63 | allParams['num_train_steps'] = 10 64 | allParams['warmup_steps'] = 0 65 | allParams['learning_rate'] = 2e-5 66 | allParams['epsilon'] = 1e-8 67 | 68 | #making and loading model 69 | self.model = multiTaskModel(allParams) 70 | self.model.load_multi_task_model(loadedDict) 71 | 72 | def make_feature_samples(self, dataList, taskType, taskName): 73 | allData = [] 74 | for i, sample in enumerate(dataList): 75 | if taskType == TaskType.SingleSenClassification: 76 | inputIds, typeIds, inputMask = standard_data_converter(self.maxSeqLen, self.tokenizer, sample[0]) 77 | features = { 78 | 'uid': i, 79 | 'label': 0, 80 | 'token_id': inputIds, 81 | 'type_id': typeIds, 82 | 'mask': inputMask} 83 | 84 | elif taskType == TaskType.SentencePairClassification: 85 | inputIds, typeIds, inputMask = standard_data_converter(self.maxSeqLen, self.tokenizer, sample[0], sample[1]) 86 | features = { 87 | 'uid': i, 88 | 'label': 0, 89 | 'token_id': inputIds, 90 | 'type_id': typeIds, 91 | 'mask': inputMask} 92 | 93 | elif taskType == TaskType.NER: 94 | 95 | splitSample = sample[0].split() 96 | label = ["O"]*len(splitSample) 97 | tempTokens = ['[CLS]'] 98 | tempLabels = ['[CLS]'] 99 | for word, label in zip(splitSample, label): 100 | tokens = self.tokenizer.tokenize(word) 101 | for m, token in enumerate(tokens): 102 | tempTokens.append(token) 103 | #only first piece would be marked with label 104 | if m==0: 105 | tempLabels.append(label) 106 | else: 107 | tempLabels.append('X') 108 | # adding [SEP] at end 109 | tempTokens.append('[SEP]') 110 | tempLabels.append('[SEP]') 111 | 112 | out = self.tokenizer.encode_plus(text = tempTokens, 
add_special_tokens=False, 113 | truncation_strategy ='only_first', 114 | max_length = self.maxSeqLen, pad_to_max_length=True) 115 | typeIds = None 116 | inputMask = None 117 | tokenIds = out['input_ids'] 118 | if 'token_type_ids' in out.keys(): 119 | typeIds = out['token_type_ids'] 120 | if 'attention_mask' in out.keys(): 121 | inputMask = out['attention_mask'] 122 | 123 | labelMap = self.taskParams.labelMap[taskName] 124 | tempLabelsEnc = pad_sequences([ [labelMap[l] for l in tempLabels] ], 125 | maxlen=self.maxSeqLen, value=labelMap["O"], padding="post", 126 | dtype="long", truncating="post").tolist()[0] 127 | #print(tempLabelsEnc) 128 | assert len(tempLabelsEnc) == len(tokenIds), "mismatch between processed tokens and labels" 129 | features = { 130 | 'uid': i, 131 | 'label': tempLabelsEnc, 132 | 'token_id': tokenIds, 133 | 'type_id': typeIds, 134 | 'mask': inputMask} 135 | else: 136 | raise ValueError(taskType) 137 | 138 | allData.append(features) 139 | 140 | return allData 141 | def format_ner_output(self, sample, result): 142 | assert len(sample) == len(result), "length of sample and result list not same" 143 | returnList = [] 144 | for i, (sam, res) in enumerate(zip(sample, result)): 145 | if res not in ["O", "[CLS]", "[SEP]", "X"]: 146 | curr = res.split('-')[-1] 147 | if len(returnList)>0: 148 | if curr == returnList[len(returnList)-1][0]: 149 | returnList[len(returnList)-1].append(sam) 150 | else: 151 | returnList.append([curr, sam]) 152 | else: 153 | returnList.append([curr, sam]) 154 | #print(returnList) 155 | outList = [] 156 | for finalSam in returnList: 157 | #print(finalSam) 158 | outS = ' '.join(finalSam[1:]) 159 | #print(outS) 160 | outList.append((finalSam[0], outS)) 161 | #print('{} : {}'.format(finalSam[0], outS)) 162 | 163 | return outList 164 | 165 | def format_output(self, dataList, allIds, allPreds, allScores): 166 | returnList = [] 167 | for sampleId in range(len(dataList)): 168 | resDict = {} 169 | #print("\nInput Sample : ", dataList[sampleId]) 170 | resDict['Query'] = dataList[sampleId] 171 | for i in range(len(allIds)): 172 | taskName = self.taskParams.taskIdNameMap[i] 173 | taskType = self.taskParams.taskTypeMap[taskName] 174 | if allPreds[i] == []: 175 | continue 176 | 177 | if taskType == TaskType.NER: 178 | result = allPreds[i][sampleId] 179 | inpp = dataList[sampleId][0].split() 180 | #print("{} : ".format(taskName)) 181 | result = self.format_ner_output(inpp, result) 182 | else: 183 | result = [allPreds[i][sampleId], allScores[i][sampleId]] 184 | 185 | resDict[taskName] = result 186 | #else: 187 | #print("{} : {}".format(taskName, result)) 188 | returnList.append(resDict) 189 | #print(returnList) 190 | return returnList 191 | 192 | 193 | def infer(self, dataList, taskNamesList, batchSize = 8, seed=42): 194 | 195 | """ 196 | This is the function which can be called to get the predictions for input samples 197 | for the mentioned tasks. 198 | 199 | - Samples can be packed in a ``list of lists`` manner as the function processes inputs in batch. 200 | - In case, an input sample requires sentence pair, the two sentences can be kept as elements of the list. 201 | - In case of single sentence classification or NER tasks, only the first element of a sample will be used. 202 | - For NER, the infer function automatically splits the sentence into tokens. 203 | - All the tasks mentioned in ``taskNamesList`` are performed for all the input samples. 204 | 205 | Args: 206 | 207 | dataList (:obj:`list of lists`) : A batch of input samples. For eg. 
208 | 
 209 | [
 210 | [<sample1 sentenceA>, <sample1 sentenceB>],
 211 | 
 212 | [<sample2 sentenceA>, <sample2 sentenceB>],
 213 | 
 214 | ]
 215 | 
 216 | or in case all the tasks just require single sentence inputs,
 217 | 
 218 | [
 219 | [<sample1 sentence>],
 220 | 
 221 | [<sample2 sentence>],
 222 | 
 223 | ]
 224 | 
 225 | taskNamesList (:obj:`list`) : List of tasks to be performed on dataList samples. For eg.
 226 | 
 227 | ['TaskA', 'TaskB', 'TaskC']
 228 | 
 229 | You can choose the tasks you want to infer. For eg.
 230 | 
 231 | ['TaskB']
 232 | 
 233 | batchSize (:obj:`int`, defaults to :obj:`8`) : Batch size for running inference.
 234 | 
 235 | 
 236 | Return:
 237 | 
 238 | outList (:obj:`list of objects`) :
 239 | List of dictionary objects where each object contains one corresponding input sample and its task outputs. The task outputs
 240 | can also contain the confidence scores. For eg.
 241 | 
 242 | [
 243 | {'Query' : [<input sample>],
 244 | 
 245 | 'TaskA' : <TaskA output>,
 246 | 
 247 | 'TaskB' : <TaskB output>,
 248 | 
 249 | 'TaskC' : <TaskC output>},
 250 | 
 251 | ]
 252 | 
 253 | Example::
 254 | 
 255 | >>> samples = [ ['sample_sentence_1'], ['sample_sentence_2'] ]
 256 | >>> tasks = ['TaskA', 'TaskB']
 257 | >>> pipe.infer(samples, tasks)
 258 | 
 259 | """
 260 | #print(dataList)
 261 | #print(taskNamesList)
 262 | allTasksList = []
 263 | for taskName in taskNamesList:
 264 | assert taskName in self.taskParams.taskIdNameMap.values(), "task Name not in task names for loaded model"
 265 | taskId = [taskId for taskId, tName in self.taskParams.taskIdNameMap.items() if tName==taskName][0]
 266 | taskType = self.taskParams.taskTypeMap[taskName]
 267 | 
 268 | taskData = self.make_feature_samples(dataList, taskType, taskName)
 269 | #print('task data :', taskData)
 270 | 
 271 | tasksDict = {"data_task_id" : int(taskId),
 272 | "data_" : taskData,
 273 | "data_task_type" : taskType,
 274 | "data_task_name" : taskName}
 275 | allTasksList.append(tasksDict)
 276 | 
 277 | allData = allTasksDataset(allTasksList, pipeline=True)
 278 | batchSampler = Batcher(allData, batchSize=batchSize, seed =seed,
 279 | shuffleBatch=False, shuffleTask=False)
 280 | # VERY IMPORTANT TO TURN OFF BATCH SHUFFLE IN INFERENCE.
ELSE PREDICTION SCORES
 281 | # WILL GET JUMBLED
 282 | 
 283 | batchSamplerUtils = batchUtils(isTrain = False, modelType= self.taskParams.modelType,
 284 | maxSeqLen = self.maxSeqLen)
 285 | inferDataLoader = DataLoader(allData, batch_sampler=batchSampler,
 286 | collate_fn=batchSamplerUtils.collate_fn,
 287 | pin_memory=torch.cuda.is_available())
 288 | 
 289 | with torch.no_grad():
 290 | allIds, allPreds, allScores = evaluate(allData, batchSampler, inferDataLoader, self.taskParams,
 291 | self.model, gpu=torch.cuda.is_available(), evalBatchSize=batchSize, needMetrics=False, hasTrueLabels=False,
 292 | returnPred=True)
 293 | 
 294 | finalOutList = self.format_output(dataList, allIds, allPreds, allScores)
 295 | #print(finalOutList)
 296 | return finalOutList
 297 | 
-------------------------------------------------------------------------------- /logger_.py: --------------------------------------------------------------------------------
 1 | '''
 2 | Custom log object to use across all files
 3 | '''
 4 | import logging
 5 | 
 6 | def make_logger(name, logFile, debugMode = False, silent = False):
 7 | 
 8 | # Create a custom log
 9 | log = logging.getLogger(name)
 10 | log.setLevel(logging.DEBUG)
 11 | log.propagate = False
 12 | # Create handlers
 13 | #setting level
 14 | if debugMode:
 15 | c_handler = logging.StreamHandler()
 16 | f_handler = logging.FileHandler(logFile)
 17 | c_handler.setLevel(logging.DEBUG)
 18 | f_handler.setLevel(logging.DEBUG)
 19 | elif silent:
 20 | f_handler = logging.FileHandler(logFile)
 21 | f_handler.setLevel(logging.INFO)
 22 | else:
 23 | c_handler = logging.StreamHandler()
 24 | f_handler = logging.FileHandler(logFile)
 25 | c_handler.setLevel(logging.INFO)
 26 | f_handler.setLevel(logging.INFO)
 27 | 
 28 | 
 29 | # Create formatters and add them to handlers
 30 | f_format = logging.Formatter('%(levelname)s - %(message)s')
 31 | f_handler.setFormatter(f_format)
 32 | # Add handlers to the log
 33 | log.addHandler(f_handler)
 34 | 
 35 | if not silent:
 36 | c_format = logging.Formatter('%(levelname)s - %(message)s')
 37 | c_handler.setFormatter(c_format)
 38 | log.addHandler(c_handler)
 39 | 
 40 | return log
-------------------------------------------------------------------------------- /models/data_manager.py: --------------------------------------------------------------------------------
 1 | '''
 2 | Script to manage datasets for multiple tasks
 3 | '''
 4 | from torch.utils.data import Dataset, DataLoader, BatchSampler
 5 | from utils.data_utils import TaskType, ModelType
 6 | import torch
 7 | import random
 8 | import logging
 9 | import json
 10 | logger = logging.getLogger("multi_task")
 11 | 
 12 | class allTasksDataset(Dataset):
 13 | '''
 14 | class to make a pytorch dataset of the processed data for a specific task
 15 | taskDict :- list of dictionaries. Each dictionary holds the details of a
 16 | dataset to be created for a task
 17 | [ {"data_task_id" : "", "data_path" : "", "data_task_type" : ""},
 18 | ...]
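For illustration, a single-task list could look like the following (the id, path and
name here are hypothetical, only meant to show the expected keys):
[ {"data_task_id" : 0, "data_path" : "../data/bert-base-uncased_prepared_data/conllner_ner_coNLL_train.json",
"data_task_type" : TaskType.NER, "data_task_name" : "conllner"} ]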
19 | ''' 20 | def __init__(self, taskDict, pipeline = False): 21 | self.taskDict = taskDict 22 | self.pipeline = pipeline 23 | self.allTasksData, self.taskIdTypeMap = self.make_all_datasets() 24 | 25 | def read_data(self, readPath): 26 | with open(readPath, 'r', encoding = 'utf-8') as file: 27 | logger.info('Reading data from file {}'.format(readPath)) 28 | taskData = [] 29 | for i, line in enumerate(file): 30 | #if i >=1000: 31 | #continue 32 | sample = json.loads(line) 33 | taskData.append(sample) 34 | return taskData 35 | 36 | def make_all_datasets(self): 37 | ''' 38 | For each dataset entry in the taskDict, this function makes them into corresponding dataset 39 | and returns a dictionary mapping like { : ,} 40 | ''' 41 | allTasksData = {} 42 | taskIdTypeMap = {} # mapping from task id to task type 43 | for task in self.taskDict: 44 | if self.pipeline: 45 | logger.info('Reading data for pipeline') 46 | data = task["data_"] 47 | else: 48 | data = self.read_data(task["data_path"]) 49 | allTasksData[task["data_task_id"]] = data 50 | taskIdTypeMap[task["data_task_id"]] = task["data_task_type"] 51 | logger.info('Read Data for Task Id: {} Task Name: {}. Samples {}'.format(task["data_task_id"], task["data_task_name"], len(data))) 52 | return allTasksData, taskIdTypeMap 53 | 54 | # some standard functions which need to be overridden from Dataset 55 | #class for item, len etc.. 56 | def __len__(self): 57 | return sum(len(v) for k, v in self.allTasksData.items()) 58 | 59 | # get item will be used to fetch a sample when required for the corresponding task id. 60 | def __getitem__(self, idx): 61 | taskId, sampleId = idx 62 | out = {"task": {"task_id": taskId, "task_type": self.taskIdTypeMap[taskId]}, 63 | "sample": self.allTasksData[taskId][sampleId]} 64 | return out 65 | 66 | class Batcher(BatchSampler): 67 | def __init__(self, dataObj, batchSize, shuffleTask = True, shuffleBatch = True, seed = 42): 68 | ''' 69 | dataObj :- An instance of allTasksDataset containing data for all tasks 70 | ''' 71 | self.dataObj = dataObj 72 | self.allTasksData = dataObj.allTasksData 73 | self.batchSize = batchSize 74 | # to shuffle the indices in a batch 75 | self.shuffleBatch = shuffleBatch 76 | # to shuffle the samples picked up among all the tasks 77 | self.shuffleTask = shuffleTask 78 | self.seed = seed 79 | 80 | self.allTasksDataBatchIdxs = [] 81 | self.taskIdxId = [] 82 | for taskId, data in self.allTasksData.items(): 83 | self.allTasksDataBatchIdxs.append(self.make_batches(len(data))) 84 | self.taskIdxId.append(taskId) 85 | 86 | def make_batches(self, dataSize): 87 | batchIdxs = [list(range(i, min(i+self.batchSize, dataSize))) for i in range(0, dataSize, self.batchSize)] 88 | if self.shuffleBatch: 89 | random.seed(self.seed) 90 | random.shuffle(batchIdxs) 91 | return batchIdxs 92 | 93 | def make_task_idxs(self): 94 | ''' 95 | This fn makes task indices for which a corresponding batch is created 96 | eg. [0, 0, 1, 3, 0, 2, 3, 1, 1, ..] 
if task ids are 0,1,2,3
 97 | '''
 98 | taskIdxs = []
 99 | for i in range(len(self.allTasksDataBatchIdxs)):
 100 | taskIdxs += [i]*len(self.allTasksDataBatchIdxs[i])
 101 | if self.shuffleTask:
 102 | random.seed(self.seed)
 103 | random.shuffle(taskIdxs)
 104 | return taskIdxs
 105 | 
 106 | #overriding BatchSampler functions to generate iterators for all tasks
 107 | # and iterate
 108 | def __len__(self):
 109 | return sum(len(data) for taskId, data in self.allTasksData.items())
 110 | 
 111 | def __iter__(self):
 112 | allTasksIters = [iter(item) for item in self.allTasksDataBatchIdxs]
 113 | #all_iters = [iter(item) for item in self._train_data_list]
 114 | allIdxs = self.make_task_idxs()
 115 | for taskIdx in allIdxs:
 116 | # this batch belongs to a specific task id
 117 | batchTaskId = self.taskIdxId[taskIdx]
 118 | batch = next(allTasksIters[taskIdx])
 119 | yield [(batchTaskId, sampleIdx) for sampleIdx in batch]
 120 | 
 121 | def patch_data(self, batch_info, batch_data, gpu = None):
 122 | if gpu:
 123 | for i, part in enumerate(batch_data):
 124 | if part is not None:
 125 | if isinstance(part, torch.Tensor):
 126 | batch_data[i] = part.pin_memory().cuda(non_blocking=True)
 127 | elif isinstance(part, tuple):
 128 | batch_data[i] = tuple(sub_part.pin_memory().cuda(non_blocking=True) for sub_part in part)
 129 | elif isinstance(part, list):
 130 | batch_data[i] = [sub_part.pin_memory().cuda(non_blocking=True) for sub_part in part]
 131 | else:
 132 | raise TypeError("unknown batch data type at %s: %s" % (i, part))
 133 | 
 134 | return batch_info, batch_data
 135 | 
 136 | class batchUtils:
 137 | '''
 138 | This class performs functions which help complete the batch data
 139 | when DataLoader creates batch using allTasksDataset and Batcher.
 140 | Main function would be
 141 | 1. A function to get the various components of input in batch samples and make them into
 142 | Pytorch Tensors like token_id, type_ids, masks.
 143 | 
 144 | 2. Collater function :- This function will use the above function to convert the batch into
 145 | pytorch tensor inputs. Converting all the data into pytorch tensors beforehand might not be a good
 146 | idea memory-wise, hence this custom function converts the batches into tensors on the fly
 147 | by acting as the custom collater function for the DataLoader
 148 | '''
 149 | 
 150 | def __init__(self, isTrain, modelType, maxSeqLen, dropout = 0.005):
 151 | self.isTrain = isTrain
 152 | self.modelType = modelType
 153 | self.maxSeqLen = maxSeqLen
 154 | #self.dropout = dropout
 155 | 
 156 | def check_samples_len(self, batch):
 157 | #function to check whether all samples have the mentioned maxSeqLen
 158 | for samp in batch:
 159 | assert len(samp['token_id']) == self.maxSeqLen, "token_id len doesn't match max seq len"
 160 | # for multiple encoders
 161 | if samp['type_id'] is not None:
 162 | assert len(samp['type_id']) == self.maxSeqLen, "type_id len doesn't match max seq len"
 163 | if samp['mask'] is not None:
 164 | assert len(samp['mask']) == self.maxSeqLen, "mask len doesn't match max seq len"
 165 | 
 166 | def make_batch_to_input_tensor(self, batch):
 167 | #check len in batch data
 168 | self.check_samples_len(batch)
 169 | batchSize = len(batch)
 170 | 
 171 | hasTypeIds = True
 172 | hasAttnMasks = True
 173 | if batch[0]['type_id'] is None:
 174 | hasTypeIds = False
 175 | if batch[0]['mask'] is None:
 176 | hasAttnMasks = False
 177 | 
 178 | #initializing token id, type id, attention mask tensors for this batch
 179 | tokenIdsBatchTensor = torch.LongTensor(batchSize, self.maxSeqLen).fill_(0)
 180 | typeIdsBatchTensor = torch.LongTensor(batchSize, self.maxSeqLen).fill_(0)
 181 | masksBatchTensor = torch.LongTensor(batchSize, self.maxSeqLen).fill_(0)
 182 | 
 183 | #filling in data from sample
 184 | for i, sample in enumerate(batch):
 185 | tokenIdsBatchTensor[i] = torch.LongTensor(sample['token_id'])
 186 | if hasTypeIds:
 187 | typeIdsBatchTensor[i] = torch.LongTensor(sample['type_id'])
 188 | if hasAttnMasks:
 189 | masksBatchTensor[i] = torch.LongTensor(sample['mask'])
 190 | 
 191 | # metadata will store more things like task id, task type etc.
 192 | batchMetaData = {"token_id_pos" : 0, "type_id_pos" : 1, "mask_pos" : 2}
 193 | batchData = [tokenIdsBatchTensor, None, None] #None, None in case type ids, attnMasks not required by model
 194 | if hasTypeIds:
 195 | batchData[1] = typeIdsBatchTensor
 196 | if hasAttnMasks:
 197 | batchData[2] = masksBatchTensor
 198 | return batchMetaData, batchData
 199 | 
 200 | def collate_fn(self, batch):
 201 | '''
 202 | This function will be used by DataLoader to return batches
 203 | '''
 204 | taskId = batch[0]["task"]["task_id"]
 205 | taskType = batch[0]["task"]["task_type"]
 206 | 
 207 | orgBatch = []
 208 | labels = []
 209 | for sample in batch:
 210 | assert sample["task"]["task_id"] == taskId
 211 | assert sample["task"]["task_type"] == taskType
 212 | orgBatch.append(sample["sample"])
 213 | labels.append(sample["sample"]["label"])
 214 | 
 215 | batch = orgBatch
 216 | #making tensor batch data
 217 | batchMetaData, batchData = self.make_batch_to_input_tensor(batch)
 218 | batchMetaData['task_id'] = taskId
 219 | batchMetaData['task_type'] = taskType
 220 | 
 221 | #adding label tensor when training (as they'll be used for loss calculation and update)
 222 | # and in evaluation, it won't go with batch data, rather will keep it with meta data for metrics
 223 | if self.isTrain:
 224 | 
 225 | if taskType in (TaskType.SingleSenClassification, TaskType.SentencePairClassification, TaskType.NER):
 226 | batchData.append(torch.LongTensor(labels))
 227 | 
 228 | #position for label
 229 | batchMetaData['label_pos'] = len(batchData) - 1
 230 | else:
 231 | # for test/eval labels won't be added into batch, but kept in meta data
 232 | # so metric evaluation can be done
 233 | #batchData :- [tokenIdsBatchTensor, typeIdsBatchTensor, MasksBatchTensor]
 234 | batchMetaData['label'] = labels
 235 | 
 236 | batchMetaData['uids'] = [sample['uid'] for sample in batch] # used in scoring
 237 | return batchMetaData, batchData
-------------------------------------------------------------------------------- /models/dropout.py: --------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | class DropoutWrapper(nn.Module):
 6 | """
 7 | This is a dropout wrapper which supports the fixed mask dropout
 8 | """
 9 | def __init__(self, dropout_p=0, enable_vbp=True):
 10 | super(DropoutWrapper, self).__init__()
 11 | """variational dropout means a fixed dropout mask
 12 | ref: https://discuss.pytorch.org/t/dropout-for-rnns/633/11
 13 | """
 14 | self.enable_variational_dropout = enable_vbp
 15 | self.dropout_p = dropout_p
 16 | 
 17 | def forward(self, x):
 18 | """
 19 | :param x: batch * len * input_size
 20 | """
 21 | if self.training == False or self.dropout_p == 0:
 22 | return x
 23 | 
 24 | if len(x.size()) == 3:
 25 | mask = 1.0 / (1-self.dropout_p) * torch.bernoulli((1-self.dropout_p) * (x.data.new(x.size(0), x.size(2)).zero_() + 1))
 26 | mask.requires_grad = False
 27 | return mask.unsqueeze(1).expand_as(x) * x
 28 | else:
 29 | return F.dropout(x, p=self.dropout_p, training=self.training)
-------------------------------------------------------------------------------- /models/eval.py: --------------------------------------------------------------------------------
 1 | import logging
 2 | import pandas as pd
 3 | from utils.data_utils import METRICS, TaskType
 4 | import math
 5 | import os
 6 | from tqdm import tqdm
 7 | logger = logging.getLogger("multi_task")
 8 | 
 9 | def evaluate(dataSet, batchSampler, dataLoader, taskParams,
 10 | model, gpu, evalBatchSize, needMetrics, 
hasTrueLabels, 11 | wrtDir=None, wrtPredPath = None, returnPred=False): 12 | ''' 13 | Function to make predictions on the given data. The provided data can be multiple tasks or single task 14 | It will seprate out the predictions based on task id for metrics evaluation 15 | ''' 16 | numTasks = len(taskParams.taskIdNameMap) 17 | numStep = math.ceil(len(dataLoader)/evalBatchSize) 18 | allPreds = [[] for _ in range(numTasks)] 19 | allLabels = [[] for _ in range(numTasks)] 20 | allScores = [[] for _ in range(numTasks)] 21 | allIds = [[] for _ in range(numTasks)] 22 | 23 | for batchMetaData, batchData in tqdm(dataLoader, total=numStep, desc = 'Eval'): 24 | batchMetaData, batchData = batchSampler.patch_data(batchMetaData,batchData, gpu = gpu) 25 | prediction, scores = model.predict_step(batchMetaData, batchData) 26 | 27 | logger.debug("predictions in eval: {}".format(prediction)) 28 | batchTaskId = int(batchMetaData['task_id']) 29 | 30 | orgLabels = batchMetaData['label'] 31 | allLabels[batchTaskId].extend(orgLabels) 32 | 33 | logger.debug("batch task id in eval: {}".format(batchTaskId)) 34 | allPreds[batchTaskId].extend(prediction) 35 | allScores[batchTaskId].extend(scores) 36 | allIds[batchTaskId].extend(batchMetaData['uids']) 37 | 38 | for i in range(len(allPreds)): 39 | if allPreds[i] == []: 40 | continue 41 | taskName = taskParams.taskIdNameMap[i] 42 | taskType = taskParams.taskTypeMap[taskName] 43 | labMap = taskParams.labelMap[taskName] 44 | 45 | if taskType == TaskType.NER: 46 | # NER requires label clipping. We''ve already clipped our predictions 47 | #using attn Masks, so we will clip labels to predictions len 48 | # Also we need to remove the extra tokens from predictions based on labels 49 | #print(labMap) 50 | labMapRevN = {v:k for k,v in labMap.items()} 51 | 52 | for j, (p, l) in enumerate(zip(allPreds[i], allLabels[i])): 53 | allLabels[i][j] = l[:len(p)] 54 | allPreds[i][j] = [labMapRevN[int(ele)] for ele in p] 55 | allLabels[i][j] = [labMapRevN[int(ele)] for ele in allLabels[i][j]] 56 | #allPreds[i] = [ [ labMapRev[int(p)] for p in pp ] for pp in allPreds[i] ] 57 | #allLabels[i] = [ [labMapRev[int(l)] for l in ll] for ll in allLabels[i] ] 58 | 59 | newPreds = [] 60 | newLabels = [] 61 | newScores = [] 62 | for m, samp in enumerate(allLabels[i]): 63 | Preds = [] 64 | Labels = [] 65 | Scores = [] 66 | for n, ele in enumerate(samp): 67 | #print(ele) 68 | if ele != '[CLS]' and ele != '[SEP]' and ele != 'X': 69 | #print('inside') 70 | Preds.append(allPreds[i][m][n]) 71 | Labels.append(ele) 72 | Scores.append(allScores[i][m][n]) 73 | #del allLabels[i][m][n] 74 | #del allPreds[i][m][n] 75 | newPreds.append(Preds) 76 | newLabels.append(Labels) 77 | newScores.append(Scores) 78 | 79 | allLabels[i] = newLabels 80 | allPreds[i] = newPreds 81 | allScores[i] = newScores 82 | 83 | if taskType == TaskType.SingleSenClassification and labMap is not None: 84 | 85 | labMapRevC = {v:k for k,v in labMap.items()} 86 | allPreds[i] = [labMapRevC[int(ele)] for ele in allPreds[i]] 87 | allLabels[i] = [labMapRevC[int(ele)] for ele in allLabels[i]] 88 | 89 | if needMetrics: 90 | # fetch metrics from task id 91 | for i in range(len(allPreds)): 92 | if allPreds[i] == []: 93 | continue 94 | taskName = taskParams.taskIdNameMap[i] 95 | metrics = taskParams.metricsMap[taskName] 96 | if metrics is None: 97 | logger.info("No metrics are provided in task params (file)") 98 | continue 99 | 100 | logger.info("********** {} Evaluation************\n".format(taskName)) 101 | for m in metrics: 102 | metricVal = 
METRICS[m](allLabels[i], allPreds[i]) 103 | logger.info("{} : {}".format(m, metricVal)) 104 | 105 | if wrtPredPath is not None and wrtDir is not None: 106 | for i in range(len(allPreds)): 107 | if allPreds[i] == []: 108 | continue 109 | taskName = taskParams.taskIdNameMap[i] 110 | if hasTrueLabels: 111 | df = pd.DataFrame({"uid" : allIds[i], "prediction" : allPreds[i], "label" : allLabels[i]}) 112 | else: 113 | df = pd.DataFrame({"uid" : allIds[i], "prediction" : allPreds[i]}) 114 | 115 | savePath = os.path.join(wrtDir, "{}_{}".format(taskName, wrtPredPath)) 116 | df.to_csv(savePath, sep = "\t", index = False) 117 | logger.info("Predictions File saved at {}".format(savePath)) 118 | 119 | if returnPred: 120 | return allIds, allPreds, allScores -------------------------------------------------------------------------------- /models/loss.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.loss import _Loss 2 | import torch.nn.functional as F 3 | import torch 4 | from enum import IntEnum 5 | 6 | class CrossEntropyLoss(_Loss): 7 | def __init__(self, alpha=1.0, name='Cross Entropy Loss'): 8 | super().__init__() 9 | 10 | self.alpha = alpha 11 | self.name = name 12 | self.ignore_index = -1 13 | def forward(self, inp, target, attnMasks = None): 14 | """ 15 | This is the standard cross entropy loss as defined in pytorch. 16 | This loss should be used for single sentence or sentence pair classification tasks. 17 | 18 | To use this loss for training, set ``loss_type`` : **CrossEntropyLoss** in task file 19 | """ 20 | loss = F.cross_entropy(inp, target, ignore_index=self.ignore_index) 21 | loss *= self.alpha 22 | return loss 23 | 24 | class NERLoss(_Loss): 25 | def __init__(self, alpha=1.0, name='Cross Entropy Loss'): 26 | super().__init__() 27 | 28 | self.alpha = alpha 29 | self.name = name 30 | self.ignore_index = -1 #used to return 0 loss for such values 31 | def forward(self, inp, target, attnMasks = None): 32 | 33 | """ 34 | This loss is a modified version of cross entropy loss for NER/sequence labelling tasks. 35 | This loss ignores extra ‘O’ values through attention masks. 36 | 37 | To use this loss for training, set ``loss_type`` : **NERLoss** in task file 38 | """ 39 | 40 | ''' 41 | inp shape would be (batchSize, maxSeqlen, classNum). But for loss calculation 42 | we need (batchSize, classNum). Hence we will squeeze the batchSize and maxSeqlen together. 43 | 44 | In NER, we have to ignore the loss created for the extra padding that 45 | has been done for making labels till max seq length. 
Hence we use
46 | attention masks to ignore the loss at those positions
47 | '''
48 | if attnMasks is not None:
49 | activeMask = attnMasks.view(-1) == 1
50 | nerLogits = inp.view(-1, inp.size(-1))
51 | nerLabels = torch.where(
52 | activeMask, target.view(-1), torch.tensor(self.ignore_index).type_as(target)
53 | )
54 | finalLoss = F.cross_entropy(nerLogits, nerLabels, ignore_index=self.ignore_index)
55 | 
56 | else:
57 | finalLoss = F.cross_entropy(inp.view(-1, inp.size(-1)), target.view(-1),
58 | ignore_index=self.ignore_index)
59 | 
60 | finalLoss *= self.alpha
61 | return finalLoss
62 | 
63 | class SpanLoss(_Loss):
64 | def __init__(self, alpha=1.0, name='Span Cross Entropy Loss'):
65 | super().__init__()
66 | 
67 | self.alpha = alpha
68 | self.name = name
69 | self.ignore_index = -1
70 | def forward(self, inp, target, attnMasks = None):
71 | 
72 | # assert that inp and target both contain start and end values
73 | assert len(inp) == 2, "start and end logits should be present for span loss calc"
74 | assert len(target) == 2, "start and end targets should be present for span loss calc"
75 | 
76 | startInp, endInp = inp
77 | startTarg, endTarg = target
78 | 
79 | startLoss = F.cross_entropy(startInp, startTarg, ignore_index=self.ignore_index)
80 | endLoss = F.cross_entropy(endInp, endTarg, ignore_index=self.ignore_index)
81 | 
82 | loss = 0.5 * (startLoss + endLoss) * self.alpha
83 | return loss
84 | 
85 | 
86 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | seqeval==0.0.12
2 | tqdm==4.30.0
3 | ipywidgets==7.4.2
4 | Keras==2.3.1
5 | transformers==2.8.0
6 | joblib==0.13.2
7 | torch==1.2.0
8 | tensorflow==1.15.2
9 | numpy==1.18.1
10 | sphinx_rtd_theme==0.4.3
11 | pandas==1.0.1
12 | scikit_learn==0.23.1
13 | PyYAML==5.3.1
--------------------------------------------------------------------------------
/run_inference.py:
--------------------------------------------------------------------------------
1 | """
2 | Script for making inference on a test file with a saved multi-task model.
3 | The input file has to be a tsv in the standard data format for the corresponding task.
4 | 
5 | To get predictions on a test file (say test.tsv), pass it via --pred_file_path along with the saved model path and task name.
6 | """
7 | from utils.task_utils import TasksParam
8 | from utils.data_utils import TaskType, ModelType, NLP_MODELS
9 | from models.eval import evaluate
10 | from models.model import multiTaskModel
11 | from data_preparation import *
12 | from models.data_manager import allTasksDataset, Batcher, batchUtils
13 | from torch.utils.data import Dataset, DataLoader, BatchSampler
14 | import argparse
15 | import os
16 | import torch
17 | import logging
18 | logger = logging.getLogger("multi_task")
19 | device = torch.device('cpu')
20 | if torch.cuda.is_available():
21 | device = torch.device('cuda')
22 | 
23 | def main():
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument('--pred_file_path', type=str, required=True,
26 | help="path to the tsv file on which predictions are to be made")
27 | parser.add_argument('--out_dir', type = str, required=True,
28 | help="path to save the predictions")
29 | parser.add_argument('--has_labels', type=lambda s: str(s).lower() == 'true', default=False,  # parse 'True'/'False' strings into a real bool
30 | help = "pass True if labels are present in the file, else False")
31 | parser.add_argument('--task_name', type=str, required = True,
32 | help = "task name for which prediction is required.")
33 | parser.add_argument('--saved_model_path', type=str, required = True,
34 | help = "path to 
the trained model to load") 35 | parser.add_argument('--eval_batch_size', type=int, default = 32, 36 | help = "batch size for prediction") 37 | parser.add_argument('--max_seq_len', type=int, 38 | help = "max seq len used during training of model") 39 | parser.add_argument('--seed', type=int, default = 42, 40 | help = "seed") 41 | args = parser.parse_args() 42 | 43 | allParams = vars(args) 44 | assert os.path.exists(args.saved_model_path), "saved model not present at {}".format(args.saved_model_path) 45 | assert os.path.exists(args.pred_file_path), "prediction tsv file not present at {}".format(args.pred_file_path) 46 | loadedDict = torch.load(args.saved_model_path, map_location=device) 47 | taskParamsModel = loadedDict['task_params'] 48 | logger.info('Task Params loaded from saved model.') 49 | 50 | assert args.task_name in taskParamsModel.taskIdNameMap.values(), "task Name not in task names for loaded model" 51 | 52 | taskId = [taskId for taskId, taskName in taskParamsModel.taskIdNameMap.items() if taskName==args.task_name][0] 53 | taskType = taskParamsModel.taskTypeMap[args.task_name] 54 | 55 | # preparing data from tsv file 56 | rows = load_data(args.pred_file_path, taskType, hasLabels = args.has_labels) 57 | 58 | modelName = taskParamsModel.modelType.name.lower() 59 | _, _ , tokenizerClass, defaultName = NLP_MODELS[modelName] 60 | configName = taskParamsModel.modelConfig 61 | if configName is None: 62 | configName = defaultName 63 | 64 | #making tokenizer for model 65 | tokenizer = tokenizerClass.from_pretrained(configName) 66 | logger.info('{} model tokenizer loaded for config {}'.format(modelName, configName)) 67 | 68 | dataPath = os.path.join(args.out_dir, '{}_prediction_data'.format(configName)) 69 | if not os.path.exists(dataPath): 70 | os.makedirs(dataPath) 71 | wrtFile = os.path.join(dataPath, '{}.json'.format(args.pred_file_path.split('/')[-1].split('.')[0])) 72 | print('Processing Started...') 73 | create_data_multithreaded(rows, wrtFile, tokenizer, taskParamsModel, args.task_name, 74 | args.max_seq_len, multithreaded = True) 75 | print('Data Processing done for {}. 
File saved at {}'.format(args.task_name, wrtFile)) 76 | 77 | allTaskslist = [ 78 | {"data_task_id" : int(taskId), 79 | "data_path" : wrtFile, 80 | "data_task_type" : taskType, 81 | "data_task_name" : args.task_name} 82 | ] 83 | allData = allTasksDataset(allTaskslist) 84 | batchSampler = Batcher(allData, batchSize=args.eval_batch_size, seed = args.seed) 85 | batchSamplerUtils = batchUtils(isTrain = False, modelType= taskParamsModel.modelType, 86 | maxSeqLen = args.max_seq_len) 87 | inferDataLoader = DataLoader(allData, batch_sampler=batchSampler, 88 | collate_fn=batchSamplerUtils.collate_fn, 89 | pin_memory=torch.cuda.is_available()) 90 | 91 | allParams['task_params'] = taskParamsModel 92 | allParams['gpu'] = torch.cuda.is_available() 93 | # dummy values 94 | allParams['num_train_steps'] = 10 95 | allParams['warmup_steps'] = 0 96 | allParams['learning_rate'] = 2e-5 97 | allParams['epsilon'] = 1e-8 98 | 99 | #making and loading model 100 | model = multiTaskModel(allParams) 101 | model.load_multi_task_model(loadedDict) 102 | 103 | with torch.no_grad(): 104 | wrtPredFile = 'predictions.tsv' 105 | evaluate(allData, batchSampler, inferDataLoader, taskParamsModel, 106 | model, gpu=allParams['gpu'], evalBatchSize=args.eval_batch_size, needMetrics=False, hasTrueLabels=False, 107 | wrtDir=args.out_dir, wrtPredPath=wrtPredFile) 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | from enum import IntEnum 2 | from transformers import * 3 | from models.loss import * 4 | from utils.eval_metrics import * 5 | from utils.tranform_functions import * 6 | 7 | NLP_MODELS = { 8 | "bert": (BertConfig, BertModel, BertTokenizer, 'bert-base-uncased'), 9 | "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'), 10 | "albert": (AlbertConfig, AlbertModel, AlbertTokenizer, 'albert-base-v2'), 11 | "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer, 'roberta-base'), 12 | "xlnet" : (XLNetConfig, XLNetModel, XLNetTokenizer, 'xlnet-base-cased'), 13 | "electra" : (ElectraConfig, ElectraModel, ElectraTokenizer, 'google/electra-small-generator') 14 | } 15 | LOSSES = { 16 | "crossentropyloss" : CrossEntropyLoss, 17 | "nerloss" : NERLoss 18 | } 19 | 20 | METRICS = { 21 | "classification_accuracy": classification_accuracy, 22 | "classification_f1_score": classification_f1_score, 23 | "seqeval_f1_score" : seqeval_f1_score, 24 | "seqeval_precision" : seqeval_precision, 25 | "seqeval_recall" : seqeval_recall, 26 | "snips_f1_score" : snips_f1_score, 27 | "snips_precision" : snips_precision, 28 | "snips_recall" : snips_recall, 29 | "classification_recall" : classification_recall 30 | } 31 | 32 | TRANSFORM_FUNCS = { 33 | "snips_intent_ner_to_tsv" : snips_intent_ner_to_tsv, 34 | "coNLL_ner_pos_to_tsv" : coNLL_ner_pos_to_tsv, 35 | "snli_entailment_to_tsv" : snli_entailment_to_tsv, 36 | "bio_ner_to_tsv" : bio_ner_to_tsv, 37 | "create_fragment_detection_tsv" : create_fragment_detection_tsv, 38 | "msmarco_query_type_to_tsv" : msmarco_query_type_to_tsv, 39 | "imdb_sentiment_data_to_tsv" : imdb_sentiment_data_to_tsv, 40 | "qqp_query_similarity_to_tsv" : qqp_query_similarity_to_tsv, 41 | "msmarco_answerability_detection_to_tsv" : msmarco_answerability_detection_to_tsv, 42 | "query_correctness_to_tsv" : query_correctness_to_tsv, 43 | "clinc_out_of_scope_to_tsv" : clinc_out_of_scope_to_tsv 44 | } 45 | 46 | 
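# Illustrative use of the registries above (a reader-facing sketch, not code
# that is called in this file): each NLP_MODELS entry maps a lowercase model
# name to its transformers config/model/tokenizer classes plus a default
# pretrained checkpoint name, e.g.
#
#   configClass, modelClass, tokenizerClass, defaultName = NLP_MODELS["bert"]
#   tokenizer = tokenizerClass.from_pretrained(defaultName)  # 'bert-base-uncased'
#   encoder = modelClass.from_pretrained(defaultName)
#
# LOSSES, METRICS and TRANSFORM_FUNCS are looked up the same way, keyed by the
# lowercase names given under ``loss_type``, ``metrics`` and ``transform_func``
# in the task and transform YAML files.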
class ModelType(IntEnum):
47 | BERT = 1
48 | DISTILBERT = 2
49 | ALBERT = 3
50 | ROBERTA = 4
51 | XLNET = 5
52 | ELECTRA = 6
53 | 
54 | class TaskType(IntEnum):
55 | SingleSenClassification = 1
56 | SentencePairClassification = 2
57 | NER = 3
58 | 
59 | class LossType(IntEnum):
60 | CrossEntropyLoss = 0
61 | NERLoss = 1
62 | 
63 | 
--------------------------------------------------------------------------------
/utils/eval_metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | File for creating metric functions
3 | """
4 | from sklearn.metrics import accuracy_score, f1_score
5 | from sklearn.metrics import recall_score as class_recall_score
6 | from seqeval.metrics import f1_score as seq_f1
7 | from seqeval.metrics import precision_score, recall_score
8 | 
9 | def classification_accuracy(yTrue, yPred):
10 | """
11 | Accuracy score for classification tasks, computed from the labels provided in the file and the predictions from the multi-task model.
12 | It takes a batch of predictions and labels.
13 | 
14 | To use this metric, add **classification_accuracy** into the list of ``metrics`` in the task file.
15 | 
16 | Args:
17 | yTrue (:obj:`list`) : [0, 1, 2, 3]
18 | yPred (:obj:`list`) : [0, 2, 1, 3]
19 | 
20 | """
21 | return accuracy_score(yTrue, yPred)*100
22 | 
23 | def classification_f1_score(yTrue, yPred):
24 | """
25 | Standard f1 score from sklearn for classification tasks.
26 | It takes a batch of predictions and labels.
27 | 
28 | To use this metric, add **classification_f1_score** into the list of ``metrics`` in the task file.
29 | 
30 | Args:
31 | yTrue (:obj:`list`) : [0, 1, 2, 3]
32 | yPred (:obj:`list`) : [0, 2, 1, 3]
33 | 
34 | """
35 | return f1_score(yTrue, yPred, average='micro')
36 | 
37 | def classification_recall(yTrue, yPred):
38 | """
39 | Standard recall score from sklearn for classification tasks.
40 | It takes a batch of predictions and labels.
41 | 
42 | To use this metric, add **classification_recall** into the list of ``metrics`` in the task file.
43 | 
44 | Args:
45 | yTrue (:obj:`list`) : [0, 1, 2, 3]
46 | yPred (:obj:`list`) : [0, 2, 1, 3]
47 | 
48 | """
49 | return class_recall_score(yTrue, yPred, average='micro')
50 | 
51 | def seqeval_f1_score(yTrue, yPred):
52 | """
53 | f1 score for NER/sequence labelling tasks taken from the `seqeval <https://github.com/chakki-works/seqeval>`_ library.
54 | 
55 | To use this metric, add **seqeval_f1_score** into the list of ``metrics`` in the task file.
56 | 
57 | Args:
58 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
59 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
60 | """
61 | return seq_f1(yTrue, yPred)
62 | 
63 | def seqeval_precision(yTrue, yPred):
64 | """
65 | Precision score for NER/sequence labelling tasks taken from the `seqeval <https://github.com/chakki-works/seqeval>`_ library.
66 | 
67 | To use this metric, add **seqeval_precision** into the list of ``metrics`` in the task file.
68 | 
69 | Args:
70 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
71 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
72 | """
73 | return precision_score(yTrue, yPred)
74 | 
75 | def seqeval_recall(yTrue, yPred):
76 | 
77 | """
78 | Recall score for NER/sequence labelling tasks taken from the `seqeval <https://github.com/chakki-works/seqeval>`_ library.
79 | 
80 | To use this metric, add **seqeval_recall** into the list of ``metrics`` in the task file. 
81 | 
82 | Args:
83 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
84 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
85 | """
86 | return recall_score(yTrue, yPred)
87 | 
88 | 
89 | # computeF1Score below is modified from conlleval.pl
90 | def __startOfChunk(prevTag, tag, prevTagType, tagType, chunkStart = False):
91 | if prevTag == 'B' and tag == 'B':
92 | chunkStart = True
93 | if prevTag == 'I' and tag == 'B':
94 | chunkStart = True
95 | if prevTag == 'O' and tag == 'B':
96 | chunkStart = True
97 | if prevTag == 'O' and tag == 'I':
98 | chunkStart = True
99 | 
100 | if prevTag == 'E' and tag == 'E':
101 | chunkStart = True
102 | if prevTag == 'E' and tag == 'I':
103 | chunkStart = True
104 | if prevTag == 'O' and tag == 'E':
105 | chunkStart = True
106 | 
107 | 
108 | 
109 | if tag != 'O' and tag != '.' and prevTagType != tagType:
110 | chunkStart = True
111 | return chunkStart
112 | 
113 | def __endOfChunk(prevTag, tag, prevTagType, tagType, chunkEnd = False):
114 | if prevTag == 'B' and tag == 'B':
115 | chunkEnd = True
116 | if prevTag == 'B' and tag == 'O':
117 | chunkEnd = True
118 | if prevTag == 'I' and tag == 'B':
119 | chunkEnd = True
120 | if prevTag == 'I' and tag == 'O':
121 | chunkEnd = True
122 | 
123 | if prevTag == 'E' and tag == 'E':
124 | chunkEnd = True
125 | if prevTag == 'E' and tag == 'I':
126 | chunkEnd = True
127 | if prevTag == 'E' and tag == 'O':
128 | chunkEnd = True
129 | 
130 | 
131 | 
132 | if prevTag != 'O' and prevTag != '.' and prevTagType != tagType:
133 | chunkEnd = True
134 | return chunkEnd
135 | 
136 | def __splitTagType(tag):
137 | s = tag.split('-')
138 | if len(s) > 2 or len(s) == 0:
139 | raise ValueError('wrong tag format: 
it must be like O, B-xxx or I-xxx')
140 | if len(s) == 1:
141 | tag = s[0]
142 | tagType = ""
143 | else:
144 | tag = s[0]
145 | tagType = s[1]
146 | return tag, tagType
147 | 
148 | def computeF1Score(correct_slots, pred_slots):
149 | 
150 | correctChunk = {}
151 | correctChunkCnt = 0
152 | foundCorrect = {}
153 | foundCorrectCnt = 0
154 | foundPred = {}
155 | foundPredCnt = 0
156 | correctTags = 0
157 | tokenCount = 0
158 | for correct_slot, pred_slot in zip(correct_slots, pred_slots):
159 | inCorrect = False
160 | lastCorrectTag = 'O'
161 | lastCorrectType = ''
162 | lastPredTag = 'O'
163 | lastPredType = ''
164 | for c, p in zip(correct_slot, pred_slot):
165 | correctTag, correctType = __splitTagType(c)
166 | predTag, predType = __splitTagType(p)
167 | 
168 | if inCorrect:
169 | if __endOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType) and \
170 | __endOfChunk(lastPredTag, predTag, lastPredType, predType) and \
171 | (lastCorrectType == lastPredType):
172 | inCorrect = False
173 | correctChunkCnt += 1
174 | if lastCorrectType in correctChunk:
175 | correctChunk[lastCorrectType] += 1
176 | else:
177 | correctChunk[lastCorrectType] = 1
178 | elif __endOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType) != \
179 | __endOfChunk(lastPredTag, predTag, lastPredType, predType) or \
180 | (correctType != predType):
181 | inCorrect = False
182 | 
183 | if __startOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType) and \
184 | __startOfChunk(lastPredTag, predTag, lastPredType, predType) and \
185 | (correctType == predType):
186 | inCorrect = True
187 | 
188 | if __startOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType):
189 | foundCorrectCnt += 1
190 | if correctType in foundCorrect:
191 | foundCorrect[correctType] += 1
192 | else:
193 | foundCorrect[correctType] = 1
194 | 
195 | if __startOfChunk(lastPredTag, predTag, lastPredType, predType):
196 | foundPredCnt += 1
197 | if predType in foundPred:
198 | foundPred[predType] += 1
199 | else:
200 | foundPred[predType] = 1
201 | 
202 | if correctTag == predTag and correctType == predType:
203 | correctTags += 1
204 | 
205 | tokenCount += 1
206 | 
207 | lastCorrectTag = correctTag
208 | lastCorrectType = correctType
209 | lastPredTag = predTag
210 | lastPredType = predType
211 | 
212 | if inCorrect:
213 | correctChunkCnt += 1
214 | if lastCorrectType in correctChunk:
215 | correctChunk[lastCorrectType] += 1
216 | else:
217 | correctChunk[lastCorrectType] = 1
218 | 
219 | if foundPredCnt > 0:
220 | precision = 100*correctChunkCnt/foundPredCnt
221 | else:
222 | precision = 0
223 | 
224 | if foundCorrectCnt > 0:
225 | recall = 100*correctChunkCnt/foundCorrectCnt
226 | else:
227 | recall = 0
228 | 
229 | if (precision+recall) > 0:
230 | f1 = (2*precision*recall)/(precision+recall)
231 | else:
232 | f1 = 0
233 | 
234 | return f1, precision, recall
235 | 
236 | def snips_f1_score(yTrue, yPred):
237 | 
238 | """
239 | f1 score for the SNIPS NER/slot filling task, taken from the `MiuLab SlotGated-SLU <https://github.com/MiuLab/SlotGated-SLU>`_ repository.
240 | 
241 | To use this metric, add **snips_f1_score** into the list of ``metrics`` in the task file. 
242 | 
243 | Args:
244 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
245 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
246 | 
247 | """
248 | 
249 | snipsF1, _, _ = computeF1Score(yTrue, yPred)
250 | return snipsF1
251 | 
252 | def snips_precision(yTrue, yPred):
253 | """
254 | Precision score for the SNIPS NER/slot filling task, taken from the `MiuLab SlotGated-SLU <https://github.com/MiuLab/SlotGated-SLU>`_ repository.
255 | 
256 | To use this metric, add **snips_precision** into the list of ``metrics`` in the task file.
257 | 
258 | Args:
259 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
260 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
261 | 
262 | """
263 | 
264 | _, snipsPrecision, _ = computeF1Score(yTrue, yPred)
265 | return snipsPrecision
266 | 
267 | def snips_recall(yTrue, yPred):
268 | 
269 | """
270 | Recall score for the SNIPS NER/slot filling task, taken from the `MiuLab SlotGated-SLU <https://github.com/MiuLab/SlotGated-SLU>`_ repository.
271 | 
272 | To use this metric, add **snips_recall** into the list of ``metrics`` in the task file.
273 | 
274 | Args:
275 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
276 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
277 | 
278 | """
279 | _, _, snipsRecall = computeF1Score(yTrue, yPred)
280 | return snipsRecall
281 | 
282 | 
--------------------------------------------------------------------------------
/utils/task_utils.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import os
3 | import joblib
4 | from collections import OrderedDict
5 | from utils.data_utils import TaskType, ModelType, LossType, METRICS
6 | 
7 | class TasksParam:
8 | '''
9 | This class keeps the details mentioned in the tasks yml file as attributes.
10 | '''
11 | def __init__(self, taskFilePath):
12 | # dictionary holding all the tasks details with
13 | # task name as key. 
14 | # The idea of storing and retrieving task information in a yaml file, and processing it
15 | # with dictionary maps and IntEnum classes, is inspired by Microsoft's mt-dnn
16 | 
17 | self.taskDetails = yaml.safe_load(open(taskFilePath))
18 | self.modelType = self.validity_checks()
19 | 
20 | classNumMap = {}
21 | taskTypeMap = {}
22 | taskNameIdMap = {}
23 | taskIdNameMap = OrderedDict()
24 | metricsMap = {}
25 | dropoutProbMap = {}
26 | lossMap = {}
27 | labelMap = {}
28 | lossWeightMap = {}
29 | fileNamesMap = {}
30 | 
31 | for i, (taskName, taskVals) in enumerate(self.taskDetails.items()):
32 | taskNameIdMap[taskName] = i
33 | taskIdNameMap[i] = taskName
34 | taskTypeMap[taskName] = TaskType[taskVals["task_type"]]
35 | fileNamesMap[taskName] = list(taskVals["file_names"])
36 | 
37 | modelConfig = None
38 | dropoutProbMap[taskName] = 0.05
39 | lossMap[taskName] = None
40 | lossWeightMap[taskName] = float(1.0)
41 | labelMap[taskName] = None
42 | metricsMap[taskName] = None
43 | 
44 | if "class_num" in taskVals:
45 | classNumMap[taskName] = taskVals["class_num"]
46 | 
47 | if "config_name" in taskVals:
48 | modelConfig = taskVals["config_name"]
49 | 
50 | if "dropout_prob" in taskVals:
51 | dropoutProbMap[taskName] = taskVals["dropout_prob"]
52 | 
53 | if "metrics" in taskVals:
54 | metricsMap[taskName] = [m.lower() for m in taskVals["metrics"]]
55 | 
56 | # loss map
57 | if "loss_type" in taskVals:
58 | lossMap[taskName] = LossType[taskVals["loss_type"]]
59 | 
60 | if "label_map_or_file" in taskVals:
61 | '''
62 | Label Map is the list of label names (or tag names in NER) which are
63 | present in the data. We convert it into a dict mapping each label to an index.
64 | This dict is used to create the label-to-index map, so maintaining order is
65 | important. It is required in case of NER. For classification tasks, a label map
66 | is not needed when the labels in the data are already numeric, and required otherwise.
67 | 
68 | DO NOT ADD EXTRA SPECIAL TOKENS LIKE '[CLS]', '[SEP]', 'X' TO THE LABEL MAP OR COUNT THEM IN CLASS NUMBER
69 | 
70 | It can also take the label map joblib file generated by the data transformations
71 | '''
72 | if isinstance(taskVals["label_map_or_file"], list):
73 | labelMap[taskName] = {lab:i for i, lab in enumerate(taskVals["label_map_or_file"])}
74 | 
75 | elif isinstance(taskVals["label_map_or_file"], str):
76 | labelMap[taskName] = joblib.load(taskVals["label_map_or_file"])
77 | 
78 | else:
79 | raise ValueError("label_map_or_file not recognized")
80 | 
81 | if taskTypeMap[taskName] == TaskType.NER:
82 | labelMap[taskName]['[CLS]'] = len(labelMap[taskName])
83 | labelMap[taskName]['[SEP]'] = len(labelMap[taskName])
84 | labelMap[taskName]['X'] = len(labelMap[taskName])
85 | if "O" not in labelMap[taskName]:
86 | labelMap[taskName]["O"] = len(labelMap[taskName])
87 | 
88 | classNumMap[taskName] = len(labelMap[taskName])
89 | 
90 | if "loss_weight" in taskVals:
91 | '''
92 | Loss weight for an individual task. 
This factor
93 | is multiplied directly with the loss calculated
94 | for backpropagation
95 | '''
96 | lossWeightMap[taskName] = float(taskVals["loss_weight"])
97 | else:
98 | lossWeightMap[taskName] = float(1.0)
99 | 
100 | self.classNumMap = classNumMap
101 | self.taskTypeMap = taskTypeMap
102 | self.taskNameIdMap = taskNameIdMap
103 | self.taskIdNameMap = taskIdNameMap
104 | self.modelConfig = modelConfig
105 | self.metricsMap = metricsMap
106 | self.fileNamesMap = fileNamesMap
107 | self.dropoutProbMap = dropoutProbMap
108 | self.lossMap = lossMap
109 | self.labelMap = labelMap
110 | self.lossWeightMap = lossWeightMap
111 | 
112 | def validity_checks(self):
113 | '''
114 | Check whether the yml file is well-formed.
115 | '''
116 | requiredParams = {"task_type", "loss_type", "file_names"}
117 | uniqueModel = set()
118 | uniqueConfig = set()
119 | for taskName, taskVals in self.taskDetails.items():
120 | # check task name
121 | assert taskName.isalpha(), "only alphabets are allowed in task name. No special characters/numbers/whitespaces allowed. Task Name: %s" % taskName
122 | 
123 | # check all required arguments
124 | assert len(requiredParams.intersection(set(taskVals.keys()))) == len(requiredParams), "following parameters are required {}".format(requiredParams)
125 | 
126 | # check if loss and model type are valid
127 | try:
128 | LossType[taskVals["loss_type"]]
129 | ModelType[taskVals["model_type"]]
130 | except KeyError:
131 | print("allowed loss {}".format(list(LossType)))
132 | print("allowed model type {}".format(list(ModelType)))
133 | raise
134 | 
135 | # check metrics if present
136 | if "metrics" in taskVals:
137 | for m in taskVals["metrics"]:
138 | assert m.lower() in METRICS, "allowed metrics are {}".format(METRICS.keys())
139 | 
140 | # check model type; only one model type is allowed for all tasks
141 | uniqueModel.add(ModelType[taskVals["model_type"]])
142 | if "config_name" in taskVals:
143 | uniqueConfig.add(taskVals["config_name"])
144 | 
145 | # check if all data files exist for the task
146 | #for fileName in taskVals['file_names']:
147 | #assert os.path.exists(fileName)
148 | 
149 | # either label_map_or_file or class_num is required
150 | assert "label_map_or_file" in taskVals or "class_num" in taskVals, "either class_num or label_map_or_file is required"
151 | 
152 | # we definitely require a label mapping for NER tasks
153 | if taskVals["task_type"] == 'NER':
154 | assert "label_map_or_file" in taskVals, "Unique Tags/Labels or map file needs to be mentioned in label_map_or_file for NER"
155 | 
156 | assert len(uniqueModel) == 1, "Only one type of model can be shared across all tasks"
157 | assert len(uniqueConfig) <= 1, "Model config has to be the same across all tasks"
158 | 
159 | # return the shared model type
160 | return list(uniqueModel)[0]
161 | 
162 | 
163 | 
--------------------------------------------------------------------------------
/utils/transform_utils.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import os
3 | import joblib
4 | from utils.data_utils import TRANSFORM_FUNCS
5 | 
6 | class TransformParams:
7 | '''
8 | This class keeps the details mentioned in the transform yaml file for the
9 | case when data transformation is required to be performed. 
10 | '''
11 | def __init__(self, transformFilePath):
12 | 
13 | self.transformDetails = yaml.safe_load(open(transformFilePath))
14 | self.validity_checks()
15 | transformFnMap = {}
16 | transformParamsMap = {}
17 | readFileNamesMap = {}
18 | readDirMap = {}
19 | saveDirMap = {}
20 | 
21 | for transformName, transformVals in self.transformDetails.items():
22 | transformFnMap[transformName] = transformVals['transform_func']
23 | transformParamsMap[transformName] = {}
24 | readFileNamesMap[transformName] = list(transformVals['read_file_names'])
25 | readDirMap[transformName] = transformVals['read_dir']
26 | saveDirMap[transformName] = transformVals['save_dir']
27 | 
28 | if 'transform_params' in transformVals:
29 | transformParamsMap[transformName] = dict(transformVals['transform_params'])
30 | 
31 | self.transformFnMap = transformFnMap
32 | self.transformParamsMap = transformParamsMap
33 | self.readFileNamesMap = readFileNamesMap
34 | self.readDirMap = readDirMap
35 | self.saveDirMap = saveDirMap
36 | 
37 | def validity_checks(self):
38 | '''
39 | Check whether the transform yml file is well-formed.
40 | '''
41 | requiredParams = {"transform_func", "read_dir", "read_file_names", "save_dir"}
42 | for transformName, transformVals in self.transformDetails.items():
43 | # check all required arguments
44 | assert len(requiredParams.intersection(set(transformVals.keys()))) == len(requiredParams), "following parameters are required {}".format(requiredParams)
45 | 
46 | # check if the transform function is among the defined transform functions
47 | assert transformVals['transform_func'] in TRANSFORM_FUNCS.keys(), "{} transform fn is not in following defined functions {}".format(transformVals['transform_func'],
48 | TRANSFORM_FUNCS.keys())
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
--------------------------------------------------------------------------------
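To make the YAML-driven configuration above concrete, here is a minimal, self-contained sketch of how TasksParam digests a task file. The task name, tsv file names and output path are hypothetical placeholders; every key used is one handled in utils/task_utils.py above.

from utils.task_utils import TasksParam

# a single-task spec; 'sentiment' and the tsv names are placeholders
spec = """\
sentiment:
  model_type: BERT
  task_type: SingleSenClassification
  file_names:
    - imdb_sentiment_train.tsv
    - imdb_sentiment_dev.tsv
  class_num: 2
  loss_type: CrossEntropyLoss
  metrics:
    - classification_accuracy
"""
with open("tasks_file_demo.yml", "w") as f:
    f.write(spec)

taskParams = TasksParam("tasks_file_demo.yml")
print(taskParams.taskIdNameMap)   # OrderedDict([(0, 'sentiment')])
print(taskParams.taskTypeMap)     # {'sentiment': <TaskType.SingleSenClassification: 1>}
print(taskParams.classNumMap)     # {'sentiment': 2}
print(taskParams.lossMap)         # {'sentiment': <LossType.CrossEntropyLoss: 0>}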