")
37 |
38 | def testGetXUnkToken(self):
39 | tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
40 | result = tfkit.utility.tok.get_topP_unk_token(tokenizer, file_paths=[], topP=0.5)
41 | self.assertFalse(result)
42 | result = tfkit.utility.tok.get_freqK_unk_token(tokenizer, file_paths=[], freqK=10)
43 | self.assertFalse(result)
44 | result = tfkit.utility.tok.get_freqK_unk_token(tokenizer, file_paths=[self.DATASET_DIR + '/unk_tok.csv'],
45 | freqK=1)
46 | self.assertTrue(len(result) > 0)
47 | result = tfkit.utility.tok.get_topP_unk_token(tokenizer, file_paths=[self.DATASET_DIR + '/unk_tok.csv'],
48 | topP=0.9)
49 | self.assertTrue(len(result) > 0)
50 |
51 | def testHandleExceed(self):
52 | tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
53 | seq = " ".join([str(_) for _ in range(100)])
54 | maxlen = 50
55 | for mode in ['noop', 'remove', 'slide', 'start_slice', 'end_slice']:
56 | rlt, _ = tfkit.utility.tok.handle_exceed(tokenizer, seq, maxlen, mode=mode)
57 | if mode == 'remove':
58 | self.assertTrue(len(rlt) == 0)
59 | if mode == 'slide':
60 | self.assertTrue(len(rlt) > 1)
61 | for i in rlt:
62 | print(i)
63 | if mode != 'noop':
64 | self.assertTrue(len(i) == 50)
65 |
--------------------------------------------------------------------------------
/tfkit/task/qa/preprocessor.py:
--------------------------------------------------------------------------------
1 | import nlp2
2 | import tfkit.utility.tok as tok
3 | import torch
4 | from tfkit.utility.data_filereader import get_qa_data_from_file
5 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
6 |
7 |
8 | class Preprocessor(GeneralNLPPreprocessor):
9 | def read_file_to_data(self, path):
10 | return get_qa_data_from_file(path)
11 |
12 | def preprocess_component_prepare_input(self, item):
13 | mapping_index = []
14 | pos = 1 # cls as start 0
15 | input_text_list = nlp2.split_sentence_to_array(item['input'])
16 | for i in input_text_list:
17 | for _ in range(len(self.tokenizer.tokenize(i))):
18 |                 if _ < 1:  # record only the first subword of each character
19 | mapping_index.append({'char': i, 'pos': pos})
20 | pos += 1
21 | item['mapping_index'] = mapping_index
22 | return item
23 |
24 | def preprocess_component_convert_to_id(self, item, **param_dict):
25 | input_text, target = item['input'], item.get('target', None)
26 | tokenized_input = [tok.tok_begin(self.tokenizer)] + input_text + [tok.tok_sep(self.tokenizer)]
27 | input_id = self.tokenizer.convert_tokens_to_ids(tokenized_input)
28 | start_index = item['input_index'][0]
29 | end_index = item['input_index'][1]
30 | if target:
31 | item['target'] = [0, 0]
32 | target_start, target_end = target
33 | ori_start = target_start = int(target_start)
34 | ori_end = target_end = int(target_end)
35 | ori_ans = tokenized_input[ori_start:ori_end]
36 | target_start -= start_index
37 | target_end -= start_index
38 | # print("target_start", self.parameters['maxlen'],item['mapping_index'][target_start]['pos'],ori_end)
39 | # if item['mapping_index'][target_start]['pos'] > ori_end or target_start < 0 \
40 | # or target_start > self.parameters['maxlen'] \
41 | # or target_end >= self.parameters['maxlen'] - 2:
42 | # target_start = 0
43 | # target_end = 0
44 | # else:
45 | for map_pos, map_tok in enumerate(item['mapping_index'][start_index:]):
46 | if start_index < map_tok['pos'] <= end_index:
47 | length = len(self.tokenizer.tokenize(map_tok['char']))
48 | if map_pos < ori_start:
49 | target_start += length - 1
50 | if map_pos < ori_end:
51 | target_end += length - 1
52 | item['target'] = [target_start + 1, target_end + 1] # cls +1
53 |
54 | item['input'] = input_id
55 | item['mask'] = [1] * len(input_id)
56 | item['raw_input'] = tokenized_input
57 | yield item
58 |
59 | def postprocess(self, item, tokenizer, maxlen, **kwargs):
60 | row_dict = {
61 | 'input': item['input'],
62 | 'mask': item['mask']
63 | }
64 | if 'target' in item:
65 | row_dict['target'] = item['target']
66 | return {key: torch.tensor(value) for key, value in row_dict.items()}
67 |
--------------------------------------------------------------------------------
/tfkit/task/once/preprocessor.py:
--------------------------------------------------------------------------------
1 | import tfkit.utility.tok as tok
2 | from tfkit.utility.data_filereader import get_gen_data_from_file
3 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
4 |
5 |
6 | class Preprocessor(GeneralNLPPreprocessor):
7 | def read_file_to_data(self, path):
8 | return get_gen_data_from_file(path)
9 |
10 | def set_global_parameters(self):
11 | self.tokenize_target = True
12 |
13 | def preprocess_component_convert_to_id(self, item, likelihood=['none', 'pos', 'neg', 'both'], **param_dict):
14 | likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood
15 | tokenized_input, tokenized_target, n_target = item['input'], item.get('target', None), item.get('ntarget', None)
16 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
17 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target)}
18 | if "neg" in likelihood:
19 | # formatting neg data in csv
20 | if n_target is None:
21 | ntext_arr = [
22 | tok.tok_sep(self.tokenizer) + self.tokenizer.convert_tokens_to_string(tokenized_target)]
23 | elif tok.tok_sep(self.tokenizer) in n_target:
24 | ntext_arr = [ntext.strip() for ntext in n_target.split(tok.tok_sep(self.tokenizer))]
25 | else:
26 | ntext_arr = [n_target.strip()]
27 | for neg_text in ntext_arr:
28 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
29 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
30 |                    'ntarget': self.tokenizer.encode(neg_text)}  # neg_text is a raw string; encode to ids as in the seq2seq preprocessor
31 |
32 | def postprocess(self, item, tokenizer, maxlen, **kwargs):
33 | tok_pad = tok.tok_pad_id(tokenizer)
34 | tok_bos = tok.tok_begin_id(tokenizer)
35 | tok_sep = tok.tok_sep_id(tokenizer)
36 | tok_mask = tok.tok_mask_id(tokenizer)
37 |
38 | row_dict = {}
39 | t_input_id = item['input']
40 | encoder_mask_id = [1] * (len(t_input_id))
41 | encoder_mask_id.extend([0] * (maxlen - len(encoder_mask_id)))
42 | target_start = len(t_input_id)
43 | target_end = maxlen
44 | target_length = target_end - target_start
45 | t_input_id.extend([tok_pad] * (maxlen - len(t_input_id)))
46 | if 'target' in item and item['target'] is not None:
47 | target = item['target'] + [tok_sep]
48 | target.extend([-1] * (maxlen - len(target)))
49 | row_dict['target'] = target
50 | row_dict['ntarget'] = [-1] * maxlen
51 |         if 'ntarget' in item and len(item['ntarget']) > 0:  # ntarget is a list of token ids here
52 | tokenized_ntarget_id = item['ntarget']
53 | tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
54 | if len(tokenized_ntarget_id) <= maxlen:
55 | row_dict['ntarget'] = tokenized_ntarget_id
56 |
57 | input_length = min(maxlen, target_start * 3)
58 | row_dict['input'] = t_input_id
59 | row_dict['mask'] = encoder_mask_id
60 | row_dict['start'] = target_start
61 | row_dict['end'] = maxlen
62 | row_dict['input_length'] = input_length
63 | row_dict['target_length'] = target_length
64 | return row_dict
65 |
--------------------------------------------------------------------------------
/tfkit/utility/base_model.py:
--------------------------------------------------------------------------------
1 | """Base model class for all TFKit tasks."""
2 |
3 | from abc import ABC, abstractmethod
4 | from typing import Any, Callable, Dict, Optional, Union
5 |
6 | import torch
7 | from torch import nn
8 | from transformers import PreTrainedModel, PreTrainedTokenizer
9 |
10 |
11 | class BaseTFKitModel(nn.Module, ABC):
12 | """Base class for all TFKit task models.
13 |
14 | Provides common functionality for all TFKit models including:
15 | - Consistent initialization patterns
16 | - Predictor setup
17 | - Cache management
18 | - Utility methods for model dimensions
19 | """
20 |
21 | def __init__(self, tokenizer: PreTrainedTokenizer, pretrained: PreTrainedModel,
22 | maxlen: int = 512, **kwargs) -> None:
23 | """Initialize the base model.
24 |
25 | Args:
26 | tokenizer: The tokenizer for text processing
27 | pretrained: The pretrained transformer model
28 | maxlen: Maximum sequence length
29 | **kwargs: Additional arguments passed to subclasses
30 | """
31 | super().__init__()
32 | self.tokenizer = tokenizer
33 | self.pretrained = pretrained
34 | self.maxlen = maxlen
35 |         self.vocab_size = max(pretrained.config.vocab_size, len(tokenizer))
36 |
37 | # Initialize predictor - to be implemented by subclasses
38 | self.predictor: Optional[Any] = None
39 | self.predict: Optional[Callable] = None
40 |
41 | def _setup_predictor(self, predictor_class: type, preprocessor_class: type) -> None:
42 | """Setup predictor and prediction method.
43 |
44 | Args:
45 | predictor_class: The predictor class to instantiate
46 | preprocessor_class: The preprocessor class to use with the predictor
47 | """
48 | predictor = predictor_class(self, preprocessor_class)
49 | self.predictor = predictor
50 | self.predict = predictor.predict
51 |
52 | def clean_cache(self) -> None:
53 | """Clean model cache - default implementation."""
54 | if hasattr(self, 'encoder_outputs'):
55 | self.encoder_outputs = None
56 | if hasattr(self, 'past_key_values'):
57 | self.past_key_values = None
58 |
59 | @abstractmethod
60 | def forward(self, batch_data: Dict[str, Any], eval: bool = False,
61 | **kwargs) -> Union[torch.Tensor, Dict[str, Any]]:
62 | """Forward pass - must be implemented by subclasses.
63 |
64 | Args:
65 | batch_data: Dictionary containing batch data
66 | eval: Whether in evaluation mode
67 | **kwargs: Additional arguments
68 |
69 | Returns:
70 | Loss tensor during training or results dictionary during evaluation
71 | """
72 | pass
73 |
74 | def get_hidden_size(self) -> int:
75 | """Get the hidden size of the pretrained model.
76 |
77 | Returns:
78 | Hidden size dimension
79 | """
80 | return self.pretrained.config.hidden_size
81 |
82 | def get_vocab_size(self) -> int:
83 | """Get the vocabulary size.
84 |
85 | Returns:
86 | Vocabulary size
87 | """
88 | return self.vocab_size
89 |
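90 | 
91 | # Illustrative sketch (hypothetical, not used by any task): a concrete model
92 | # built on BaseTFKitModel needs only a task head, an optional predictor, and a
93 | # ``forward`` implementation; compare tfkit/task/once/model.py.
94 | class _ExampleTaskModel(BaseTFKitModel):
95 |     def __init__(self, tokenizer: PreTrainedTokenizer, pretrained: PreTrainedModel,
96 |                  maxlen: int = 512, **kwargs) -> None:
97 |         super().__init__(tokenizer, pretrained, maxlen, **kwargs)
98 |         # Project hidden states to vocabulary logits.
99 |         self.head = nn.Linear(self.get_hidden_size(), self.get_vocab_size())
100 | 
101 |     def forward(self, batch_data, eval=False, **kwargs):
102 |         hidden = self.pretrained(torch.as_tensor(batch_data['input']),
103 |                                  attention_mask=torch.as_tensor(batch_data['mask']))[0]
104 |         return self.head(hidden)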
--------------------------------------------------------------------------------
/tfkit/task/once/model.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import torch
4 | from torch import nn
5 | from torch.nn.functional import softmax
6 |
7 | from tfkit.task.once import Preprocessor
8 | from tfkit.utility.base_model import BaseTFKitModel
9 | from tfkit.utility.loss import *
10 | from tfkit.utility.predictor import NonAutoRegressivePredictor
11 | from tfkit.utility.tok import *
12 |
13 |
14 | class Model(BaseTFKitModel):
15 | """Once generation model for non-autoregressive text generation."""
16 |
17 | def __init__(self, tokenizer, pretrained, maxlen=512, tasks_detail=None, **kwargs):
18 | super().__init__(tokenizer, pretrained, maxlen, **kwargs)
19 | self.model = nn.Linear(self.get_hidden_size(), self.get_vocab_size())
20 | self._setup_predictor(NonAutoRegressivePredictor, Preprocessor)
21 |
22 | def forward(self, batch_data, eval=False, max_return=1, **kwargs):
23 | inputs = batch_data['input']
24 | masks = batch_data['mask']
25 | starts = batch_data['start']
26 | ends = batch_data['end']
27 | tokens_tensor = torch.as_tensor(inputs)
28 | mask_tensors = torch.as_tensor(masks)
29 |
30 | output = self.pretrained(tokens_tensor, attention_mask=mask_tensors)
31 | sequence_output = output[0]
32 | prediction_scores = self.model(sequence_output)
33 |
34 | if eval:
35 | result_dict = {
36 | 'max_item': [],
37 | 'label_prob': defaultdict(list),
38 | 'prob_list': []
39 | }
40 | start = batch_data['start'][0]
41 | stop = False
42 |             topK_ids = [[] for _ in range(max_return)]  # independent lists; [[]] * n would alias one list
43 | topK_probs = [1] * max_return
44 | while start < self.maxlen and not stop:
45 | softmax_score = softmax(prediction_scores[0][start], dim=0)
46 | max_item_id = torch.argmax(softmax_score, -1).item()
47 | max_item_prob = softmax_score[max_item_id].item()
48 | if max_return > 1:
49 | topK = torch.topk(softmax_score, max_return)
50 | for k, (prob, tid) in enumerate(zip(topK.values.data.tolist(), topK.indices.data.tolist())):
51 | topK_ids[k].append(tid)
52 | topK_probs[k] *= prob
53 | else:
54 | topK_ids[0].append(max_item_id)
55 | topK_probs[0] *= max_item_prob
56 |
57 | if tok_sep_id(self.tokenizer) == max_item_id:
58 | stop = True
59 | start += 1
60 | result_dict['prob_list'] = topK_probs
61 | result_dict['label_prob'] = [[self.tokenizer.decode(ids), prob] for ids, prob in
62 | zip(topK_ids, topK_probs)]
63 | result_dict['max_item'] = [i[0] for i in result_dict['label_prob']]
64 | outputs = result_dict
65 | else:
66 | targets = batch_data['target']
67 | negative_targets = batch_data['ntarget']
68 | loss_tensors = torch.as_tensor(targets)
69 | negativeloss_tensors = torch.as_tensor(negative_targets)
70 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # -1 index = padding token
71 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size),
72 | loss_tensors.view(-1))
73 | if not torch.all(negativeloss_tensors.eq(-1)).item():
74 | negative_loss_fct = NegativeCElLoss()
75 | negative_loss = negative_loss_fct(prediction_scores.view(-1, self.vocab_size),
76 | negativeloss_tensors.view(-1))
77 | masked_lm_loss += negative_loss
78 | outputs = masked_lm_loss
79 |
80 | return outputs
81 |
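82 | # Illustrative note: in eval mode decoding is non-autoregressive -- the encoder
83 | # runs once, and positions from ``start`` onward are read off the same logits
84 | # until a separator token is produced. A hypothetical call:
85 | #
86 | #     result = model(batch, eval=True, max_return=3)
87 | #     result['max_item']  # best decoded string for each of the 3 candidates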
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Pytest configuration and fixtures for TFKit testing."""
2 |
3 | import os
4 | import tempfile
5 | from typing import Dict, List, Any
6 |
7 | import pytest
8 | import torch
9 | from transformers import AutoTokenizer, AutoModel
10 |
11 | from tfkit.utility.constants import DEFAULT_MAXLEN, DEFAULT_BATCH_SIZE
12 |
13 |
14 | @pytest.fixture
15 | def mock_tokenizer():
16 | """Create a mock tokenizer for testing."""
17 | return AutoTokenizer.from_pretrained('bert-base-uncased')
18 |
19 |
20 | @pytest.fixture
21 | def mock_pretrained():
22 | """Create a mock pretrained model for testing."""
23 | return AutoModel.from_pretrained('bert-base-uncased')
24 |
25 |
26 | @pytest.fixture
27 | def mock_batch_data():
28 | """Create mock batch data for testing."""
29 | return {
30 | 'input': torch.randint(0, 1000, (2, 10)),
31 | 'mask': torch.ones(2, 10),
32 | 'target': torch.randint(0, 2, (2, 1)),
33 | 'task': [b'test_task', b'test_task']
34 | }
35 |
36 |
37 | @pytest.fixture
38 | def mock_tasks_detail():
39 | """Create mock tasks detail for classification testing."""
40 | return {
41 | 'test_task': ['label1', 'label2', 'label3']
42 | }
43 |
44 |
45 | @pytest.fixture
46 | def temp_dir():
47 | """Create a temporary directory for testing."""
48 | with tempfile.TemporaryDirectory() as tmp_dir:
49 | yield tmp_dir
50 |
51 |
52 | @pytest.fixture
53 | def sample_training_args():
54 | """Create sample training arguments for testing."""
55 | return {
56 | 'batch': DEFAULT_BATCH_SIZE,
57 | 'lr': [5e-5],
58 | 'epoch': 2,
59 | 'maxlen': DEFAULT_MAXLEN,
60 | 'grad_accum': 1,
61 | 'task': ['clas'],
62 | 'config': 'bert-base-uncased',
63 | 'train': ['dummy_train.csv'],
64 | 'test': ['dummy_test.csv'],
65 | 'savedir': 'test_checkpoints',
66 | 'seed': 42,
67 | 'worker': 1,
68 | 'no_eval': True
69 | }
70 |
71 |
72 | @pytest.fixture
73 | def mock_csv_data():
74 | """Create mock CSV data for testing."""
75 | return """input,target
76 | "This is a test sentence",label1
77 | "Another test sentence",label2
78 | "Third test sentence",label1
79 | """
80 |
81 |
82 | class MockLogger:
83 | """Mock logger for testing."""
84 |
85 | def __init__(self):
86 | self.logs = []
87 | self.metrics = []
88 |
89 | def write_log(self, message: str) -> None:
90 | self.logs.append(message)
91 |
92 | def write_metric(self, name: str, value: Any, step: int) -> None:
93 | self.metrics.append((name, value, step))
94 |
95 | def write_config(self, config: Dict[str, Any]) -> None:
96 | self.logs.append(f"Config: {config}")
97 |
98 |
99 | @pytest.fixture
100 | def mock_logger():
101 | """Create a mock logger for testing."""
102 | return MockLogger()
103 |
104 |
105 | class MockAccelerator:
106 | """Mock accelerator for testing."""
107 |
108 | def __init__(self):
109 | self.state = type('State', (), {'backend': None})()
110 |
111 | def prepare(self, *args):
112 | if len(args) == 1:
113 | return args[0]
114 | return args
115 |
116 | def backward(self, loss):
117 | loss.backward()
118 |
119 | def print(self, *args, **kwargs):
120 | print(*args, **kwargs)
121 |
122 | def wait_for_everyone(self):
123 | pass
124 |
125 | def get_state_dict(self, model):
126 | return model.state_dict()
127 |
128 |
129 | @pytest.fixture
130 | def mock_accelerator():
131 | """Create a mock accelerator for testing."""
132 | return MockAccelerator()
133 |
134 |
135 | @pytest.fixture(autouse=True)
136 | def set_test_environment():
137 | """Set up test environment variables."""
138 | os.environ['TOKENIZERS_PARALLELISM'] = 'false'
139 | os.environ['OMP_NUM_THREADS'] = '1'
140 | yield
141 | # Cleanup is automatic
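142 | 
143 | 
144 | # Illustrative usage (hypothetical test, not part of the suite): pytest injects
145 | # the fixtures above by argument name, e.g.
146 | #
147 | #     def test_model_dimensions(mock_tokenizer, mock_pretrained):
148 | #         assert mock_pretrained.config.hidden_size > 0
149 | #         assert len(mock_tokenizer) > 0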
--------------------------------------------------------------------------------
/tfkit/test/test_zeval.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import tfkit
4 | from tfkit.test import *
5 |
6 |
7 | class TestEval(unittest.TestCase):
8 |
9 | def testHelp(self):
10 | result = os.system('tfkit-eval -h')
11 | self.assertTrue(result == 0)
12 |
13 | def test_parser(self):
14 | parser, _ = tfkit.eval.parse_eval_args(
15 | ['--model', 'once', '--metric', 'emf1', '--valid', 'test.csv', '--print'])
16 | print(parser)
17 | self.assertTrue(parser.get('model') == ['once'])
18 |
19 | eval_parser, model_parser = tfkit.eval.parse_eval_args(
20 | ['--model', 'once', '--metric', 'emf1', '--valid', 'test.csv', '--print', '--decodenum', '2'])
21 | self.assertTrue(eval_parser.get('model') == ['once'])
22 | self.assertTrue(model_parser.get('decodenum') == '2')
23 |
24 | def testEvalGen(self):
25 | tfkit.eval.main(
26 | ['--model', ONCE_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
27 | result = os.system(
28 | 'tfkit-eval --model ' + ONCE_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print')
29 | self.assertTrue(result == 0)
30 |
31 | def testEvalGenOnce(self):
32 | tfkit.eval.main(
33 | ['--model', ONCE_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
34 | result = os.system(
35 | 'tfkit-eval --model ' + ONCE_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print')
36 | self.assertTrue(result == 0)
37 |
38 | def testEvalGenOnceCTC(self):
39 | tfkit.eval.main(
40 | ['--model', ONCECTC_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
41 | result = os.system(
42 | 'tfkit-eval --model ' + ONCECTC_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print')
43 | self.assertTrue(result == 0)
44 |
45 | def testEvalSeq2Seq(self):
46 | tfkit.eval.main(
47 | ['--model', SEQ2SEQ_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print',
48 | '--decodenum', '2'])
49 | tfkit.eval.main(
50 | ['--model', SEQ2SEQ_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
51 | result = os.system(
52 | 'tfkit-eval --model ' + SEQ2SEQ_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print')
53 | self.assertTrue(result == 0)
54 |
55 | def testEvalCLM(self):
56 | tfkit.eval.main(
57 | ['--model', CLM_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
58 | result = os.system(
59 | 'tfkit-eval --model ' + CLM_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print')
60 | self.assertTrue(result == 0)
61 |
62 | def testEvalAddedTokenModel(self):
63 | result = os.system(
64 | 'tfkit-eval --model ' + ADDTOKFILE_MODEL_PATH + ' --valid ' + ADDTOK_DATASET + ' --metric emf1 --print')
65 | self.assertTrue(result == 0)
66 |
67 | def testEvalClassify(self):
68 | tfkit.eval.main(
69 | ['--model', CLAS_MODEL_PATH, '--valid', CLAS_DATASET, '--metric', 'clas', '--print'])
70 | result = os.system(
71 | 'tfkit-eval --model ' + CLAS_MODEL_PATH + ' --valid ' + CLAS_DATASET + ' --metric clas --print')
72 | self.assertTrue(result == 0)
73 |
74 | # def testEvalQA(self):
75 | # tfkit.eval.main(
76 | # ['--model', QA_MODEL_PATH, '--valid', QA_DATASET, '--metric', 'emf1', '--print'])
77 | # result = os.system(
78 | # 'tfkit-eval --model ' + QA_MODEL_PATH + ' --valid ' + QA_DATASET + ' --metric emf1 --print')
79 | # self.assertTrue(result == 0)
80 | #
81 | # def testEvalTag(self):
82 | # tfkit.eval.main(
83 | # ['--model', TAG_MODEL_PATH, '--valid', TAG_DATASET, '--metric', 'clas', '--print'])
84 | # result = os.system(
85 | # 'tfkit-eval --model ' + TAG_MODEL_PATH + ' --valid ' + TAG_DATASET + ' --metric clas --print')
86 | # self.assertTrue(result == 0)
--------------------------------------------------------------------------------
/tfkit/task/tag/preprocessor.py:
--------------------------------------------------------------------------------
1 | import tfkit.utility.tok as tok
2 | from tfkit.utility.data_filereader import get_tag_data_from_file
3 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
4 |
5 | get_data_from_file = get_tag_data_from_file
6 |
7 |
8 | class Preprocessor(GeneralNLPPreprocessor):
9 |
10 | def read_file_to_data(self, path):
11 | return get_tag_data_from_file(path)
12 |
13 | def preprocess(self, item, **param_dict):
14 | input_text, target = item['input'], item.get('target', None)
15 | separator = param_dict.get('separator', ' ')
16 | word_token_mapping = []
17 | token_word_mapping = []
18 | pos = 0
19 |
20 | for word_i, word in enumerate(input_text.split(separator)):
21 | tokenize_word = self.tokenizer.tokenize(word)
22 | for _ in range(len(tokenize_word)):
23 | if _ < 1: # only record first token (one word one record)
24 | word_token_mapping.append({'char': word, 'pos': pos, 'len': len(tokenize_word)})
25 | token_word_mapping.append({'tok': tokenize_word[_], 'word': word, 'pos': len(word_token_mapping) - 1})
26 | pos += 1
27 |
28 | t_input_list, t_pos_list = tok.handle_exceed(self.tokenizer, input_text, self.parameters['maxlen'] - 2,
29 | mode=self.parameters.get('handle_exceed'),
30 | keep_after_sep=False)
31 | preprocessed_data = []
32 | for t_input, t_pos in zip(t_input_list, t_pos_list): # -1 for cls
33 | # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
34 | row_dict = dict()
35 | tokenized_input = [tok.tok_begin(self.tokenizer)] + t_input
36 | input_id = self.tokenizer.convert_tokens_to_ids(tokenized_input)
37 |
38 | if target is not None:
39 | target_token = []
40 | for input_word, target_label in zip(word_token_mapping, target.split(separator)):
41 | if t_pos[0] <= input_word['pos'] < t_pos[1]:
42 | for _ in range(input_word['len']):
43 | target_token += [target_label]
44 |
45 | target_id = [target_token[0]] + target_token
46 |
47 | if len(input_id) != len(target_id):
48 |                 print(list(zip(input_text.split(separator), target.split(separator))))
49 | print(self.tokenizer.decode(input_id))
50 | print(input_id)
51 | print(target_id)
52 | print("input target len not equal ", len(input_id), len(target_id))
53 | continue
54 | row_dict['target'] = target_id
55 |
56 | row_dict['input'] = input_id
57 | row_dict['word_token_mapping'] = word_token_mapping
58 | row_dict['token_word_mapping'] = token_word_mapping
59 | row_dict['end'] = len(input_id)
60 | row_dict['pos'] = t_pos
61 | preprocessed_data.append(row_dict)
62 | return preprocessed_data
63 |
64 | def postprocess(self, item, tokenizer, maxlen, **kwargs):
65 | labels = item['task_dict']
66 | print("item['input']",len(item['input']))
67 | mask_id = [1] * len(item['input'])
68 | mask_id.extend([0] * (maxlen - len(mask_id)))
69 | item['input'].extend([0] * (self.parameters['maxlen'] - len(item['input'])))
70 | row_dict = {
71 | 'input': item['input'],
72 | 'mask': mask_id,
73 | 'pos': item['pos'],
74 | }
75 | # 'token_word_mapping': item['token_word_mapping']
76 | if 'target' in item:
77 | print(labels['tag'])
78 | target_id = [labels['tag'].index(i) for i in item['target']]
79 | if "O" in labels['tag']:
80 | target_id = [labels['tag'].index("O")] + target_id
81 | else:
82 | target_id = [target_id[0]] + target_id
83 | target_id.extend([0] * (self.parameters['maxlen'] - len(target_id)))
84 | row_dict['target'] = target_id
85 |
86 | return row_dict
87 |
--------------------------------------------------------------------------------
/tfkit/task/clm/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn.functional import softmax
4 |
5 | from tfkit.task.clm import Preprocessor
6 | from tfkit.utility.base_model import BaseTFKitModel
7 | from tfkit.utility.predictor import AutoRegressivePredictor
8 |
9 |
10 | class Model(BaseTFKitModel):
11 | """Causal Language Model for text generation."""
12 |
13 | def __init__(self, tokenizer, pretrained, maxlen=512, **kwargs):
14 | super().__init__(tokenizer, pretrained, maxlen, **kwargs)
15 | self.model = self._resolve_output_head()
16 | self.uses_pretrained_head = self.model is not None
17 | if not self.uses_pretrained_head:
18 | self.model = nn.Linear(self.get_hidden_size(), self.get_vocab_size())
19 |
20 | self._setup_predictor(AutoRegressivePredictor, Preprocessor)
21 |
22 | def _resolve_output_head(self):
23 | """Return the pretrained language modeling head if available."""
24 |
25 | if hasattr(self.pretrained, "get_output_embeddings"):
26 | output_embeddings = self.pretrained.get_output_embeddings()
27 | if output_embeddings is not None:
28 | return output_embeddings
29 | if hasattr(self.pretrained, "lm_head"):
30 | return self.pretrained.lm_head
31 | if hasattr(self.pretrained, "cls"):
32 | return self.pretrained.cls
33 | return None
34 |
35 | def forward(self, batch_data, eval=False, beamsearch=False, max_return=1, **kwargs):
36 | inputs = batch_data['input']
37 | masks = batch_data['mask']
38 | tokens_tensor = torch.as_tensor(inputs)
39 | mask_tensors = torch.as_tensor(masks)
40 | model_kwargs = {
41 | 'attention_mask': mask_tensors,
42 | 'return_dict': True,
43 | }
44 | if eval:
45 | model_kwargs['use_cache'] = False
46 |
47 | if eval:
48 | outputs = self.pretrained(tokens_tensor, **model_kwargs)
49 | prediction_scores = outputs['logits'] if 'logits' in outputs else outputs[0]
50 | else:
51 | targets = batch_data['target']
52 | loss_tensors = torch.as_tensor(targets)
53 |
54 | if self.uses_pretrained_head:
55 | labels = loss_tensors.clone().long()
56 | labels[labels == -1] = -100
57 | model_kwargs['labels'] = labels
58 | outputs = self.pretrained(tokens_tensor, **model_kwargs)
59 | prediction_scores = outputs['logits'] if 'logits' in outputs else outputs[0]
60 | masked_lm_loss = outputs['loss']
61 | else:
62 | loss_tensors = loss_tensors.long()
63 | outputs = self.pretrained(tokens_tensor, **model_kwargs)
64 | hidden_states = outputs['last_hidden_state'] if 'last_hidden_state' in outputs else outputs[0]
65 | prediction_scores = self.model(hidden_states)
66 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # -1 index = padding token
67 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size),
68 | loss_tensors.view(-1))
69 |
70 | if eval:
71 | result_dict = {}
72 | start = batch_data['start'][0]
73 | softmax_score = softmax(prediction_scores[0][start], dim=-1).flatten()
74 | max_item_id = torch.argmax(softmax_score, -1).item()
75 | max_item_prob = softmax_score[max_item_id].item()
76 | result_dict['max_item'] = (self.tokenizer.convert_ids_to_tokens(max_item_id), max_item_prob)
77 | if max_return > 1:
78 | topK = torch.topk(softmax_score, max_return)
79 | prob_result = [(self.tokenizer.convert_ids_to_tokens(tid), prob) for prob, tid in
80 | zip(topK.values.data.tolist(), topK.indices.data.tolist())]
81 |                 result_dict['prob_list'] = topK.values.data.tolist()  # probabilities of the top-k tokens
82 | result_dict['label_prob'] = prob_result
83 | outputs = result_dict
84 | else:
85 | outputs = masked_lm_loss
86 | return outputs
87 |
--------------------------------------------------------------------------------
/tfkit/utility/tok.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | import nlp2
4 | from tqdm import tqdm
5 | from transformers import AutoTokenizer
6 |
7 | UNIVERSAL_SEP = "///"
8 |
9 |
10 | def tok_begin(tokenizer):
11 | if tokenizer.special_tokens_map.get('bos_token') is not None:
12 | return tokenizer.special_tokens_map.get('bos_token')
13 | elif tokenizer.special_tokens_map.get('cls_token') is not None:
14 | return tokenizer.special_tokens_map.get('cls_token')
15 | return 'cls'
16 |
17 |
18 | def tok_begin_id(tokenizer):
19 | return tokenizer.convert_tokens_to_ids(tok_begin(tokenizer))
20 |
21 |
22 | def tok_sep(tokenizer):
23 | if tokenizer.special_tokens_map.get('sep_token') is not None:
24 | return tokenizer.special_tokens_map.get('sep_token')
25 | elif tokenizer.special_tokens_map.get('eos_token') is not None:
26 | return tokenizer.special_tokens_map.get('eos_token')
27 | return 'sep'
28 |
29 |
30 | def tok_sep_id(tokenizer):
31 | return tokenizer.convert_tokens_to_ids(tok_sep(tokenizer))
32 |
33 |
34 | def tok_mask(tokenizer):
35 | if tokenizer.special_tokens_map.get('mask_token'):
36 | return tokenizer.special_tokens_map.get('mask_token')
37 | return 'msk'
38 |
39 |
40 | def tok_mask_id(tokenizer):
41 | return tokenizer.convert_tokens_to_ids(tok_mask(tokenizer))
42 |
43 |
44 | def tok_pad(tokenizer):
45 | if tokenizer.special_tokens_map.get('pad_token'):
46 | return tokenizer.special_tokens_map.get('pad_token')
47 | return 'pad'
48 |
49 |
50 | def tok_pad_id(tokenizer):
51 | return tokenizer.convert_tokens_to_ids(tok_pad(tokenizer))
52 |
53 |
54 | def get_all_tok_from_config(config):
55 | tokenizer = AutoTokenizer.from_pretrained(config)
56 | return list(tokenizer.get_vocab().keys())
57 |
58 |
59 | def handle_exceed(tokenizer, seq, maxlen, mode=['noop', 'remove', 'slide', 'start_slice', 'end_slice'],
60 | keep_after_sep=True):
61 | if isinstance(seq, list):
62 | return seq, [[len(seq)]]
63 | mode = mode[0] if isinstance(mode, list) else mode
64 | sep_tok = tok_sep(tokenizer)
65 | sep_split = seq.split(sep_tok)
66 |     ext_seq = ([sep_tok] + tokenizer.tokenize(sep_tok.join(sep_split[1:]))
67 |                if len(sep_split) > 1 and keep_after_sep else [])
68 | t_seq = tokenizer.tokenize(sep_split[0])
69 | if mode == 'noop':
70 | return [t_seq + ext_seq], [[0, len(t_seq + ext_seq)]]
71 | if mode == 'remove':
72 | if len(t_seq + ext_seq) <= maxlen:
73 | return [t_seq + ext_seq], [[0, len(t_seq + ext_seq)]]
74 | else:
75 | return [], [[0, 0]]
76 | if mode == 'slide':
77 | return nlp2.sliding_windows(t_seq, maxlen - len(ext_seq), append_seq=ext_seq)
78 | if mode == 'start_slice':
79 | slices = t_seq[:maxlen - len(ext_seq)]
80 | slices.extend(ext_seq)
81 | return [slices], [[0, maxlen - len(ext_seq)]]
82 | if mode == 'end_slice':
83 | start_pos = len(t_seq) + len(ext_seq) - maxlen
84 | slices = t_seq[start_pos:]
85 | slices.extend(ext_seq)
86 | return [slices], [[max(0, start_pos), len(t_seq)]]
87 |
88 |
89 | def get_topP_unk_token(tokenizer, file_paths: list, topP: float):
90 | unk_count_dict = OrderedDict()
91 | for path in file_paths:
92 | for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
93 | for tok in nlp2.split_sentence_to_array(input_sent):
94 |                 if tokenizer.unk_token in tokenizer.tokenize(tok):
95 | unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
96 |     top_range = int((len(unk_count_dict) + 1) * topP)  # keep the top-P fraction of collected tokens
97 | return list(unk_count_dict.keys())[:top_range]
98 |
99 |
100 | def get_freqK_unk_token(tokenizer, file_paths: list, freqK: int):
101 | unk_count_dict = OrderedDict()
102 | for path in file_paths:
103 | for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
104 | for tok in nlp2.split_sentence_to_array(input_sent):
105 |                 if tokenizer.unk_token in tokenizer.tokenize(tok):
106 | unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
107 | return [key for key, value in unk_count_dict.items() if value >= freqK]
108 |
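109 | # Illustrative usage of ``handle_exceed`` (mirrors the unit tests): for a text
110 | # that tokenizes to 100 tokens with maxlen=50, 'remove' yields no segments,
111 | # 'slide' yields several overlapping 50-token windows, and 'start_slice' /
112 | # 'end_slice' keep the first / last 50 tokens:
113 | #
114 | #     segments, positions = handle_exceed(tokenizer, long_text, 50, mode='slide')
115 | #     for segment in segments:
116 | #         assert len(segment) == 50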
--------------------------------------------------------------------------------
/tfkit/task/tag/model.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from typing import Dict, List, Any, Optional
3 |
4 | import torch
5 | from torch import nn
6 | from torch.nn.functional import softmax
7 |
8 | from tfkit.task.tag import Preprocessor
9 | from tfkit.utility.base_model import BaseTFKitModel
10 | from tfkit.utility.constants import DEFAULT_MAXLEN
11 | from tfkit.utility.loss import FocalLoss
12 | from tfkit.utility.predictor import TaggingPredictor
13 |
14 |
15 | class Model(BaseTFKitModel):
16 | """Sequence tagging model for token classification tasks."""
17 |
18 | def __init__(self, tokenizer, pretrained, tasks_detail: Dict[str, List[str]],
19 | maxlen: int = DEFAULT_MAXLEN, dropout: float = 0.2, **kwargs):
20 | super().__init__(tokenizer, pretrained, maxlen, **kwargs)
21 |
22 | # Initialize tagging-specific components
23 | self.labels = list(tasks_detail.values())[0]
24 | self.dropout = nn.Dropout(dropout)
25 | self.tagger = nn.Linear(self.get_hidden_size(), len(self.labels))
26 | self.loss_fct = FocalLoss()
27 |
28 | self._setup_predictor(TaggingPredictor, Preprocessor)
29 |
30 | def forward(self, batch_data, eval=False, separator=" ", **kwargs):
31 | inputs = batch_data["input"]
32 | masks = batch_data["mask"]
33 |
34 | bert_output = self.compute_bert_output(inputs, masks)
35 |
36 | if eval:
37 | outputs = self.compute_eval_output(batch_data, bert_output)
38 | else:
39 | outputs = self.compute_loss_output(batch_data, bert_output)
40 |
41 | return outputs
42 |
43 | def compute_bert_output(self, inputs, masks):
44 | token_tensor = torch.as_tensor(inputs, dtype=torch.long)
45 | mask_tensors = torch.as_tensor(masks)
46 | bert_output = self.pretrained(token_tensor, attention_mask=mask_tensors)
47 | res = bert_output[0]
48 | pooled_output = self.dropout(res)
49 | reshaped_logits = self.tagger(pooled_output)
50 |
51 | return reshaped_logits
52 |
53 | def compute_eval_output(self, batch_data, reshaped_logits):
54 | result_dict = {
55 | 'label_prob_all': [],
56 | 'label_map': []
57 | }
58 |
59 | ilogit = softmax(reshaped_logits[0], dim=1)
60 | result_labels = ilogit.data.tolist()
61 | start, end = batch_data['pos'][0]
62 | token_word_mapping = batch_data['token_word_mapping']
63 |
64 |         for pos, logit_prob in enumerate(result_labels[1:]):  # skip [CLS] at position 0
65 | if start + pos >= len(token_word_mapping):
66 | break
67 |
68 | word, pos = self.compute_word_pos(token_word_mapping, start, pos)
69 | self.update_result_dict(result_dict, logit_prob, word, pos)
70 |
71 | result_dict['token_word_mapping'] = token_word_mapping[start:end]
72 |
73 | return result_dict
74 |
75 | @staticmethod
76 | def compute_word_pos(token_word_mapping, start, pos):
77 | word = token_word_mapping[start + pos]['word']
78 | pos = token_word_mapping[start + pos]['pos']
79 |
80 | return word, pos
81 |
82 | def update_result_dict(self, result_dict, logit_prob, word, pos):
83 | if len(result_dict['label_map']) > pos:
84 | self.update_existing_result(result_dict, logit_prob, word, pos)
85 | else:
86 | self.append_new_result(result_dict, logit_prob, word)
87 |
88 | def update_existing_result(self, result_dict, logit_prob, word, pos):
89 |         O = Counter(result_dict['label_prob_all'][-1][word])  # probabilities accumulated for this word so far
90 |         N = Counter(dict(zip(self.labels, logit_prob)))  # probabilities from the current subword
91 |         mean_prob = {k: v / 2 for k, v in (O + N).items()}  # element-wise average of the two
92 | result_dict['label_prob_all'][-1] = {word: mean_prob}
93 | result_dict['label_map'][-1] = {
94 | word: max(mean_prob, key=mean_prob.get)}
95 |
96 | def append_new_result(self, result_dict, logit_prob, word):
97 | max_index = logit_prob.index(max(logit_prob))
98 | result_dict['label_map'].append({word: self.labels[max_index]})
99 | result_dict['label_prob_all'].append({word: dict(zip(self.labels, logit_prob))})
100 |
101 | def compute_loss_output(self, batch_data, reshaped_logits):
102 | targets = batch_data["target"]
103 | target_tensor = torch.as_tensor(targets, dtype=torch.long)
104 | loss = self.loss_fct(reshaped_logits.view(-1, len(self.labels)), target_tensor.view(-1))
105 |
106 | return loss
107 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
28 | ## What is it
29 | TFKit is a toolkit mainly for language generation.
30 | It applies transformer models to many tasks within a single all-in-one framework.
31 | Switching tasks usually requires only a small change of config.
32 |
33 | ## Supported Tasks
34 | With transformer models such as BERT/ALBERT/T5/BART:
35 | | Category | Task |
36 | |-|-|
37 | | Text Generation | :memo: seq2seq language model |
38 | | Text Generation | :pen: causal language model |
39 | | Text Generation | :printer: once generation model / once generation model with ctc loss |
40 | | Text Generation | :pencil: onebyone generation model |
41 |
42 | # Getting Started
43 | Learn more from the [document](https://voidful.github.io/TFkit/).
44 |
45 | ## How To Use
46 |
47 | ### Step 0: Install
48 | Install the development branch directly from GitHub:
49 | ```bash
50 | pip install git+https://github.com/voidful/TFkit.git@refactor-dataset
51 | ```
52 |
53 | ### Step 1: Prepare a dataset in CSV format
54 | [Task format](https://voidful.tech/TFkit/tasks/)
55 | ```
56 | input, target
57 | ```
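58 | For example, a minimal classification dataset could look like this (an illustrative sample, mirroring the project's test fixtures):
59 | ```csv
60 | input,target
61 | "This is a test sentence",label1
62 | "Another test sentence",label2
63 | ```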
58 |
59 | ### Step 2: Train model
60 | ```bash
61 | tfkit-train \
62 | --task clas \
63 | --config xlm-roberta-base \
64 | --train training_data.csv \
65 | --test testing_data.csv \
66 | --lr 4e-5 \
67 | --maxlen 384 \
68 | --epoch 10 \
69 | --savedir roberta_sentiment_classifier
70 | ```
71 |
72 | ### Step 3: Evaluate
73 | ```bash
74 | tfkit-eval \
75 | --model roberta_sentiment_classifier/1.pt \
76 | --metric clas \
77 | --valid testing_data.csv
78 | ```
79 |
80 | ## Advanced features
81 |
82 | Multi-task training
83 |
84 | ```bash
85 | tfkit-train \
86 | --task clas clas \
87 | --config xlm-roberta-base \
88 | --train training_data_taskA.csv training_data_taskB.csv \
89 | --test testing_data_taskA.csv testing_data_taskB.csv \
90 | --lr 4e-5 \
91 | --maxlen 384 \
92 | --epoch 10 \
93 | --savedir roberta_sentiment_classifier_multi_task
94 | ```
95 |
96 |
97 | ## Unmaintained tasks
98 | Due to time constraints, the following tasks are temporarily unsupported:
99 | | Category | Task |
100 | |-|-|
101 | | Classification | :label: multi-class and multi-label classification |
102 | | Question Answering | :page_with_curl: extractive qa |
103 | | Question Answering | :radio_button: multiple-choice qa |
104 | | Tagging | :eye_speech_bubble: sequence level tagging / sequence level with crf |
105 | | Self-supervised Learning | :diving_mask: masked language model |
106 |
107 | ## Supplement
108 | - [transformers models list](https://huggingface.co/models): find any pretrained model here
109 | - [nlprep](https://github.com/voidful/NLPrep): download and preprocess data in one line
110 | - [nlp2go](https://github.com/voidful/nlp2go): create a demo API as quickly as possible
111 |
112 |
113 | ## Contributing
114 | Thanks for your interest. There are many ways to contribute to this project. Get started [here](https://github.com/voidful/tfkit/blob/master/CONTRIBUTING.md).
115 |
116 | ## License
117 |
118 | * [License](https://github.com/voidful/tfkit/blob/master/LICENSE)
119 |
120 | ## Icons reference
121 | Icons modified from Freepik, www.flaticon.com
122 | Icons modified from Nikita Golubev, www.flaticon.com
123 |
--------------------------------------------------------------------------------
/tfkit/task/clas/model.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Any
2 |
3 | import torch
4 | from torch import nn, softmax, sigmoid
5 |
6 | from tfkit.task.clas import Preprocessor
7 | from tfkit.utility.base_model import BaseTFKitModel
8 | from tfkit.utility.constants import DEFAULT_MAXLEN, DEFAULT_DROPOUT
9 | from tfkit.utility.loss import FocalLoss, BCEFocalLoss
10 | from tfkit.utility.predictor import ClassificationPredictor
11 |
12 |
13 | class Model(BaseTFKitModel):
14 | """Multi-class and multi-label classification model."""
15 |
16 | def __init__(self, tokenizer, pretrained, tasks_detail: Dict[str, List[str]],
17 | maxlen: int = DEFAULT_MAXLEN, dropout: float = DEFAULT_DROPOUT, **kwargs):
18 | super().__init__(tokenizer, pretrained, maxlen, **kwargs)
19 |
20 | # Initialize classification-specific components
21 | self.dropout = nn.Dropout(dropout)
22 | self.loss_fct = FocalLoss()
23 | self.loss_fct_mt = BCEFocalLoss()
24 |
25 | # Setup multi-task classification heads
26 | self.tasks = dict()
27 | self.tasks_detail = tasks_detail
28 | self.classifier_list = nn.ModuleList()
29 | for task, labels in tasks_detail.items():
30 | self.classifier_list.append(nn.Linear(self.get_hidden_size(), len(labels)))
31 | self.tasks[task] = len(self.classifier_list) - 1
32 |
33 | self._setup_predictor(ClassificationPredictor, Preprocessor)
34 |
35 | def get_all_task(self):
36 | """
37 | list all classification task
38 | :return: tasks list
39 | """
40 | return list(self.tasks.keys())
41 |
42 | def mean_pooling(self, model_output, attention_mask):
43 | """
44 |         Mean pooling that takes the attention mask into account for correct averaging.
45 |         Adapted from https://github.com/UKPLab/sentence-transformers
46 |         (modified: negative mask values are clamped to 0).
47 |         :param model_output: token embeddings, shape (batch, seq_len, hidden)
48 |         :param attention_mask: attention mask, shape (batch, seq_len)
49 |         :return: mean-pooled embeddings, shape (batch, hidden)
50 | """
51 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
52 | input_mask_expanded[input_mask_expanded < 0] = 0
53 | sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
54 | sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
55 | return sum_embeddings / sum_mask
56 |
57 | def forward(self, batch_data, eval=False, **kwargs):
58 |         # convert input to correct data type
59 | tasks = batch_data['task']
60 | tasks = [bytes(t).decode(encoding="utf-8", errors="ignore") for t in tasks]
61 | inputs = torch.as_tensor(batch_data['input'])
62 | targets = torch.as_tensor(batch_data['target'])
63 | masks = torch.as_tensor(batch_data['mask'])
64 | # define model output
65 | result_dict = {
66 | 'max_item': [],
67 | 'prob_list': [],
68 | 'label_prob': []
69 | }
70 |
71 | result_logits = []
72 | result_labels = []
73 | for p, zin in enumerate(zip(tasks, inputs, masks)):
74 | task, input, mask = zin
75 | task_id = self.tasks[task]
76 | task_labels = self.tasks_detail[task]
77 | output = self.pretrained(input.unsqueeze(0), mask.unsqueeze(0))[0]
78 | pooled_output = self.dropout(self.mean_pooling(output, mask.unsqueeze(0)))
79 | classifier_output = self.classifier_list[task_id](pooled_output)
80 |             reshaped_logit = classifier_output.view(-1, len(task_labels))  # shape: (1, num_labels)
81 | result_logits.append(reshaped_logit)
82 | if not eval:
83 | target = targets[p]
84 | result_labels.append(target)
85 | else:
86 | if 'multi_label' in task:
87 | reshaped_logit = sigmoid(reshaped_logit)
88 | else:
89 | reshaped_logit = softmax(reshaped_logit, dim=1)
90 | logit_prob = reshaped_logit[0].data.tolist()
91 | logit_label = dict(zip(task_labels, logit_prob))
92 | result_dict['label_prob'].append({task: logit_label})
93 | if 'multi_label' in task:
94 | result_dict['max_item'].append({task: [k for k, v in logit_label.items() if v > 0.5]})
95 | else:
96 | result_dict['max_item'].append({task: [task_labels[logit_prob.index(max(logit_prob))]]})
97 |
98 | if eval:
99 | outputs = result_dict
100 | else:
101 | loss = 0
102 | for logit, labels, task in zip(result_logits, result_labels, tasks):
103 | if 'multi_label' in task:
104 | loss += self.loss_fct_mt(logit, labels.type_as(logit))
105 | else:
106 | loss += self.loss_fct(logit, labels)
107 | outputs = loss
108 |
109 | return outputs
110 |
--------------------------------------------------------------------------------
/tests/test_model_loader.py:
--------------------------------------------------------------------------------
1 | from types import SimpleNamespace
2 | from unittest.mock import MagicMock
3 |
4 | import pytest
5 |
6 | from tfkit.utility import model as model_utils
7 | from tfkit.utility.model import load_pretrained_model, load_pretrained_tokenizer
8 |
9 |
10 | def _make_config(**overrides):
11 | defaults = {
12 | "is_encoder_decoder": False,
13 | "architectures": [],
14 | "is_decoder": False,
15 | }
16 | defaults.update(overrides)
17 | return SimpleNamespace(**defaults)
18 |
19 |
20 | def test_load_pretrained_model_prefers_seq2seq(monkeypatch):
21 | config = _make_config(is_encoder_decoder=True)
22 |
23 | auto_config = MagicMock()
24 | auto_config.from_pretrained.return_value = config
25 | monkeypatch.setattr(model_utils, "AutoConfig", auto_config)
26 |
27 | seq2seq_loader = MagicMock()
28 | seq2seq_instance = object()
29 | seq2seq_loader.from_pretrained.return_value = seq2seq_instance
30 | monkeypatch.setattr(model_utils, "AutoModelForSeq2SeqLM", seq2seq_loader)
31 |
32 | causal_loader = MagicMock()
33 | monkeypatch.setattr(model_utils, "AutoModelForCausalLM", causal_loader)
34 |
35 | base_loader = MagicMock()
36 | monkeypatch.setattr(model_utils, "AutoModel", base_loader)
37 |
38 | result = load_pretrained_model("mock-model", ["seq2seq"]) # type: ignore[arg-type]
39 |
40 | assert result is seq2seq_instance
41 | seq2seq_loader.from_pretrained.assert_called_once()
42 | causal_loader.from_pretrained.assert_not_called()
43 | base_loader.from_pretrained.assert_not_called()
44 |
45 |
46 | def test_load_pretrained_model_prefers_causal(monkeypatch):
47 | config = _make_config(architectures=["CustomForCausalLM"])
48 |
49 | auto_config = MagicMock()
50 | auto_config.from_pretrained.return_value = config
51 | monkeypatch.setattr(model_utils, "AutoConfig", auto_config)
52 |
53 | seq2seq_loader = MagicMock()
54 | monkeypatch.setattr(model_utils, "AutoModelForSeq2SeqLM", seq2seq_loader)
55 |
56 | causal_loader = MagicMock()
57 | causal_instance = object()
58 | causal_loader.from_pretrained.return_value = causal_instance
59 | monkeypatch.setattr(model_utils, "AutoModelForCausalLM", causal_loader)
60 |
61 | base_loader = MagicMock()
62 | monkeypatch.setattr(model_utils, "AutoModel", base_loader)
63 |
64 | result = load_pretrained_model("mock-model", ["clm"]) # type: ignore[arg-type]
65 |
66 | assert result is causal_instance
67 | causal_loader.from_pretrained.assert_called_once()
68 | base_loader.from_pretrained.assert_not_called()
69 |
70 |
71 | def test_load_pretrained_model_causal_fallback(monkeypatch):
72 | config = _make_config(architectures=["CustomForCausalLM"])
73 |
74 | auto_config = MagicMock()
75 | auto_config.from_pretrained.return_value = config
76 | monkeypatch.setattr(model_utils, "AutoConfig", auto_config)
77 |
78 | seq2seq_loader = MagicMock()
79 | monkeypatch.setattr(model_utils, "AutoModelForSeq2SeqLM", seq2seq_loader)
80 |
81 | causal_loader = MagicMock()
82 | causal_loader.from_pretrained.side_effect = ValueError("missing head")
83 | monkeypatch.setattr(model_utils, "AutoModelForCausalLM", causal_loader)
84 |
85 | base_loader = MagicMock()
86 | base_instance = object()
87 | base_loader.from_pretrained.return_value = base_instance
88 | monkeypatch.setattr(model_utils, "AutoModel", base_loader)
89 |
90 | result = load_pretrained_model("mock-model", ["clm"]) # type: ignore[arg-type]
91 |
92 | assert result is base_instance
93 | base_loader.from_pretrained.assert_called_once()
94 | assert config.is_decoder is True
95 |
96 |
97 | def test_load_pretrained_model_trust_remote_code_env(monkeypatch):
98 | monkeypatch.setenv("TFKIT_TRUST_REMOTE_CODE", "false")
99 |
100 | config = _make_config()
101 | auto_config = MagicMock()
102 | auto_config.from_pretrained.return_value = config
103 | monkeypatch.setattr(model_utils, "AutoConfig", auto_config)
104 |
105 | base_loader = MagicMock()
106 | base_instance = object()
107 | base_loader.from_pretrained.return_value = base_instance
108 | monkeypatch.setattr(model_utils, "AutoModel", base_loader)
109 |
110 | result = load_pretrained_model("mock-model", ["clas"]) # type: ignore[arg-type]
111 |
112 | assert result is base_instance
113 | auto_config.from_pretrained.assert_called_once_with(
114 | "mock-model", trust_remote_code=False
115 | )
116 | base_loader.from_pretrained.assert_called_once()
117 | _, kwargs = base_loader.from_pretrained.call_args
118 | assert kwargs.get("trust_remote_code") is False
119 |
120 |
121 | def test_load_pretrained_tokenizer_respects_env(monkeypatch):
122 | monkeypatch.setenv("TFKIT_TRUST_REMOTE_CODE", "0")
123 |
124 | tokenizer_loader = MagicMock()
125 | monkeypatch.setattr(model_utils, "AutoTokenizer", tokenizer_loader)
126 |
127 | load_pretrained_tokenizer("mock-tokenizer")
128 |
129 | tokenizer_loader.from_pretrained.assert_called_once_with(
130 | "mock-tokenizer", trust_remote_code=False
131 | )
132 |
--------------------------------------------------------------------------------
/tests/test_task_generation.py:
--------------------------------------------------------------------------------
1 | from types import SimpleNamespace
2 |
3 | import torch
4 | from torch import nn
5 |
6 | from tfkit.task.clm.model import Model as CLMModel
7 | from tfkit.task.seq2seq.model import Model as Seq2SeqModel
8 |
9 |
10 | class DummyTokenizer:
11 | def __init__(self, vocab_size):
12 | self.vocab_size = vocab_size
13 |
14 | def __len__(self):
15 | return self.vocab_size
16 |
17 | def convert_ids_to_tokens(self, idx):
18 | return f"token-{idx}"
19 |
20 |
21 | class DummyCausalPretrained(nn.Module):
22 | def __init__(self):
23 | super().__init__()
24 | self.config = SimpleNamespace(vocab_size=5, hidden_size=4)
25 | self.output_layer = nn.Linear(self.config.hidden_size, self.config.vocab_size)
26 | self.last_kwargs = None
27 |
28 | def get_output_embeddings(self):
29 | return self.output_layer
30 |
31 | def forward(self, input_ids, attention_mask=None, return_dict=True, **kwargs):
32 | self.last_kwargs = kwargs
33 | batch_size, seq_len = input_ids.shape
34 | logits = torch.zeros(batch_size, seq_len, self.config.vocab_size)
35 | outputs = {
36 | "logits": logits,
37 | "last_hidden_state": torch.zeros(batch_size, seq_len, self.config.hidden_size),
38 | }
39 | if "labels" in kwargs:
40 | outputs["loss"] = torch.tensor(0.0)
41 | return outputs
42 |
43 |
44 | class DummyEncoderPretrained(nn.Module):
45 | def __init__(self):
46 | super().__init__()
47 | self.config = SimpleNamespace(vocab_size=5, hidden_size=4)
48 | self.last_kwargs = None
49 |
50 | def get_output_embeddings(self):
51 | return None
52 |
53 | def forward(self, input_ids, attention_mask=None, return_dict=True, **kwargs):
54 | self.last_kwargs = kwargs
55 | batch_size, seq_len = input_ids.shape
56 | hidden = torch.zeros(batch_size, seq_len, self.config.hidden_size)
57 | return {"last_hidden_state": hidden}
58 |
59 |
60 | class DummySeq2SeqPretrained(nn.Module):
61 | def __init__(self):
62 | super().__init__()
63 | self.config = SimpleNamespace(vocab_size=3, hidden_size=4)
64 | self.decoder = nn.Module()
65 | self.output_layer = nn.Linear(self.config.hidden_size, self.config.vocab_size)
66 |
67 | def get_output_embeddings(self):
68 | return self.output_layer
69 |
70 | def forward(
71 | self,
72 | input_ids=None,
73 | attention_mask=None,
74 | decoder_input_ids=None,
75 | decoder_attention_mask=None,
76 | output_hidden_states=False,
77 | use_cache=False,
78 | return_dict=True,
79 | **kwargs,
80 | ):
81 | batch_size, seq_len = decoder_input_ids.shape
82 | hidden = torch.zeros(batch_size, seq_len, self.config.hidden_size)
83 | outputs = {
84 | "last_hidden_state": hidden,
85 | "decoder_hidden_states": (hidden,),
86 | }
87 | return outputs
88 |
89 |
90 | def test_clm_model_uses_pretrained_head_for_loss():
91 | tokenizer = DummyTokenizer(vocab_size=5)
92 | pretrained = DummyCausalPretrained()
93 | model = CLMModel(tokenizer=tokenizer, pretrained=pretrained)
94 |
95 | batch = {
96 | "input": torch.zeros((1, 2), dtype=torch.long),
97 | "mask": torch.ones((1, 2), dtype=torch.long),
98 | "target": torch.tensor([[0, -1]]),
99 | }
100 |
101 | loss = model.forward(batch, eval=False)
102 | assert torch.is_tensor(loss)
103 | assert "labels" in pretrained.last_kwargs
104 | assert pretrained.last_kwargs["labels"].tolist() == [[0, -100]]
105 |
106 | eval_batch = {
107 | **batch,
108 | "start": [0],
109 | }
110 | result = model.forward(eval_batch, eval=True)
111 | assert isinstance(result, dict)
112 | assert "max_item" in result
113 |
114 |
115 | def test_clm_model_falls_back_to_linear_head():
116 | tokenizer = DummyTokenizer(vocab_size=5)
117 | pretrained = DummyEncoderPretrained()
118 | model = CLMModel(tokenizer=tokenizer, pretrained=pretrained)
119 |
120 | batch = {
121 | "input": torch.zeros((1, 2), dtype=torch.long),
122 | "mask": torch.ones((1, 2), dtype=torch.long),
123 | "target": torch.tensor([[0, -1]]),
124 | }
125 |
126 | loss = model.forward(batch, eval=False)
127 | assert torch.is_tensor(loss)
128 | assert pretrained.last_kwargs == {}
129 |
130 |
131 | def test_seq2seq_model_uses_pretrained_output_head():
132 | tokenizer = DummyTokenizer(vocab_size=3)
133 | pretrained = DummySeq2SeqPretrained()
134 | model = Seq2SeqModel(tokenizer=tokenizer, pretrained=pretrained)
135 |
136 | batch = {
137 | "input": torch.zeros((1, 1), dtype=torch.long),
138 | "prev": torch.zeros((1, 1), dtype=torch.long),
139 | "encoder_mask": torch.ones((1, 1), dtype=torch.long),
140 | "decoder_mask": torch.ones((1, 1), dtype=torch.long),
141 | "target": torch.zeros((1, 1), dtype=torch.long),
142 | "ntarget": torch.full((1, 1), -1),
143 | }
144 |
145 | loss = model.forward(batch, eval=False)
146 | assert torch.is_tensor(loss)
147 | assert model.model is pretrained.output_layer
148 |
--------------------------------------------------------------------------------
/tfkit/task/seq2seq/preprocessor.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import tfkit.utility.tok as tok
4 | from tfkit.utility.data_filereader import get_gen_data_from_file
5 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
6 |
7 |
8 | class Preprocessor(GeneralNLPPreprocessor):
9 | def read_file_to_data(self, path):
10 | return get_gen_data_from_file(path)
11 |
12 | def set_global_parameters(self):
13 | self.tokenize_target = True
14 |
15 | def preprocess_component_convert_to_id(self, item, likelihood=['none', 'pos', 'neg', 'both'], **param_dict):
16 | likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood
17 | tokenized_input, tokenized_target, n_target, b_target = item['input'], \
18 | item.get('target', None), \
19 | item.get('ntarget', None), \
20 | item.get('btarget', None)
21 | previous = item.get("previous", [])
22 | if tokenized_target is None:
23 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
24 | 'previous': self.tokenizer.convert_tokens_to_ids(previous)}
25 | elif b_target and len(b_target) > 0:
26 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
27 | 'previous': self.tokenizer.convert_tokens_to_ids(previous),
28 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
29 | 'btarget': self.tokenizer.encode(b_target)}
30 | else:
31 | if "neg" in likelihood or 'both' in likelihood:
32 | # formatting neg data in csv
33 | if n_target is None:
34 | ntext_arr = [
35 | tok.tok_sep(self.tokenizer) + self.tokenizer.convert_tokens_to_string(tokenized_target)]
36 | elif tok.tok_sep(self.tokenizer) in n_target:
37 | ntext_arr = [ntext.strip() for ntext in n_target.split(tok.tok_sep(self.tokenizer))]
38 | else:
39 | ntext_arr = [n_target.strip()]
40 | for neg_text in ntext_arr:
41 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
42 | 'previous': self.tokenizer.convert_tokens_to_ids(previous),
43 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
44 | 'ntarget': self.tokenizer.encode(neg_text)}
45 | else:
46 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
47 | 'previous': self.tokenizer.convert_tokens_to_ids(previous),
48 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target)}
49 |
50 | # whole sentence masking
51 | if 'pos' in likelihood:
52 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
53 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
54 | 'previous': self.tokenizer.convert_tokens_to_ids(
55 | [tok.tok_mask(self.tokenizer)] * len(tokenized_target))}
56 | elif 'both' in likelihood:
57 | for neg_text in ntext_arr:
58 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
59 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
60 | 'previous': self.tokenizer.convert_tokens_to_ids(
61 | [tok.tok_mask(self.tokenizer)] * len(tokenized_target)),
62 | 'ntarget': self.tokenizer.encode(neg_text)}
63 |
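64 |     # Note added for clarity (not original source): likelihood='neg'/'both' emits extra
65 |     # samples whose 'ntarget' ids are penalised (e.g. by NegativeCElLoss) during training,
66 |     # while 'pos'/'both' emit a sample whose previous tokens are all mask tokens
67 |     # (whole-sentence masking).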
64 | def postprocess(self, item, tokenizer, maxlen, **kwargs):
65 | t_input_id, previous = item['input'], item['previous']
66 | row_dict = {}
67 | if 'target' in item:
68 | target = item['target']
69 |             tokenized_target_id = []
70 |             if len(previous) == len(target):  # whole-sentence-mask sample ('pos'/'both' likelihood)
71 |                 tokenized_prev_id = [self.tok_mask_id] * maxlen
72 |             else:  # teacher forcing: decoder input is the target shifted right behind a sep token
73 |                 tokenized_prev_id = [self.tok_sep_id] + target
74 |             tokenized_target_id.extend(target + [self.tok_sep_id])
75 | row_dict['target'] = tokenized_target_id
76 | row_dict['target_pad'] = [-1]
77 | row_dict['prev'] = tokenized_prev_id
78 | row_dict['ntarget'] = [-1] * maxlen
79 | if 'ntarget' in item and len(item['ntarget']) > 0:
80 | tokenized_ntarget_id = item['ntarget']
81 | if len(tokenized_ntarget_id) <= maxlen:
82 | row_dict['ntarget'] = tokenized_ntarget_id
83 | if 'btarget' in item and len(item['btarget']) > 0:
84 | row_dict['btarget'] = tokenizer.encode(item['btarget'])
85 | else:
86 | tokenized_prev_id = [self.tok_sep_id]
87 | tokenized_prev_id.extend(previous)
88 | row_dict['prev'] = tokenized_prev_id
89 |
90 | row_dict['input'] = t_input_id
91 | row_dict['encoder_mask'] = [1] * len(t_input_id)
92 | row_dict['decoder_mask'] = [1] * len(tokenized_prev_id)
93 | return {key: torch.tensor(value) for key, value in row_dict.items()}
94 |
--------------------------------------------------------------------------------
/tfkit/utility/dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | from collections import defaultdict
3 | from random import choice
4 |
5 | import joblib
6 | import nlp2
7 | from torch.utils import data
8 | from tqdm.contrib.concurrent import process_map
9 |
10 | from tfkit.utility.constants import CACHE_EXTENSION
11 |
12 | try:
13 | from datasets import load_dataset
14 | except Exception: # pragma: no cover - optional dependency
15 | load_dataset = None
16 |
17 |
18 | def get_dataset(file_path, task_class, tokenizer, parameter):
19 | panel = nlp2.Panel()
20 | # all_arg = nlp2.function_get_all_arg_with_value(task_class.preprocessor.prepare_convert_to_id)
21 | # if parameter.get('panel'):
22 | # print("Operation panel for data preprocessing.")
23 | # for missarg in nlp2.function_check_missing_arg(task_class.preprocessor,
24 | # parameter):
25 | # panel.add_element(k=missarg, v=all_arg[missarg], msg=missarg, default=all_arg[missarg])
26 | # filled_arg = panel.get_result_dict()
27 | # parameter.update(filled_arg)
28 | if load_dataset is not None and not os.path.isfile(file_path):
29 | try:
30 | hf_ds = load_dataset(file_path, split=parameter.get('split', 'train'))
31 | return HFDataset(hf_ds, tokenizer=tokenizer,
32 | preprocessor=task_class.Preprocessor,
33 | preprocessing_arg=parameter)
34 | except Exception:
35 | pass
36 | ds = TFKitDataset(fpath=file_path, tokenizer=tokenizer,
37 | preprocessor=task_class.Preprocessor,
38 | preprocessing_arg=parameter)
39 | return ds
40 |
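41 | # Usage sketch (illustrative only; the file name and parameters are assumptions):
42 | #     ds = get_dataset('train.csv', task_class, tokenizer, {'maxlen': 512})
43 | #     loader = data.DataLoader(ds, batch_size=8)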
41 |
42 | class TFKitDataset(data.Dataset):
43 |     def __init__(self, fpath, tokenizer, preprocessor, preprocessing_arg=None):
44 |         preprocessing_arg = preprocessing_arg or {}  # avoid a shared mutable default, as HFDataset already does
45 |         cache_path = fpath + "_" + tokenizer.name_or_path.replace("/", "_") + CACHE_EXTENSION
45 | self.task_dict = {}
46 | self.preprocessor = preprocessor(tokenizer, kwargs=preprocessing_arg)
47 | self.tokenizer = tokenizer
48 | if os.path.isfile(cache_path) and preprocessing_arg.get('cache', False):
49 | with open(cache_path, "rb") as fo:
50 | outdata = joblib.load(fo)
51 | sample = outdata['sample']
52 | length = outdata['length']
53 | self.task_dict = outdata['task']
54 | else:
55 | print(f"Start preprocessing...")
56 | sample = defaultdict(list)
57 | length = 0
58 | get_data_item = self.preprocessor.read_file_to_data(fpath)
59 | while True:
60 | try:
61 | for items in process_map(self.preprocessor.preprocess, next(get_data_item),
62 | chunksize=1000):
63 | for i in items:
64 | length += 1
65 | for k, v in i.items():
66 | sample[k].append(v)
67 | print(f"loaded {length} data.")
68 | except StopIteration as e:
69 | tasks = e.value
70 | break
71 | self.task_dict = tasks
72 | print(f"There are {length} datas after preprocessing.")
73 | if preprocessing_arg.get('cache', False):
74 | with open(cache_path, 'wb') as fo:
75 | outdata = {'sample': sample, 'task': self.task_dict, 'length': length}
76 | joblib.dump(outdata, fo)
77 | self.length = length
78 | self.sample = sample
79 | self.task = self.task_dict
80 |
81 |     def increase_with_sampling(self, total):
82 |         for _ in range(total - self.length):
83 |             idx = choice(range(self.length))  # pick one whole example so the fields stay aligned
84 |             for key in self.sample.keys():
85 |                 self.sample[key].append(self.sample[key][idx])
86 |         self.length = max(self.length, total)
85 |
86 | def __len__(self):
87 | return self.length
88 |
89 | def __getitem__(self, idx):
90 | return self.preprocessor.postprocess(
91 | {**{'task_dict': self.task_dict}, **{key: self.sample[key][idx] for key in self.sample.keys()}},
92 | self.tokenizer,
93 | maxlen=self.preprocessor.parameters['maxlen'])
94 |
95 |
96 | class HFDataset(data.Dataset):
97 | """Dataset wrapper for the HuggingFace datasets library."""
98 |
99 | def __init__(self, hf_dataset, tokenizer, preprocessor, preprocessing_arg=None):
100 | preprocessing_arg = preprocessing_arg or {}
101 | self.task_dict = {}
102 | self.sample = defaultdict(list)
103 | self.preprocessor = preprocessor(tokenizer, kwargs=preprocessing_arg)
104 | self.tokenizer = tokenizer
105 |
106 | print("Start preprocessing with HuggingFace dataset...")
107 | length = 0
108 | for raw_item in hf_dataset:
109 | for items in self.preprocessor.preprocess(raw_item):
110 | length += 1
111 | for k, v in items.items():
112 | self.sample[k].append(v)
113 | self.length = length
114 | self.task = self.task_dict
115 |
116 | def __len__(self):
117 | return self.length
118 |
119 | def __getitem__(self, idx):
120 | return self.preprocessor.postprocess(
121 | {**{'task_dict': self.task_dict}, **{key: self.sample[key][idx] for key in self.sample.keys()}},
122 | self.tokenizer,
123 | maxlen=self.preprocessor.parameters['maxlen'])
124 |
--------------------------------------------------------------------------------
/tfkit/test/utility/test_utility_loss.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import torch
5 | from torch import nn
6 | from torch.autograd import Variable
7 |
8 | dir_path = os.path.dirname(os.path.realpath(__file__))
9 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
10 |
11 | import unittest
12 | import tfkit
13 |
14 |
15 | class TestLoss(unittest.TestCase):
16 | outputs = Variable(torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]]), requires_grad=False)
17 | targets = Variable(torch.Tensor([1, 1]).long(), requires_grad=False)
18 | alln_targets = Variable(torch.Tensor([-1, -1]).long(), requires_grad=False)
19 | onen_targets = Variable(torch.Tensor([1, -1]).long(), requires_grad=False)
20 |
21 | def testLabelSmoothingCrossEntropy(self):
22 | outputs = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]])
23 | targets = torch.Tensor([1, 1]).long()
24 | alln_targets = torch.Tensor([0, -1]).long()
25 | onen_targets = torch.Tensor([1, -1]).long()
26 |
27 | criterion = nn.CrossEntropyLoss(ignore_index=-1)
28 | custom_criterion = tfkit.utility.loss.LabelSmoothingLoss(3, ignore_index=-1)
29 |
30 | self.assertTrue(criterion(outputs, targets).item() <
31 | custom_criterion(outputs, targets).item())
32 | self.assertTrue(criterion(outputs, onen_targets).item() <
33 | custom_criterion(outputs, onen_targets).item())
34 |
35 | criterion = nn.CrossEntropyLoss()
36 | custom_criterion = tfkit.utility.loss.LabelSmoothingLoss(3)
37 | self.assertTrue(criterion(outputs, targets).item() <
38 | custom_criterion(outputs, targets).item())
39 |
40 | custom_criterion = tfkit.utility.loss.LabelSmoothingLoss(3, reduction='none')
41 | print(custom_criterion(self.outputs, self.targets))
42 | self.assertTrue(list(custom_criterion(self.outputs, self.targets).shape) == [2])
43 |
44 | def testDiceLoss(self):
45 | custom_criterion = tfkit.utility.loss.DiceLoss(ignore_index=-1)
46 | self.assertTrue(0.8 < custom_criterion(self.outputs, self.targets).item() < 1)
47 | self.assertTrue(0.99 < custom_criterion(self.outputs, self.alln_targets).item() <= 1)
48 | self.assertTrue(0.8 < custom_criterion(self.outputs, self.onen_targets).item() < 1)
49 |
50 | custom_criterion = tfkit.utility.loss.DiceLoss(reduction='none')
51 | print(custom_criterion(self.outputs, self.targets))
52 | self.assertTrue(list(custom_criterion(self.outputs, self.targets).shape) == [2])
53 |
54 | def testLossDrop(self):
55 | outputs = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]])
56 | targets = torch.Tensor([1, 1]).long()
57 | norm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
58 | loss_fct = nn.CrossEntropyLoss(reduction='none', ignore_index=-1) # -1 index = padding token
59 | masked_lm_loss = loss_fct(outputs, targets)
60 | masked_lm_loss = masked_lm_loss.view(-1, len(targets)) # view by batch size
61 | masked_lm_loss = masked_lm_loss.sum(dim=0)
62 | masked_lm_loss = masked_lm_loss.mean()
63 |         print(masked_lm_loss.mean(), norm_loss_fct(outputs, targets).mean())
64 |         # sum-then-mean over the per-token losses equals the default mean reduction
65 |         self.assertAlmostEqual(masked_lm_loss.item(), norm_loss_fct(outputs, targets).item(), places=5)
64 |
65 | def testBCEFocalLoss(self):
66 | outputs = torch.Tensor([[0, 1, 0], [0.2, 0, 0]])
67 | targets = torch.Tensor([[0, 1, 0], [1, 0, 0]])
68 | criterion = nn.BCELoss()
69 | custom_criterion = tfkit.utility.loss.BCEFocalLoss()
70 | self.assertTrue(criterion(outputs, targets).item() >
71 | custom_criterion(outputs, targets).item())
72 |
73 | def testNegativeCElLoss(self):
74 | outputs = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]])
75 | targets = torch.Tensor([1, 1]).long()
76 | alln_targets = torch.Tensor([-1, -1]).long()
77 | onen_targets = torch.Tensor([1, -1]).long()
78 |
79 | criterion = nn.CrossEntropyLoss(ignore_index=-1)
80 | custom_criterion = tfkit.utility.loss.NegativeCElLoss()
81 | self.assertTrue(
82 |             criterion(outputs, targets).item() < custom_criterion(outputs, targets).item())
83 | self.assertTrue(criterion(outputs, onen_targets).item() < custom_criterion(outputs, onen_targets).item())
84 |
85 | def testFocalLoss(self):
86 | criterion = nn.CrossEntropyLoss(ignore_index=-1)
87 | custom_criterion = tfkit.utility.loss.FocalLoss(gamma=0)
88 | self.assertAlmostEqual(criterion(self.outputs, self.targets).item(),
89 | custom_criterion(self.outputs, self.targets).item())
90 | self.assertAlmostEqual(criterion(self.outputs, self.alln_targets).item(),
91 | custom_criterion(self.outputs, self.alln_targets).item())
92 | self.assertAlmostEqual(criterion(self.outputs, self.onen_targets).item(),
93 | custom_criterion(self.outputs, self.onen_targets).item())
94 |
95 | custom_criterion = tfkit.utility.loss.FocalLoss(gamma=1)
96 | self.assertTrue(criterion(self.outputs, self.targets) > custom_criterion(self.outputs, self.targets))
97 | self.assertTrue(criterion(self.outputs, self.alln_targets).item() - custom_criterion(self.outputs,
98 | self.alln_targets).item() < 1)
99 | self.assertTrue(criterion(self.outputs, self.onen_targets) > custom_criterion(self.outputs, self.onen_targets))
100 |
--------------------------------------------------------------------------------
/tfkit/utility/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 |
6 |
7 | class BCEFocalLoss(nn.Module):
8 | def __init__(self, gamma=2):
9 | super(BCEFocalLoss, self).__init__()
10 | self.gamma = gamma
11 |
12 | def forward(self, input, target):
13 | BCE_loss = F.binary_cross_entropy_with_logits(input, target, reduction='none')
14 | pt = torch.exp(-BCE_loss) # prevents nans when probability 0
15 | focal_loss = (1 - pt) ** self.gamma * BCE_loss
16 | return focal_loss.mean()
17 |
18 |
19 | class FocalLoss(nn.Module):
20 | def __init__(self, gamma=2, ignore_index=-1):
21 | super(FocalLoss, self).__init__()
22 | self.gamma = gamma
23 | self.softmax = nn.Softmax(dim=1)
24 | self.nll = nn.NLLLoss(ignore_index=ignore_index)
25 |
26 | def forward(self, input, target):
27 | softmax = self.softmax(input)
28 | logpt = torch.log(softmax)
29 |         pt = logpt.detach().exp()  # stop gradients through the modulating factor
30 | return self.nll((1 - pt) ** self.gamma * logpt, target)
31 |
32 |
33 | class SeqCTCLoss(nn.Module):
34 | def __init__(self, blank_index):
35 | super(SeqCTCLoss, self).__init__()
36 | self.blank_index = blank_index
37 |
38 | def forward(self, logits, input_lengths, targets, target_lengths):
39 | # lengths : (batch_size, )
40 | # log_logits : (T, batch_size, n_class), this kind of shape is required for ctc_loss
41 | # log_logits = logits + (logit_mask.unsqueeze(-1) + 1e-45).log()
42 | log_logits = logits.log_softmax(-1).transpose(0, 1)
43 | loss = F.ctc_loss(log_logits,
44 | targets,
45 | input_lengths,
46 | target_lengths,
47 | blank=self.blank_index,
48 | reduction='mean',
49 | zero_infinity=True)
50 | return loss
51 |
52 |
53 | class SelfKDLoss(nn.Module):
54 |
55 |     def __init__(self, alpha=0.1, temperature=2, ignore_index=-1):
56 | super(SelfKDLoss, self).__init__()
57 | self.alpha = alpha
58 | self.temperature = temperature
59 | self.ignore_index = ignore_index
60 |
61 |     def forward(self, outputs, teacher_outputs, labels):
62 |         soft_loss = nn.KLDivLoss()(F.log_softmax(outputs / self.temperature, dim=-1),
63 |                                    F.softmax(teacher_outputs / self.temperature, dim=-1))
64 |         hard_loss = F.cross_entropy(outputs, labels, ignore_index=self.ignore_index)
65 |         return (soft_loss * self.alpha * self.temperature * self.temperature
66 |                 + hard_loss * (1. - self.alpha))
67 |
68 |
69 | class DiceLoss(nn.Module):
70 | """From 'Dice Loss for Data-imbalanced NLP Tasks'"""
71 |
72 | def __init__(self, ignore_index=None, reduction='mean'):
73 | super(DiceLoss, self).__init__()
74 | self.ignore_index = ignore_index
75 | self.reduction = reduction
76 |
77 |     def forward(self, y_pred, y_true):
78 |         y_pred = torch.softmax(y_pred, dim=1)
79 |         if self.ignore_index is not None:
80 |             mask = y_true == self.ignore_index  # was hard-coded to -1
81 |             filtered_target = y_true.clone()  # avoid mutating the caller's tensor in place
82 |             filtered_target[mask] = 0
83 |             y_true = filtered_target
84 |             mask = mask.unsqueeze(1).expand(y_pred.data.size())
85 |             y_pred[mask] = 0
86 |         pred_prob = torch.gather(y_pred, dim=1, index=y_true.unsqueeze(1))
87 | dsc_i = 1 - ((1 - pred_prob) * pred_prob) / ((1 - pred_prob) * pred_prob + 1)
88 | if self.reduction == 'mean':
89 | return dsc_i.mean()
90 | else:
91 | return dsc_i.view(-1)
92 |
93 |
94 | class NegativeCElLoss(nn.Module):
95 | def __init__(self, ignore_index=-1, reduction='mean'):
96 | super(NegativeCElLoss, self).__init__()
97 | self.softmax = nn.Softmax(dim=1)
98 | self.alpha = 1
99 | self.nll = nn.NLLLoss(ignore_index=ignore_index, reduction=reduction)
100 |
101 | def forward(self, input, target):
102 | nsoftmax = self.softmax(input)
103 | nsoftmax = torch.clamp((1.0 - nsoftmax), min=1e-32)
104 | return self.nll(torch.log(nsoftmax) * self.alpha, target)
105 |
106 |
107 | class LabelSmoothingLoss(nn.Module):
108 | def __init__(self, classes, smoothing=0.1, dim=-1, ignore_index=None, reduction='mean'):
109 | super(LabelSmoothingLoss, self).__init__()
110 | self.confidence = 1.0 - smoothing
111 | self.smoothing = smoothing
112 | self.cls = classes
113 | self.dim = dim
114 | self.reduction = reduction
115 | self.ignore_index = ignore_index
116 |
117 | def forward(self, pred, target):
118 | pred = pred.log_softmax(dim=self.dim)
119 | with torch.no_grad():
120 | true_dist = torch.zeros_like(pred)
121 | true_dist.fill_(self.smoothing / (self.cls - 1))
122 | if self.ignore_index is not None:
123 |                 mask = target == self.ignore_index
124 | filtered_target = target.clone()
125 | filtered_target[mask] = 0
126 | true_dist.scatter_(1, filtered_target.unsqueeze(1), self.confidence)
127 | mask = mask.unsqueeze(1).expand(pred.data.size())
128 | true_dist[mask] = 0
129 | else:
130 | true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
131 | if self.reduction == 'mean':
132 | return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))
133 | else:
134 | return torch.sum(-true_dist * pred, dim=self.dim)
135 |
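136 | # Usage sketch (illustrative, not part of the original source):
137 | #     criterion = LabelSmoothingLoss(classes=3, smoothing=0.1, ignore_index=-1)
138 | #     loss = criterion(logits, labels)  # logits: (batch, 3); labels: (batch,), -1 ignored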
--------------------------------------------------------------------------------
/tfkit/task/oncectc/model.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import torch
4 | from torch import nn
5 | from torch.nn.functional import softmax
6 |
7 | from tfkit.task.once import Preprocessor
8 | from tfkit.utility.base_model import BaseTFKitModel
9 | from tfkit.utility.constants import BLANK_TOKEN
10 | from tfkit.utility.loss import NegativeCElLoss
11 | from tfkit.utility.loss import SeqCTCLoss
12 | from tfkit.utility.predictor import NonAutoRegressivePredictor
13 | from tfkit.utility.tok import *
14 |
15 |
16 | class Model(BaseTFKitModel):
17 | """Once generation model with CTC loss for non-autoregressive text generation."""
18 |
19 | def __init__(self, tokenizer, pretrained, maxlen=512, tasks_detail=None, **kwargs):
20 | super().__init__(tokenizer, pretrained, maxlen, **kwargs)
21 |
22 | # Setup CTC-specific components
23 | self.blank_token = BLANK_TOKEN
24 | self.tokenizer.add_tokens(self.blank_token)
25 | self.pretrained.resize_token_embeddings(len(tokenizer))
26 | self.blank_index = self.tokenizer.convert_tokens_to_ids([self.blank_token])[0]
27 | self.loss = SeqCTCLoss(blank_index=self.blank_index)
28 |
29 | # Update vocab size after adding tokens
30 |         self.vocab_size = max(self.pretrained.config.vocab_size, len(self.tokenizer))
31 | self.model = nn.Linear(self.get_hidden_size(), self.vocab_size)
32 |
33 | self._setup_predictor(NonAutoRegressivePredictor, Preprocessor)
34 |
35 | def forward(self, batch_data, eval=False, max_return=1, **kwargs):
36 | inputs = batch_data['input']
37 | masks = batch_data['mask']
38 | starts = batch_data['start']
39 | ends = batch_data['end']
40 | tokens_tensor = torch.as_tensor(inputs)
41 | mask_tensors = torch.as_tensor(masks)
42 |
43 | output = self.pretrained(tokens_tensor, attention_mask=mask_tensors)
44 | sequence_output = output[0]
45 | prediction_scores = self.model(sequence_output)
46 | batch_size = list(tokens_tensor.shape)[0]
47 | prediction_scores = prediction_scores.view(batch_size, -1, self.vocab_size)
48 |
49 | if eval:
50 | result_dict = {
51 | 'max_item': [],
52 | 'label_prob': defaultdict(list),
53 | 'prob_list': []
54 | }
55 | start = batch_data['start'][0]
56 |             topK_ids = [[] for _ in range(max_return)]  # independent lists; [[]] * n would alias a single list
57 | topK_probs = [1] * max_return
58 |
59 | pscore = prediction_scores.detach().cpu()
60 | predicted_indexs = pscore.argmax(2).tolist()[0]
61 | predicted_tokens = self.tokenizer.convert_ids_to_tokens(predicted_indexs)
62 |             # CTC greedy decoding: collapse repeated frames, drop blank/pad tokens, stop at sep
63 |             for pos, (predicted_index, predicted_token) in enumerate(zip(predicted_indexs, predicted_tokens)):
64 |                 if pos > 0 and predicted_index == predicted_indexs[pos - 1]:
65 |                     continue
66 | if predicted_token == self.blank_token:
67 | continue
68 | if predicted_token == tok_pad(self.tokenizer):
69 | continue
70 | if predicted_token == tok_sep(self.tokenizer):
71 | break
72 |
73 | softmax_score = softmax(prediction_scores[0][pos], dim=0)
74 | max_item_id = torch.argmax(softmax_score, -1).item()
75 | max_item_prob = softmax_score[max_item_id].item()
76 | if max_return > 1:
77 | topK = torch.topk(softmax_score, max_return)
78 | for k, (prob, tid) in enumerate(zip(topK.values.data.tolist(), topK.indices.data.tolist())):
79 | topK_ids[k].append(tid)
80 | topK_probs[k] *= prob
81 | else:
82 | topK_ids[0].append(max_item_id)
83 | topK_probs[0] *= max_item_prob
84 | start += 1
85 |
86 | result_dict['prob_list'] = topK_probs
87 | result_dict['label_prob'] = [[self.tokenizer.decode(ids), prob] for ids, prob in
88 | zip(topK_ids, topK_probs)]
89 | result_dict['max_item'] = [i[0] for i in result_dict['label_prob']]
90 | outputs = result_dict
91 | else:
92 | targets = batch_data['target']
93 | negative_targets = batch_data['ntarget']
94 | input_lengths = batch_data['input_length']
95 | target_lengths = batch_data['target_length']
96 |
97 | target_tensors = torch.as_tensor(targets)
98 | input_length_tensors = torch.as_tensor(input_lengths)
99 | target_length_tensors = torch.as_tensor(target_lengths)
100 |
101 |             loss_tensors = torch.as_tensor(targets)  # same targets, reused for the auxiliary cross-entropy term
102 | negativeloss_tensors = torch.as_tensor(negative_targets)
103 | ctc_lm_loss = self.loss(prediction_scores,
104 | input_length_tensors,
105 | target_tensors.view(batch_size, -1),
106 | target_length_tensors)
107 |
108 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # -1 index = padding token
109 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size),
110 | loss_tensors.view(-1))
111 | if not torch.all(negativeloss_tensors.eq(-1)).item():
112 | negative_loss_fct = NegativeCElLoss()
113 | negative_loss = negative_loss_fct(prediction_scores.view(-1, self.vocab_size),
114 | negativeloss_tensors.view(-1))
115 | masked_lm_loss += negative_loss
116 | outputs = ctc_lm_loss + masked_lm_loss
117 |
118 | return outputs
119 |
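120 | # Usage sketch (illustrative, not part of the original source):
121 | #     loss = model(batch, eval=False)                 # scalar training loss (CTC + CE)
122 | #     result = model(batch, eval=True, max_return=3)  # dict with 'max_item', 'label_prob', 'prob_list'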
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 |
22 | ## Getting started
23 |
24 | ### Installing via pip
25 | ```bash
26 | pip install tfkit
27 | ```
28 |
29 | * You can use tfkit for model training and evaluation with `tfkit-train` and `tfkit-eval`.
30 |
31 | ### Running TFKit on the task you want
32 |
33 | ### First step - prepare your dataset
34 | The key to combining different tasks is that they all share the same data format.
35 |
36 | **notice**
37 |
38 | * All data is in csv format - tfkit uses **csv** for every task; normally there are two columns, where the first column is the model input and the second column is the model output.
39 | * Plain text with no tokenization - there is no need to tokenize text before training or to recompute any tokenization; tfkit handles it for you.
40 | * No header row is needed.
41 |
42 | For example, a sentiment classification dataset will look like:
43 | ```csv
44 | how dare you,negative
45 | ```
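46 |
47 | A text generation dataset follows the same two-column convention - input first, expected output second. The summarization pair below is illustrative only:
48 | ```csv
49 | "a very long article about the risks of cloning ...","cloning carries serious risks"
50 | ```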
46 |
47 | !!! hint
48 |     For details and example formats of the different tasks, check [here](tasks/)
49 |
50 | !!! hint
51 |     nlprep is a tool for data splitting/preprocessing/augmentation; it can help you create ready-to-train data for tfkit, check [here](https://github.com/voidful/NLPrep)
52 |
53 | ### Second step - model training
54 |
55 | Use `tfkit-train` for model training.
56 |
57 | Before training a model, there are a few things you need to decide:
58 |
59 | - `--task` which task should handle this dataset? check [here](models/) for details of the available models.
60 | - `--config` which pretrained model do you want to use? search [https://huggingface.co/models](https://huggingface.co/models) for available pretrained models.
61 | - `--train` and `--test` paths of the training and testing datasets, in csv format.
62 | - `--savedir` model saving directory; defaults to the 'checkpoints' folder.
63 |
64 | You can leave the rest at the default configuration, or use `tfkit-train -h` to see more options.
65 |
66 | An example of training a sentiment classifier:
67 | ```bash
68 | tfkit-train \
69 | --task clas \
70 | --config xlm-roberta-base \
71 | --train training_data.csv \
72 | --test testing_data.csv \
73 | --lr 4e-5 \
74 | --maxlen 384 \
75 | --epoch 10 \
76 | --savedir roberta_sentiment_classifier
77 | ```
78 |
79 | ### Third step - model evaluation
80 |
81 | Use `tfkit-eval` for model evaluation.
82 | - `--model` the saved model's path.
83 | - `--metric` the evaluation metric, e.g. emf1, nlg (BLEU/ROUGE), clas (confusion matrix).
84 | - `--valid` the validation data, also in csv format.
85 | - `--panel` an input panel for model-specific parameters.
86 |
87 | For more configuration details, use `tfkit-eval -h`. An example invocation is shown below.
88 |
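89 | A hypothetical invocation, evaluating the sentiment classifier trained above (the checkpoint path, metric, and data file are illustrative assumptions, not fixed names):
90 | ```bash
91 | tfkit-eval \
92 |   --model roberta_sentiment_classifier/10.pt \
93 |   --metric clas \
94 |   --valid validation_data.csv
95 | ```
96 |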
89 | After evaluation, the result is printed to your console, and three reports are generated for debugging.
90 | - `*_score.csv` the overall score; a copy of the console result.
91 | - `*each_data_score.csv` the score of each example, with 3 columns `predicted,targets,score`, ranked from lowest to highest.
92 | - `*predicted.csv` a csv file with 3 columns `input,predicted,targets`.
93 |
94 | !!! hint
95 |     nlp2go is a demonstration tool with CLI and RESTful interfaces; check [here](https://github.com/voidful/nlp2go)
96 |
97 | ### Example
98 | #### Use distilbert to train NER Model
99 | ```bash
100 | nlprep --dataset tag_clner --outdir ./clner_row --util s2t
101 | tfkit-train --batch 10 --epoch 3 --lr 5e-6 --train ./clner_row/train --test ./clner_row/test --maxlen 512 --task tag --config distilbert-base-multilingual-cased
102 | nlp2go --task ./checkpoints/3.pt --cli
103 | ```
104 |
105 | #### Use Albert to train a DRCD Model
106 | ```bash
107 | nlprep --dataset qa_zh --outdir ./zhqa/
108 | tfkit-train --maxlen 512 --savedir ./drcd_qa_model/ --train ./zhqa/drcd-train --test ./zhqa/drcd-test --task qa --config voidful/albert_chinese_small --cache
109 | nlp2go --task ./drcd_qa_model/3.pt --cli
110 | ```
111 |
112 | #### Use Albert to train both DRCD and NER Models
113 | ```bash
114 | nlprep --dataset tag_clner --outdir ./clner_row --util s2t
115 | nlprep --dataset qa_zh --outdir ./zhqa/
116 | tfkit-train --maxlen 300 --savedir ./mt-qaner --train ./clner_row/train ./zhqa/drcd-train --test ./clner_row/test ./zhqa/drcd-test --task tag qa --config voidful/albert_chinese_small
117 | nlp2go --task ./mt-qaner/3.pt --cli
118 | ```
119 |
120 | **You can also try tfkit in [Google Colab](https://colab.research.google.com/drive/1hqaTKxd3VtX2XkvjiO0FMtY-rTZX30MJ?usp=sharing)**
121 |
122 | ## Contributing
123 | Thanks for your interest. There are many ways to contribute to this project. Get started [here](https://github.com/voidful/tfkit/blob/master/CONTRIBUTING.md).
124 |
125 | ## License
126 | 
127 |
128 | * [License](https://github.com/voidful/tfkit/blob/master/LICENSE)
129 |
130 | ## Icons reference
131 | Icons modified from Freepik, www.flaticon.com
132 | Icons modified from Nikita Golubev, www.flaticon.com
133 |
--------------------------------------------------------------------------------
/demo_data/mcq.csv:
--------------------------------------------------------------------------------
1 | "I 'm sure many of you have seen Star Wars , Jurassic Park , Multiplicity , or many of the other movies that describe cloning . Most of what you see in these movies is false . What you do n't know is that cloning could be dangerous , to the clone and to our society as a whole . I think human cloning is wrong mainly for four reasons . What about identity ? Humans are promised the right to their own personalities . What would happen if we ignore those rights by giving them someone else 's genetic identity ? True , Cloning may prevent people from possessing their identities . Also , these is a large power struggle here . Cloning means a degree of power and controls over another person 's physical identity and that ignores their rights and their only personalities . The person doing the cloning would have more power than any parent would have . Cloning would also deal with killing embryos . You might not have known , but Dolly , the sheep that was cloned in 1996 , was one of over 200 sheep embryos and hers was the only embryo that survived . The rest died or were thrown away . Imagine if the failure rate was that high when we started to clone humans . cloning means running the risk of wasting too much effort Cloning someone , at this present time , would be extremely dangerous to the birth mother and the clone . In studies done on cows , 4 out of 12 birth mothers died . There is a very high failure rate , which is shown in the cloning of Dolly . Even if you had a few good embryos , failures have been noticeable in animal tests . So , should we work ahead in the world of cloning ? I say no . the risks are greater than the benefits . It 's dangerous to the clone and to the birth mother . We would be killing human lives in the process . It would also be a violation of the clone 's right to its own genetic identity and personality .