├── .github
└── workflows
│ └── python-package.yml
├── .gitignore
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.md
├── demo_data
├── classification.csv
├── generation.csv
├── mask.csv
├── mcq.csv
├── qa.csv
├── tag.csv
├── tok_list.txt
└── unk_tok.csv
├── docs
├── benchmark.md
├── img
│ ├── flow.png
│ ├── tfkit-icon.png
│ └── tfkit.png
├── index.md
├── installation.md
├── models.md
├── structure.md
└── tasks.md
├── mkdocs.yml
├── requirements.txt
├── setup.py
└── tfkit
├── __init__.py
├── dump.py
├── eval.py
├── task
├── __init__.py
├── clas
│ ├── __init__.py
│ ├── model.py
│ └── preprocessor.py
├── clm
│ ├── __init__.py
│ ├── model.py
│ └── preprocessor.py
├── once
│ ├── __init__.py
│ ├── model.py
│ └── preprocessor.py
├── oncectc
│ ├── __init__.py
│ └── model.py
├── qa
│ ├── __init__.py
│ ├── model.py
│ └── preprocessor.py
├── seq2seq
│ ├── __init__.py
│ ├── model.py
│ └── preprocessor.py
└── tag
│ ├── __init__.py
│ ├── model.py
│ └── preprocessor.py
├── test
├── __init__.py
├── task
│ └── test_task_model.py
├── test_atrain.py
├── test_package.py
├── test_zeval.py
├── test_zzdump.py
└── utility
│ ├── test_utility_data_filereader.py
│ ├── test_utility_data_loader.py
│ ├── test_utility_data_processor.py
│ ├── test_utility_dataset.py
│ ├── test_utility_eval_metric.py
│ ├── test_utility_logger.py
│ ├── test_utility_loss.py
│ ├── test_utility_model.py
│ └── test_utility_tok.py
├── train.py
└── utility
├── __init__.py
├── data_filereader.py
├── data_loader.py
├── data_processor.py
├── dataset.py
├── eval_metric.py
├── logger.py
├── loss.py
├── model.py
├── predictor.py
└── tok.py
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | pull_request:
10 | branches: [ master ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | matrix:
18 | python-version: [ 3.9 ]
19 |
20 | steps:
21 | - uses: actions/checkout@v2
22 | - name: Set up Python ${{ matrix.python-version }}
23 | uses: actions/setup-python@v2
24 | with:
25 | python-version: ${{ matrix.python-version }}
26 | - uses: actions/cache@v2
27 | with:
28 | path: ~/.cache/pip
29 | key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
30 | restore-keys: |
31 | ${{ runner.os }}-pip-
32 | - name: Install dependencies
33 | run: |
34 | python -m pip install --upgrade pip
35 | pip install flake8 pytest
36 | pip install -r requirements.txt
37 | pip install .
38 | - name: Lint with flake8
39 | run: |
40 | # stop the build if there are Python syntax errors or undefined names
41 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
42 | - name: Test with pytest
43 | run: |
44 | pytest
45 | - name: Generate coverage report
46 | run: |
47 | pip install pytest-cov
48 | pytest --cov=./ --cov-report=xml
49 | - name: Upload coverage to Codecov
50 | uses: codecov/codecov-action@v1
51 | with:
52 | fail_ci_if_error: false
53 | verbose: false
54 | - name: Build
55 | run: |
56 | python setup.py install
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # General
2 | .DS_Store
3 | .AppleDouble
4 | .LSOverride
5 |
6 | # Icon must end with two \r
7 | Icon
8 |
9 | # Thumbnails
10 | ._*
11 |
12 | # Files that might appear in the root of a volume
13 | .DocumentRevisions-V100
14 | .fseventsd
15 | .Spotlight-V100
16 | .TemporaryItems
17 | .Trashes
18 | .VolumeIcon.icns
19 | .com.apple.timemachine.donotpresent
20 |
21 | # Directories potentially created on remote AFP share
22 | .AppleDB
23 | .AppleDesktop
24 | Network Trash Folder
25 | Temporary Items
26 | .apdisk
27 |
28 | # IntelliJ project files
29 | .idea
30 | *.iml
31 | out
32 | gen
### Example user template
34 |
35 | # IntelliJ project files
36 | .idea
37 | *.iml
38 | out
39 | gen
### Python template
40 | # Byte-compiled / optimized / DLL files
41 | __pycache__/
42 | *.py[cod]
43 | *$py.class
44 |
45 | # C extensions
46 | *.so
47 |
48 | # Distribution / packaging
49 | .Python
50 | build/
51 | develop-eggs/
52 | dist/
53 | downloads/
54 | eggs/
55 | .eggs/
56 | lib/
57 | lib64/
58 | parts/
59 | sdist/
60 | var/
61 | wheels/
62 | *.egg-info/
63 | .installed.cfg
64 | *.egg
65 | MANIFEST
66 |
67 | # PyInstaller
68 | # Usually these files are written by a python script from a template
69 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
70 | *.manifest
71 | *.spec
72 |
73 | # Installer logs
74 | pip-log.txt
75 | pip-delete-this-directory.txt
76 |
77 | # Unit test / coverage reports
78 | htmlcov/
79 | .tox/
80 | .coverage
81 | .coverage.*
82 | .cache
83 | nosetests.xml
84 | coverage.xml
85 | *.cover
86 | .hypothesis/
87 | .pytest_cache/
88 |
89 | # Translations
90 | *.mo
91 | *.pot
92 |
93 | # Django stuff:
94 | *.log
95 | local_settings.py
96 | db.sqlite3
97 |
98 | # Flask stuff:
99 | instance/
100 | .webassets-cache
101 |
102 | # Scrapy stuff:
103 | .scrapy
104 |
105 | # Sphinx documentation
106 | docs/_build/
107 |
108 | # PyBuilder
109 | target/
110 |
111 | # Jupyter Notebook
112 | .ipynb_checkpoints
113 |
114 | # pyenv
115 | .python-version
116 |
117 | # celery beat schedule file
118 | celerybeat-schedule
119 |
120 | # SageMath parsed files
121 | *.sage.py
122 |
123 | # Environments
124 | .env
125 | .venv
126 | env/
127 | venv/
128 | ENV/
129 | env.bak/
130 | venv.bak/
131 |
132 | # Spyder project settings
133 | .spyderproject
134 | .spyproject
135 |
136 | # Rope project settings
137 | .ropeproject
138 |
139 | # mkdocs documentation
140 | /site
141 |
142 | # how2
143 | .how2
144 | how2
145 | /how2
146 |
147 | # test cache
148 | ./tfkit/test/cache
149 | /tfkit/test/cache
150 | tfkit/test/cache
151 |
152 | # test cache
153 | ./tfkit/test/runs
154 | /tfkit/test/runs
155 | tfkit/test/runs
156 |
157 | ./tfkit/test/wandb
158 | /tfkit/test/wandb
159 | tfkit/test/wandb
160 |
161 | # cache
162 | ./cache
163 | cache
164 | /cache
165 |
166 | # mypy
167 | .mypy_cache/
168 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to tfkit
2 | We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's:
3 |
4 | - Reporting a bug
5 | - Discussing the current state of the code
6 | - Submitting a fix
7 | - Proposing new features
8 | - Becoming a maintainer
9 |
10 | ## We Develop with Github
11 | We use GitHub to host code, to track issues and feature requests, as well as accept pull requests.
12 |
13 | ## We Use [Github Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests
14 | Pull requests are the best way to propose changes to the codebase (we use [Github Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests:
15 |
16 | 1. Fork the repo and create your branch from `master`.
17 | 2. If you've added code that should be tested, add tests.
18 | 3. If you've changed APIs, update the documentation.
19 | 4. Ensure the test suite passes.
20 | 5. Make sure your code lints.
21 | 6. Issue that pull request!
22 |
23 | ## Any contributions you make will be under the Apache 2.0 Software License
24 | In short, when you submit code changes, your submissions are understood to be under the same [Apache 2.0 License](https://choosealicense.com/licenses/apache-2.0/) that covers the project. Feel free to contact the maintainers if that's a concern.
25 |
26 | ## Report bugs using Github's [issues](https://github.com/voidful/tfkit/issues)
27 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/voidful/tfkit/issues/new); it's that easy!
28 |
29 | ## Write bug reports with detail, background, and sample code
30 | **Great Bug Reports** tend to have:
31 |
32 | - A quick summary and/or background
33 | - Steps to reproduce
34 | - Be specific!
35 | - Give sample code if you can.
36 | - What you expected would happen
37 | - What actually happens
38 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
39 |
40 | People *love* thorough bug reports. I'm not even kidding.
41 |
42 | ## License
43 | By contributing, you agree that your contributions will be licensed under its Apache 2.0 License.
44 |
45 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-devel
2 |
3 | ENV LANG=C.UTF-8
4 | WORKDIR /workspace/
5 | COPY ./ /workspace/
6 |
7 | # install basics
8 | RUN apt-get update -y
9 | RUN apt-get install -y git curl htop wget tmux
10 |
11 | # install python deps
12 | RUN pip install -r /workspace/requirements.txt
13 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright voidful
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 | ## What is it
29 | TFKit is a tool kit mainly for language generation.
30 | It leverages the use of transformers on many tasks with different models in this all-in-one framework.
31 | All you need is a little change of config.
32 |
33 | ## Supported Tasks
34 | With transformer models - BERT/ALBERT/T5/BART......
35 | | | |
36 | |-|-|
37 | | Text Generation | :memo: seq2seq language model |
38 | | Text Generation | :pen: causal language model |
39 | | Text Generation | :printer: once generation model / once generation model with ctc loss |
40 | | Text Generation | :pencil: onebyone generation model |
41 |
42 | # Getting Started
43 | Learn more from the [document](https://voidful.github.io/TFkit/).
44 |
45 | ## How To Use
46 |
47 | ### Step 0: Install
48 | Install the latest development version from GitHub
49 | ```bash
50 | pip install git+https://github.com/voidful/TFkit.git@refactor-dataset
51 | ```
52 |
53 | ### Step 1: Prepare dataset in csv format
54 | [Task format](https://voidful.tech/TFkit/tasks/)
55 | ```
56 | input, target
57 | ```
58 |
59 | ### Step 2: Train model
60 | ```bash
61 | tfkit-train \
62 | --task clas \
63 | --config xlm-roberta-base \
64 | --train training_data.csv \
65 | --test testing_data.csv \
66 | --lr 4e-5 \
67 | --maxlen 384 \
68 | --epoch 10 \
69 | --savedir roberta_sentiment_classificer
70 | ```
71 |
72 | ### Step 3: Evaluate
73 | ```bash
74 | tfkit-eval \
75 | --task roberta_sentiment_classificer/1.pt \
76 | --metric clas \
77 | --valid testing_data.csv
78 | ```
79 |
80 | ## Advanced features
81 |
82 | Multi-task training
83 |
84 | ```bash
85 | tfkit-train \
86 | --task clas clas \
87 | --config xlm-roberta-base \
88 | --train training_data_taskA.csv training_data_taskB.csv \
89 | --test testing_data_taskA.csv testing_data_taskB.csv \
90 | --lr 4e-5 \
91 | --maxlen 384 \
92 | --epoch 10 \
93 | --savedir roberta_sentiment_classificer_multi_task
94 | ```
95 |
96 |
97 | ## Not maintained tasks
98 | Due to time constraints, the following tasks are temporarily not supported
99 | | | |
100 | |-|-|
101 | | Classification | :label: multi-class and multi-label classification |
102 | | Question Answering | :page_with_curl: extractive qa |
103 | | Question Answering | :radio_button: multiple-choice qa |
104 | | Tagging | :eye_speech_bubble: sequence level tagging / sequence level with crf |
105 | | Self-supervise Learning | :diving_mask: mask language model |
106 |
107 | ## Supplement
108 | - [transformers models list](https://huggingface.co/models): you can find any pretrained models here
109 | - [nlprep](https://github.com/voidful/NLPrep): download and preprocessing data in one line
110 | - [nlp2go](https://github.com/voidful/nlp2go): create demo api as quickly as possible.
111 |
112 |
113 | ## Contributing
114 | Thanks for your interest. There are many ways to contribute to this project. Get started [here](https://github.com/voidful/tfkit/blob/master/CONTRIBUTING.md).
115 |
116 | ## License 
117 |
118 | * [License](https://github.com/voidful/tfkit/blob/master/LICENSE)
119 |
120 | ## Icons reference
121 | Icons modified from Freepik from www.flaticon.com
122 | Icons modified from Nikita Golubev from www.flaticon.com
123 |
--------------------------------------------------------------------------------
/demo_data/classification.csv:
--------------------------------------------------------------------------------
1 | We report two cases of pseudoporphyria caused by naproxen and oxaprozin.,Related///METHODS
2 | Calotropis procera (ushaar) keratitis.,Not-Related
3 | Fixed drug eruption is associated with many drugs but this is the first such report with omeprazole.,Related///CONCLUSION
--------------------------------------------------------------------------------
/demo_data/generation.csv:
--------------------------------------------------------------------------------
1 | "Dan's parents were overweight . Dan was overweight as well . The doctors told his parents it was unhealthy . His parents understood and decided to make a change .","They got themselves and Dan on a diet ."
2 | "Jane was working at a diner . Suddenly , a customer barged up to the counter . He began yelling about how long his food was taking . /// Jane didn't know how to react .","Luckily , her coworker intervened and calmed the man down ."
3 | Peter was a truck driver . He was running a little behind on schedule . Peter decided to run past the weigh station . He was stopped by a cop .,"Peter ended up running late and getting a fine ."
--------------------------------------------------------------------------------
/demo_data/mask.csv:
--------------------------------------------------------------------------------
1 | "i go to [MASK] by [MASK]","school bus"
2 | "how did i [MASK] [MASK]","get here"
--------------------------------------------------------------------------------
/demo_data/mcq.csv:
--------------------------------------------------------------------------------
1 | "I 'm sure many of you have seen Star Wars , Jurassic Park , Multiplicity , or many of the other movies that describe cloning . Most of what you see in these movies is false . What you do n't know is that cloning could be dangerous , to the clone and to our society as a whole . I think human cloning is wrong mainly for four reasons . What about identity ? Humans are promised the right to their own personalities . What would happen if we ignore those rights by giving them someone else 's genetic identity ? True , Cloning may prevent people from possessing their identities . Also , these is a large power struggle here . Cloning means a degree of power and controls over another person 's physical identity and that ignores their rights and their only personalities . The person doing the cloning would have more power than any parent would have . Cloning would also deal with killing embryos . You might not have known , but Dolly , the sheep that was cloned in 1996 , was one of over 200 sheep embryos and hers was the only embryo that survived . The rest died or were thrown away . Imagine if the failure rate was that high when we started to clone humans . cloning means running the risk of wasting too much effort Cloning someone , at this present time , would be extremely dangerous to the birth mother and the clone . In studies done on cows , 4 out of 12 birth mothers died . There is a very high failure rate , which is shown in the cloning of Dolly . Even if you had a few good embryos , failures have been noticeable in animal tests . So , should we work ahead in the world of cloning ? I say no . the risks are greater than the benefits . It 's dangerous to the clone and to the birth mother . We would be killing human lives in the process . It would also be a violation of the clone 's right to its own genetic identity and personality . According to the article , what is the author 's opinion about identity ? 
[MASK] People 's identity is completely determined by their genes . [MASK] Government has the rights to confirm people 's identities . [MASK] Cloning itself gives parents great power over identity . [MASK] Cloning may prevent people from possessing their identities .",3
2 | "I 'm sure many of you have seen Star Wars , Jurassic Park , Multiplicity , or many of the other movies that describe cloning . Most of what you see in these movies is false . What you do n't know is that cloning could be dangerous , to the clone and to our society as a whole . I think human cloning is wrong mainly for four reasons . What about identity ? Humans are promised the right to their own personalities . What would happen if we ignore those rights by giving them someone else 's genetic identity ? True , Cloning may prevent people from possessing their identities . Also , these is a large power struggle here . Cloning means a degree of power and controls over another person 's physical identity and that ignores their rights and their only personalities . The person doing the cloning would have more power than any parent would have . Cloning would also deal with killing embryos . You might not have known , but Dolly , the sheep that was cloned in 1996 , was one of over 200 sheep embryos and hers was the only embryo that survived . The rest died or were thrown away . Imagine if the failure rate was that high when we started to clone humans . cloning means running the risk of wasting too much effort Cloning someone , at this present time , would be extremely dangerous to the birth mother and the clone . In studies done on cows , 4 out of 12 birth mothers died . There is a very high failure rate , which is shown in the cloning of Dolly . Even if you had a few good embryos , failures have been noticeable in animal tests . So , should we work ahead in the world of cloning ? I say no . the risks are greater than the benefits . It 's dangerous to the clone and to the birth mother . We would be killing human lives in the process . It would also be a violation of the clone 's right to its own genetic identity and personality . According to Paragraph 4 , which is right ? 
[MASK] cloning means running the risk of wasting too much effort [MASK] numbers of baby animals are likely to be created by cloning [MASK] human cloning is much more difficult than animal cloning [MASK] there are 200 sheep successfully cloned .",0
3 | "I 'm sure many of you have seen Star Wars , Jurassic Park , Multiplicity , or many of the other movies that describe cloning . Most of what you see in these movies is false . What you do n't know is that cloning could be dangerous , to the clone and to our society as a whole . I think human cloning is wrong mainly for four reasons . What about identity ? Humans are promised the right to their own personalities . What would happen if we ignore those rights by giving them someone else 's genetic identity ? True , Cloning may prevent people from possessing their identities . Also , these is a large power struggle here . Cloning means a degree of power and controls over another person 's physical identity and that ignores their rights and their only personalities . The person doing the cloning would have more power than any parent would have . Cloning would also deal with killing embryos . You might not have known , but Dolly , the sheep that was cloned in 1996 , was one of over 200 sheep embryos and hers was the only embryo that survived . The rest died or were thrown away . Imagine if the failure rate was that high when we started to clone humans . cloning means running the risk of wasting too much effort Cloning someone , at this present time , would be extremely dangerous to the birth mother and the clone . In studies done on cows , 4 out of 12 birth mothers died . There is a very high failure rate , which is shown in the cloning of Dolly . Even if you had a few good embryos , failures have been noticeable in animal tests . So , should we work ahead in the world of cloning ? I say no . the risks are greater than the benefits . It 's dangerous to the clone and to the birth mother . We would be killing human lives in the process . It would also be a violation of the clone 's right to its own genetic identity and personality . What is the best title of the passage ? 
[MASK] What Is Human Cloning [MASK] How Does Human Cloning Happen [MASK] Human Cloning Is Wrong [MASK] Discussion On Human Cloning",2
--------------------------------------------------------------------------------
/demo_data/qa.csv:
--------------------------------------------------------------------------------
1 | "Beyoncé announced a hiatus from her music career in January 2010, heeding her mother's advice, ""to live life, to be inspired by things again"". During the break she and her father parted ways as business partners. Beyoncé's musical break lasted nine months and saw her visit multiple European cities, the Great Wall of China, the Egyptian pyramids, Australia, English music festivals and various museums and ballet performances. What did Beyoncé announce in January 2010?", 18,25
2 | "Beyoncé announced a hiatus from her music career in January 2010, heeding her mother's advice, ""to live life, to be inspired by things again"". During the break she and her father parted ways as business partners. Beyoncé's musical break lasted nine months and saw her visit multiple European cities, the Great Wall of China, the Egyptian pyramids, Australia, English music festivals and various museums and ballet performances. Who suggested the hiatus for Beyoncé?", 74,84
3 | "Beyoncé announced a hiatus from her music career in January 2010, heeding her mother's advice, ""to live life, to be inspired by things again"". During the break she and her father parted ways as business partners. Beyoncé's musical break lasted nine months and saw her visit multiple European cities, the Great Wall of China, the Egyptian pyramids, Australia, English music festivals and various museums and ballet performances. In what year did Beyonce have her hiatus?", 60,64
--------------------------------------------------------------------------------
/demo_data/tag.csv:
--------------------------------------------------------------------------------
1 | "在 歐 洲 , 梵 語 的 學 術 研 究 , 由 德 國 學 者 陸 特 和 漢 斯 雷 頓 開 創 。 後 來 威 廉 · 瓊 斯 發 現 印 歐 語 系 , 也 要 歸 功 於 對 梵 語 的 研 究 。 此 外 , 梵 語 研 究 , 也 對 西 方 文 字 學 及 歷 史 語 言 學 的 發 展 , 貢 獻 不 少 。 1 7 8 6 年 2 月 2 日 , 亞 洲 協 會 在 加 爾 各 答 舉 行 。 陸 特 和 漢 斯 雷 頓 開 創 了 哪 一 地 區 對 梵 語 的 學 術 研 究 ?",O A A O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
2 | "1 7 8 6 年 2 月 2 日 , 亞 洲 協 會 在 加 爾 各 答 舉 行 。 會 中 , 威 廉 · 瓊 斯 發 表 了 下 面 這 段 著 名 的 言 論 : 「 梵 語 儘 管 非 常 古 老 , 構 造 卻 精 妙 絕 倫 : 比 希 臘 語 還 完 美 , 比 拉 丁 語 還 豐 富 , 精 緻 之 處 同 時 勝 過 此 兩 者 , 但 在 動 詞 詞 根 和 語 法 形 式 上 , 又 跟 此 兩 者 無 比 相 似 , 不 可 能 是 巧 合 的 結 果 。 這 三 種 語 言 太 相 似 了 , 使 任 何 同 時 稽 考 三 者 的 語 文 學 家 都 不 得 不 相 信 三 者 同 出 一 源 , 出 自 一 種 可 能 已 經 消 逝 的 語 言 。 陸 特 和 漢 斯 雷 頓 開 創 了 哪 一 地 區 對 梵 語 的 學 術 研 究 ?",O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
3 | "這 三 種 語 言 太 相 似 了 , 使 任 何 同 時 稽 考 三 者 的 語 文 學 家 都 不 得 不 相 信 三 者 同 出 一 源 , 出 自 一 種 可 能 已 經 消 逝 的 語 言 。 基 於 相 似 的 原 因 , 儘 管 缺 少 同 樣 有 力 的 證 據 , 我 們 可 以 推 想 哥 德 語 和 凱 爾 特 語 , 雖 然 混 入 了 迥 然 不 同 的 語 彙 , 也 與 梵 語 有 著 相 同 的 起 源 ; 而 古 波 斯 語 可 能 也 是 這 一 語 系 的 子 裔 。 」 陸 特 和 漢 斯 雷 頓 開 創 了 哪 一 地 區 對 梵 語 的 學 術 研 究 ?",O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
4 | "在 歐 洲 , 梵 語 的 學 術 研 究 , 由 德 國 學 者 陸 特 和 漢 斯 雷 頓 開 創 。 後 來 威 廉 · 瓊 斯 發 現 印 歐 語 系 , 也 要 歸 功 於 對 梵 語 的 研 究 。 此 外 , 梵 語 研 究 , 也 對 西 方 文 字 學 及 歷 史 語 言 學 的 發 展 , 貢 獻 不 少 。 1 7 8 6 年 2 月 2 日 , 亞 洲 協 會 在 加 爾 各 答 舉 行 。 印 歐 語 系 因 為 哪 一 門 語 言 而 被 發 現 ?",O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O A A O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
5 | 實 驗 室,LOA LOB LOC
6 | 溫 者 必 良 , 自 古 而 然 。,O O O O O O O O O O
7 | 狼 煙 逝 去 , 幽 夢 醒 來 。,B_Thing I_Thing O O O O O O O O
--------------------------------------------------------------------------------
/demo_data/tok_list.txt:
--------------------------------------------------------------------------------
1 | 闕
2 | :mbk1:
3 | >gg<
--------------------------------------------------------------------------------
/demo_data/unk_tok.csv:
--------------------------------------------------------------------------------
1 | 紫府東風放夜時。步蓮穠李伴人歸,五更鐘動笙歌散,十里月明燈火稀。
2 | 香苒苒,夢依依。天涯寒盡減春衣,鳳凰城闕知何處,寥落星河一雁飛。
--------------------------------------------------------------------------------
/docs/benchmark.md:
--------------------------------------------------------------------------------
1 | ## DRCD
2 | ### Test
3 | | model | EM | F1 |
4 | | :----:|:----: |:----: |
5 | | albert-small | 74.45% | 86.08% |
6 | | electra-small | 76.64% | 87.49% |
7 | | albert-base | 80.17% | 89.87% |
8 |
9 | ### Dev
10 | | model | EM | F1 |
11 | | :----:|:----: |:----: |
12 | | albert-small | 73.70% | 85.33% |
13 | | electra-small | 77.61% | 87.33% |
14 | | albert-base | 80.52% | 89.92% |
15 |
--------------------------------------------------------------------------------
/docs/img/flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voidful/TFkit/5942b86e9132703ae4f328ba3d199c322b8cd1e4/docs/img/flow.png
--------------------------------------------------------------------------------
/docs/img/tfkit-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voidful/TFkit/5942b86e9132703ae4f328ba3d199c322b8cd1e4/docs/img/tfkit-icon.png
--------------------------------------------------------------------------------
/docs/img/tfkit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voidful/TFkit/5942b86e9132703ae4f328ba3d199c322b8cd1e4/docs/img/tfkit.png
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | ## Getting started
23 |
24 | ### Installing via pip
25 | ```bash
26 | pip install tfkit
27 | ```
28 |
29 | * You can use tfkit for model training and evaluation with `tfkit-train` and `tfkit-eval`.
30 |
31 | ### Running TFKit on the task you wanted
32 |
33 | ### First step - prepare your dataset
34 | The key to combining different tasks together is to use the same data format for every task.
35 |
36 | **notice**
37 |
38 | * All data will be in csv format - tfkit will use **csv** for all task, normally it will have two columns, first columns is the input of models, the second column is the output of models.
39 | * Plain text with no tokenization - there is no need to tokenize the text or re-calculate tokenization before training, tfkit will handle it for you.
40 | * No header is needed.
41 |
42 | For example, a sentiment classification dataset will be like:
43 | ```csv
44 | how dare you,negative
45 | ```
46 |
47 | !!! hint
48 |     For the details and example format of each task, you can check [here](tasks/)
49 |
50 | !!! hint
51 |     nlprep is a tool for data split/preprocessing/augmentation, it can help you to create ready to train data for tfkit, check [here](https://github.com/voidful/NLPrep)
52 |
53 | ### Second step - model training
54 |
55 | Use `tfkit-train` for model training.
56 |
57 | Before training a model, there is something you need to clarify:
58 |
59 | - `--model` what is your model to handle this task? check [here](models/) to the detail of models.
60 | - `--config` what pretrained model you want to use? you can go [https://huggingface.co/models](https://huggingface.co/models) to search for available pretrained models.
61 | - `--train` and `--test` training and testing dataset path, which is in csv format.
62 | - `--savedir` model saving directory, default will be in '/checkpoints' folder
63 |
64 | you can leave the rest to the default config, or use `tfkit-train -h` to see more configuration options.
65 |
66 | An example about training a sentiment classifier:
67 | ```bash
68 | tfkit-train \
69 | --task clas \
70 | --config xlm-roberta-base \
71 | --train training_data.csv \
72 | --test testing_data.csv \
73 | --lr 4e-5 \
74 | --maxlen 384 \
75 | --epoch 10 \
76 | --savedir roberta_sentiment_classificer
77 | ```
78 |
79 | #### Third step - model eval
80 |
81 | Using `tfkit-eval` for model evaluation.
82 | - `--model` saved model's path.
83 | - `--metric` the evaluation metric eg: emf1, nlg(BLEU/ROUGE), clas(confusion matrix).
84 | - `--valid` validation data, also in csv format.
85 | - `--panel` a input panel for model specific parameter.
86 |
87 | for more configuration detail, you may use `tfkit-eval -h`.
88 |
89 | After evaluation, it will print the evaluation result in your console, and also generate three reports for debugging.
90 | - `*_score.csv` overall score, it is the copy of the console result.
91 | - `*each_data_score.csv` score on each data, 3 column `predicted,targets,score`, ranked from the lowest to the highest.
92 | - `*predicted.csv` csv file include 3 column `input,predicted,targets`.
93 |
94 | !!! hint
95 | nlp2go is a tool for demonstration, with CLI and Restful interface. check [here](https://github.com/voidful/nlp2go)
96 |
97 | ### Example
98 | #### Use distilbert to train NER Model
99 | ```bash
100 | nlprep --dataset tag_clner --outdir ./clner_row --util s2t
101 | tfkit-train --batch 10 --epoch 3 --lr 5e-6 --train ./clner_row/train --test ./clner_row/test --maxlen 512 --task tag --config distilbert-base-multilingual-cased
102 | nlp2go --task ./checkpoints/3.pt --cli
103 | ```
104 |
105 | #### Use Albert to train DRCD Model Model
106 | ```bash
107 | nlprep --dataset qa_zh --outdir ./zhqa/
108 | tfkit-train --maxlen 512 --savedir ./drcd_qa_model/ --train ./zhqa/drcd-train --test ./zhqa/drcd-test --task qa --config voidful/albert_chinese_small --cache
109 | nlp2go --task ./drcd_qa_model/3.pt --cli
110 | ```
111 |
112 | #### Use Albert to train both DRCD Model and NER Model
113 | ```bash
114 | nlprep --dataset tag_clner --outdir ./clner_row --util s2t
115 | nlprep --dataset qa_zh --outdir ./zhqa/
116 | tfkit-train --maxlen 300 --savedir ./mt-qaner --train ./clner_row/train ./zhqa/drcd-train --test ./clner_row/test ./zhqa/drcd-test --task tag qa --config voidful/albert_chinese_small
117 | nlp2go --task ./mt-qaner/3.pt --cli
118 | ```
119 |
120 | **You can also try tfkit in Google Colab: [](https://colab.research.google.com/drive/1hqaTKxd3VtX2XkvjiO0FMtY-rTZX30MJ?usp=sharing)**
121 |
122 | ## Contributing
123 | Thanks for your interest. There are many ways to contribute to this project. Get started [here](https://github.com/voidful/tfkit/blob/master/CONTRIBUTING.md).
124 |
125 | ## License
126 | 
127 |
128 | * [License](https://github.com/voidful/tfkit/blob/master/LICENSE)
129 |
130 | ## Icons reference
131 | Icons modify from Freepik from www.flaticon.com
132 | Icons modify from Nikita Golubev from www.flaticon.com
133 |
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 | tfkit is tested on Python 3.6+, and PyTorch 1.1.0+.
3 |
4 | ### Installing via pip
5 | ```bash
6 | pip install tfkit
7 | ```
8 | ### Installing via source
9 | ```bash
10 | git clone https://github.com/voidful/tfkit.git
11 | python setup.py install
12 | # or
13 | pip install .
14 | ```
15 |
16 | ## Running tfkit
17 | Once you've installed tfkit, you can run it with
18 |
19 | ### pip installed version:
20 | `tfkit-train`
21 | `tfkit-eval`
22 | `tfkit-dump`
23 |
24 | ### local version:
25 | `python -m tfkit.train`
26 | `python -m tfkit.eval`
27 | `python -m tfkit.dump`
--------------------------------------------------------------------------------
/docs/models.md:
--------------------------------------------------------------------------------
1 | ## Models Overview
2 |
3 | | task | available models |
4 | | ----------- | ------------------------------------ |
5 | | text generation | `seq2seq` `clm` `onebyone` `once` `oncectc` |
6 | | extractive question answering | `qa` |
7 | | multiple choice question answering | `mcq` |
8 | | sequence tagging | `tag` `tagcrf` |
9 | | sentence classification | `clas` |
10 | | mask language model | `mask` |
11 |
12 | ## Text Generation
13 | ### `seq2seq`
14 | [comment]: <> (::: tfkit.model.seq2seq.model.Model.forward)
15 | [comment]: <> (::: tfkit.model.seq2seq.dataloader)
16 | encoder decoder models for text generation, eg: T5/BART
17 |
18 | ### `clm`
19 | causal language model, decoder only models for text generation, eg: GPT
20 |
21 | ### `onebyone`
22 | onebyone text generation, for mask lm generation.
23 |
24 | ### `once`
25 | once text generation
26 |
27 | ### `oncectc`
28 | once text generation with ctc loss
29 |
30 | ## Extractive Question Answering
31 | ### `qa`
32 | SQuAD like question answer
33 |
34 | ## Multiple Choice Question Answering
35 | ### `mcq`
36 | softmax from mask token in input
37 |
38 | ## Sequence Tagging
39 | ### `tag`
40 | token classification
41 |
42 | ### `tagcrf`
43 | token classification with crf layer
44 |
45 | ## Sentence Classification
46 | ### `clas`
47 | sentence classification using pooling head from transformer models.
48 |
49 | ## Mask Language Model
50 | ### `mask`
51 | mask token prediction, for self-supervised learning
--------------------------------------------------------------------------------
/docs/structure.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | Flow
3 | 
4 |
5 | Project directory:
6 | ```
7 | .
8 | ├─ demo_data/ # Example data for training and evaluation
9 | ├─ docs/ # Documents
10 | ├─ tfkit/
11 | │ ├─ model/ # all of the models, subdir name will be model name
12 | │ │ ├─ model_name # - name will be dynamic import to tfkit-train
13 | │ │ │ ├─ __init__.py
14 | │ │ │ ├─ dataloader.py # - for data loading and preprocessing
15 | │ │ │ └─ model.py # - model forward and prediction
16 | │ │ └─ __init__.py
17 | │ ├─ test/ # project unit test
18 | │ │ ├─ __init__.py
19 | │ │ ├─ test_atrain.py # - test tfkit-train
20 | │ │ ├─ test_dataloader.py # - test all model/*/dataloader.py
21 | │ │ ├─ test_model.py # - test all model/*/model.py
22 | │ │ ├─ test_package.py # - test package import
23 | │ │ ├─ test_utility_dataset.py # - test utility/dataset.py
24 | │ │ ├─ test_utility_eval_metric.py # - test utility/eval_metric.py
25 | │ │ ├─ test_utility_logger.py # - test utility/logger.py
26 | │ │ ├─ test_utility_loss.py # - test utility/loss.py
27 | │ │ ├─ test_utility_model_loader.py # - test utility/model_loader.py
28 | │ │ ├─ test_utility_tok.py # - test utility/predictor.py
29 | │ │ ├─ test_zeval.py # - test tfkit-eval
30 | │ │ └─ test_zzdump.py # - test tfkit-dump
31 | │ ├─ utility/ # project utility
32 | │ │ ├─ __init__.py
33 | │ │ ├─ dataset.py # - handle dataset loading
34 | │ │ ├─ eval_metric.py # - handle evaluation metric calculation
35 | │ │ ├─ logger.py # - handle logging and printing
36 | │ │ ├─ loss.py # - custom loss function
37 | │ │ ├─ model_loader.py # - handle model loading
38 | │ │ ├─ predictor.py # - handle model prediction
39 | │ │ └─ tok.py # - handle tokenization
40 | │ ├─ __init__.py # package init
41 | │ ├─ dump.py # tfkit-dump handler
42 | │ ├─ eval.py # tfkit-eval handler
43 | │ └─ train.py # tfkit-train handler
44 | ├─ Dockerfile # recommend docker file
45 | ├─ mkdocs.yml # document config
46 | ├─ README.md # project readme
47 | ├─ requirements.txt # package requirement
48 | └─ setup.py # package setup
49 | ```
--------------------------------------------------------------------------------
/docs/tasks.md:
--------------------------------------------------------------------------------
1 | ## Task format
2 |
3 | ### Classification
4 |
5 | !!! info
6 | #### multi-class classification:
7 | Format:
8 | `input sentence,label`
9 |
10 | Example:
11 | ```
12 | Calotropis procera (ushaar) keratitis.,Not-Related
13 | ```
14 |
15 | #### multi-label classification
16 | use `///` to separate each label.
17 |
18 | Format:
19 | `input sentence,label1///label2`
20 |
21 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/classification.csv):
22 | ```
23 | We report two cases of pseudoporphyria caused by naproxen and oxaprozin.,Related///METHODS
24 | ```
25 |
26 | ### Text Generation
27 |
28 | !!! info
29 | Format:
30 | `input sentence, target sentence`
31 |
32 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/generation.csv):
33 | ```
34 | Peter was a truck driver . He was running a little behind on schedule . Peter decided to run past the weigh station . He was stopped by a cop .,"Peter ended up running late and getting a fine ."
35 | ```
36 |
37 | ### Extractive Question Answering
38 |
39 | !!! info
40 | Format:
41 | `input sentence with question, answer start position, answer end position`
42 |
43 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/qa.csv):
44 | ```
45 | Beyoncé announced a hiatus from her music ... Who suggested the hiatus for Beyoncé?, 74,84
46 | ```
47 |
48 | ### Multiple-Choice Question Answering
49 |
50 | !!! info
51 |     Input passage should include all available choices; each choice must start with a mask token.
52 | choice id will be start from 0
53 |
54 | Format:
55 | `input passage [MASK]choiceA [MASK]choiceB, 1`
56 |
57 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/mcq.csv):
58 | ```
59 | "I 'm sure many of you have seen Star Wars ... What is the best title of the passage ? [MASK] What Is Human Cloning [MASK] How Does Human Cloning Happen [MASK] Human Cloning Is Wrong [MASK] Discussion On Human Cloning",2
60 | ```
61 |
62 | ### Mask Language Modeling
63 |
64 | !!! info
65 | input sentence with mask, can be multiple
66 | target of each mask should be separate by blank
67 | Format:
68 | `input sentence with [MASK] [MASK],target_token target_token`
69 |
70 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/mask.csv):
71 | ```
72 | "how did i [MASK] [MASK]","get here"
73 | ```
74 |
75 | ### Sequence Tagging
76 |
77 | !!! info
78 | input sentence with blank between each word
79 | target label separate with blank, should be one to one to the input
80 | Format:
81 | `input sentence,tag tag`
82 |
83 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/tag.csv):
84 | ```
85 | "welcome to New York,O O B_place B_place"
86 | ```
87 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | # Project information
2 | site_name: tfkit
3 | site_description: 🤖📇 Transformers kit - End2End toolkit for NLP task
4 | site_author: Voidful
5 | site_url: https://github.com/voidful/tfkit
6 | repo_name: tfkit
7 | repo_url: https://github.com/voidful/tfkit
8 | copyright: Copyright © Voidful
9 |
10 | nav:
11 | - Home: index.md
12 | - Installation: installation.md
13 | - Tasks: tasks.md
14 | - Models: models.md
15 | - Structure: structure.md
16 | - Benchmark: benchmark.md
17 |
18 | plugins:
19 | - search
20 | - mkdocstrings:
21 | default_handler: python
22 | handlers:
23 | python:
24 | setup_commands:
25 | - import sys
26 | - sys.path.append("docs")
27 | rendering:
28 | show_root_heading: True
29 | heading_level: 3
30 | show_source: false
31 | watch:
32 | - tfkit
33 |
34 | theme:
35 | name: material
36 | language: en
37 | palette:
38 | primary: blue grey
39 | accent: blue grey
40 | font:
41 | text: Roboto
42 | code: Roboto Mono
43 | logo: img/tfkit-icon.png
44 | favicon: img/tfkit-icon.png
45 |
46 | # Extras
47 | extra:
48 | social:
49 | - icon: fontawesome/brands/github-alt
50 | link: https://github.com/voidful/tfkit
51 | - icon: fontawesome/brands/twitter
52 | link: https://twitter.com/voidful_stack
53 | - icon: fontawesome/brands/linkedin
54 | link: https://www.linkedin.com/in/voidful/
55 | version:
56 | provider: mike
57 |
58 | # Google Analytics
59 | google_analytics:
60 | - UA-127062540-5
61 | - auto
62 |
63 | # Extensions
64 | markdown_extensions:
65 | - markdown.extensions.admonition
66 | - markdown.extensions.attr_list
67 | - markdown.extensions.codehilite:
68 | guess_lang: false
69 | - markdown.extensions.def_list
70 | - markdown.extensions.footnotes
71 | - markdown.extensions.meta
72 | - markdown.extensions.toc:
73 | permalink: true
74 | - pymdownx.arithmatex
75 | - pymdownx.betterem:
76 | smart_enable: all
77 | - pymdownx.caret
78 | - pymdownx.critic
79 | - pymdownx.details
80 | - pymdownx.emoji:
81 | emoji_index: !!python/name:materialx.emoji.twemoji
82 | emoji_generator: !!python/name:materialx.emoji.to_svg
83 | # - pymdownx.highlight:
84 | # linenums_style: pymdownx-inline
85 | - pymdownx.inlinehilite
86 | - pymdownx.keys
87 | - pymdownx.magiclink:
88 | repo_url_shorthand: true
89 | user: squidfunk
90 | repo: mkdocs-material
91 | - pymdownx.mark
92 | - pymdownx.smartsymbols
93 | - pymdownx.snippets:
94 | check_paths: true
95 | - pymdownx.superfences
96 | - pymdownx.tabbed
97 | - pymdownx.tasklist:
98 | custom_checkbox: true
99 | - pymdownx.tilde
100 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers>=3.3.0
2 | tensorboard
3 | tensorboardX
4 | torch
5 | matplotlib
6 | nlp2>=1.8.44
7 | tqdm>=4.45.0
8 | inquirer
9 | numpy
10 | scipy>=1.10.1
11 | pytorch-crf
12 | sentencepiece
13 | pandas
14 | accelerate>=0.5.1
15 | joblib
16 | scikit-learn
17 | editdistance
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

# requirements.txt is the single source of truth for runtime dependencies;
# it is read here so pip installs pull in the same package list.
with open('requirements.txt') as f:
    required = f.read().splitlines()

# Package metadata and entry points for the tfkit distribution.
# The console_scripts expose the three CLI commands (tfkit-train /
# tfkit-eval / tfkit-dump) that the docs refer to.
setup(
    name='tfkit',
    version='0.8.20',
    description='Transformers kit - Multi-task QA/Tagging/Multi-label Multi-Class Classification/Generation with BERT/ALBERT/T5/BERT',
    url='https://github.com/voidful/TFkit',
    author='Voidful',
    author_email='voidful.stack@gmail.com',
    long_description=open("README.md", encoding="utf8").read(),
    long_description_content_type="text/markdown",
    setup_requires=['setuptools-git'],
    classifiers=[
        'Development Status :: 4 - Beta',
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "License :: OSI Approved :: Apache Software License",
        'Programming Language :: Python :: 3.6'
    ],
    license="Apache",
    keywords='transformer huggingface nlp multi-task multi-class multi-label classification generation tagging deep learning machine reading',
    packages=find_packages(),
    install_requires=required,
    entry_points={
        'console_scripts': ['tfkit-train=tfkit.train:main', 'tfkit-eval=tfkit.eval:main', 'tfkit-dump=tfkit.dump:main']
    },
    py_modules=['tfkit'],
    python_requires=">=3.5.0",
    zip_safe=False,
)
34 |
--------------------------------------------------------------------------------
/tfkit/__init__.py:
--------------------------------------------------------------------------------
1 | import tfkit.utility
2 | import tfkit.dump
3 | import tfkit.train
4 | import tfkit.eval
5 | from tfkit.task import *
--------------------------------------------------------------------------------
/tfkit/dump.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 |
4 | from transformers import AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, \
5 | AutoModelForCausalLM
6 |
7 | from tfkit.utility.model import load_trained_model, add_tokens_to_pretrain
8 |
9 |
def parse_dump_args(args):
    """Parse the command-line arguments for ``tfkit-dump``.

    Args:
        args: Raw argument strings, e.g. ``sys.argv[1:]``.

    Returns:
        dict with keys ``model`` (path of the trained ``.pt`` model) and
        ``dumpdir`` (directory to write the dumped model into).
    """
    arg_parser = argparse.ArgumentParser()
    for flag in ("--model", "--dumpdir"):
        arg_parser.add_argument(flag, required=True, type=str)
    return vars(arg_parser.parse_args(args))
15 |
16 |
def main(arg=None):
    """Export a tfkit-trained model into a plain HuggingFace checkpoint.

    Loads the trained tfkit model, transplants its weights into the matching
    ``transformers`` Auto* model class (depending on the task type), and saves
    model + tokenizer with ``save_pretrained`` into ``--dumpdir``.

    Args:
        arg: optional list of raw CLI argument strings; when ``None`` the
            arguments are read from ``sys.argv[1:]``.
    """
    arg = parse_dump_args(sys.argv[1:]) if arg is None else parse_dump_args(arg)
    model, model_type, model_class, model_info, model_preprocessor = load_trained_model(arg.get('model'))
    tokenizer = model.tokenizer
    # NOTE(review): pretrained_config is never used below — looks like dead code.
    pretrained_config = model_info.get("model_config")
    if model_type == 'clm':
        # Causal LM: rebuild the HF model from the original config, then swap
        # in the fine-tuned transformer body and LM head weights.
        hf_model = AutoModelForCausalLM.from_pretrained(model_info.get("model_config"))
        hf_model.eval()
        hf_model.transformer = model.pretrained
        # Different architectures name the output head differently
        # (GPT-style 'lm_head' vs BERT-style 'cls').
        if hasattr(hf_model, 'lm_head'):
            hf_model.lm_head.weight = model.model.weight
        else:
            hf_model.cls.weight = model.model.weight
        # Head weights were replaced independently of the embeddings, so
        # weight tying must be disabled or save/load would re-tie them.
        hf_model.config.tie_word_embeddings = False
        hf_model, tokenizer = add_tokens_to_pretrain(hf_model, tokenizer, model_info.get('add_tokens', []))
        hf_model.save_pretrained(arg.get('dumpdir'))
    elif model_type == 'seq2seq':
        # Encoder-decoder: transplant the fine-tuned body and LM head.
        hf_model = AutoModelForSeq2SeqLM.from_pretrained(model_info.get("model_config"))
        hf_model.eval()
        hf_model.model = model.pretrained
        hf_model.lm_head = model.model
        hf_model.config.tie_word_embeddings = False
        hf_model.config.tie_encoder_decoder = False
        hf_model, tokenizer = add_tokens_to_pretrain(hf_model, tokenizer, model_info.get('add_tokens', []))
        hf_model.save_pretrained(arg.get('dumpdir'))
    elif model_type == 'clas':
        # Classification: only the classifier head weights need copying.
        # NOTE(review): only classifier_list[0] is exported — multi-task
        # classification heads beyond the first are dropped; confirm intended.
        hf_model = AutoModelForSequenceClassification.from_pretrained(model_info.get("model_config"))
        hf_model.classifier.weight = model.classifier_list[0].weight
        hf_model.save_pretrained(arg.get('dumpdir'))
    else:
        # Other task types: dump the backbone only, without a task head.
        model.pretrained.save_pretrained(arg.get('dumpdir'))

    tokenizer.save_pretrained(arg.get('dumpdir'))
    print('==================')
    print("Finish model dump.")
53 |
# CLI entry point for `python -m tfkit.dump`.
if __name__ == "__main__":
    main()
56 |
--------------------------------------------------------------------------------
/tfkit/eval.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import csv
3 | import logging
4 | import sys
5 | import time
6 | from datetime import timedelta
7 |
8 | import nlp2
9 | import torch
10 | from tqdm.auto import tqdm
11 |
12 | from tfkit.utility.eval_metric import EvalMetric
13 | from tfkit.utility.model import load_trained_model, load_predict_parameter
14 |
15 | transformers_logger = logging.getLogger('transformers')
16 | transformers_logger.setLevel(logging.CRITICAL)
17 |
18 |
def parse_eval_args(args):
    """Split raw CLI arguments into evaluation options and model options.

    Known flags are parsed into ``input_arg`` (options left unset are
    dropped); every remaining ``--key value`` pair is collected into
    ``model_arg`` and later forwarded to the model's predict parameters.

    Args:
        args: Raw argument strings, e.g. ``sys.argv[1:]``.

    Returns:
        Tuple ``(input_arg, model_arg)`` of two dicts.
    """
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--model", nargs='+', type=str, help="evaluation model")
    parser.add_argument("--config", type=str, help='pre-trained task path after add token')
    parser.add_argument("--metric", required=True, type=str, choices=['emf1', 'nlg', 'clas', 'er'],
                        help="evaluate metric")
    parser.add_argument("--valid", required=True, type=str, nargs='+', help="evaluate data path")
    parser.add_argument("--tag", type=str, help="evaluate task tag for select multi-task task")
    parser.add_argument("--print", action='store_true', help="print each pair of evaluate data")
    parser.add_argument("--panel", action='store_true', help="enable panel to input argument")

    known, leftover = parser.parse_known_args(args)
    input_arg = {}
    for key, value in vars(known).items():
        if value is not None:
            input_arg[key] = value
    # Leftover arguments are interpreted as alternating flag/value pairs.
    flags = leftover[:-1:2]
    values = leftover[1::2]
    model_arg = dict((flag.replace("--", ""), value) for flag, value in zip(flags, values))
    return input_arg, model_arg
35 |
36 |
def main(arg=None):
    """Evaluate one or more trained tfkit models against a validation file.

    For every model it runs prediction over each validation record, routes
    the raw prediction/target through task-type-specific post-processing,
    accumulates them in ``EvalMetric``, and finally writes three report
    files per metric pass: ``*_predicted.csv``, ``*_each_data_score.csv``
    and ``*_score.csv``.

    Args:
        arg: optional list of raw CLI argument strings; when ``None`` the
            arguments are read from ``sys.argv[1:]``.
    """
    with torch.no_grad():
        eval_arg, model_arg = parse_eval_args(sys.argv[1:]) if arg is None else parse_eval_args(arg)
        models_path = eval_arg.get('model', [])

        # A directory argument means "evaluate every .pt checkpoint inside".
        if nlp2.is_dir_exist(models_path[0]):
            models = [f for f in nlp2.get_files_from_dir(models_path[0]) if f.endswith('.pt')]
        else:
            models = models_path

        for model_path in models:
            start_time = time.time()
            # NOTE(review): only the first --valid path is evaluated even
            # though the flag accepts nargs='+' — confirm this is intended.
            valid = eval_arg.get('valid')[0]
            model, model_type, model_class, model_info, preprocessor = load_trained_model(model_path,
                                                                                          pretrained_config=eval_arg.get(
                                                                                              'config'),
                                                                                          tag=eval_arg.get('tag'))
            predict_parameter = load_predict_parameter(model, model_arg, eval_arg.get('panel'))

            # One EvalMetric per decoded candidate so each decode position
            # gets its own score report.
            eval_metrics = [EvalMetric(model.tokenizer)
                            for _ in range(int(predict_parameter.get('decodenum', 1)))]

            print("PREDICT PARAMETER")
            print("=======================")
            print(predict_parameter)
            print("=======================")

            get_data_item = preprocessor.read_file_to_data(valid)
            for chunk in tqdm(get_data_item):
                for i in chunk:
                    # NOTE(review): `input` shadows the builtin of the same name.
                    input = i['input']
                    target = i['target']
                    predict_parameter.update({'input': input})
                    result, result_dict = model.predict(**predict_parameter)
                    for eval_pos, eval_metric in enumerate(eval_metrics):
                        # predicted can be list of string or string
                        # target should be list of string
                        predicted = result
                        processed_target = target
                        if 'qa' in model_type:
                            # QA targets are (start, end) token positions;
                            # recover the answer span text from the input.
                            processed_target = " ".join(input.split(" ")[int(target[0]): int(target[1])])
                            if len(result) > 0:
                                predicted = result[0][0] if isinstance(result[0], list) else result[0]
                            else:
                                predicted = ''
                        elif 'onebyone' in model_type or 'seq2seq' in model_type or 'clm' in model_type:
                            processed_target = target
                            # NOTE(review): `<` never fires for the failing
                            # case len(result) == eval_pos; `<=` looks intended,
                            # and result[eval_pos] below would still IndexError.
                            if len(result) < eval_pos:
                                print("Decode size smaller than decode num:", result_dict['label_map'])
                            predicted = result[eval_pos]
                        elif 'once' in model_type:
                            processed_target = target
                            predicted = result[eval_pos]
                        elif 'mask' in model_type:
                            # Mask LM: one target token per mask, blank-separated.
                            processed_target = target.split(" ")
                            predicted = result
                        elif 'tag' in model_type:
                            # Tagging: flatten the per-token label maps into
                            # a blank-separated label sequence.
                            predicted = " ".join([list(d.values())[0] for d in result_dict[0]['label_map']])
                            processed_target = target[0].split(" ")
                            predicted = predicted.split(" ")

                        if eval_arg.get('print'):
                            print('===eval===')
                            print("input: ", input)
                            print("target: ", processed_target)
                            print("predicted: ", predicted)
                            print('==========')

                        eval_metric.add_record(input, predicted, processed_target, eval_arg.get('metric'))

            for eval_pos, eval_metric in enumerate(eval_metrics):
                # Build an output-file suffix encoding dataset and decode options.
                argtype = f"_dataset{valid.replace('/', '_').replace('.', '_')}"
                if 'decodenum' in predict_parameter and int(predict_parameter['decodenum']) > 1:
                    argtype += f"_num_{eval_pos}"
                if 'mode' in predict_parameter:
                    para_mode = predict_parameter['mode'][0] if isinstance(predict_parameter['mode'], list) else \
                        predict_parameter['mode'].lower()
                    argtype += f"_mode_{para_mode}"
                if 'filtersim' in predict_parameter:
                    argtype += f"_filtersim_{predict_parameter['filtersim']}"
                outfile_name = f"{model_path}{argtype}"

                # Report 1: raw input/predicted/target triples.
                with open(f"{outfile_name}_predicted.csv", "w", encoding='utf8') as f:
                    writer = csv.writer(f)
                    records = eval_metric.get_record(eval_arg.get('metric'))
                    writer.writerow(['input', 'predicted', 'targets'])
                    for i, p, t in zip(records['ori_input'], records['ori_predicted'], records['ori_target']):
                        writer.writerow([i, p, t])
                    print("write result at:", outfile_name)

                # Report 2: per-example scores; Report 3: overall score summary.
                with open(f"{outfile_name}_each_data_score.csv", "w", encoding='utf8') as edsf:
                    eds = csv.writer(edsf)
                    with open(f"{outfile_name}_score.csv", "w", encoding='utf8') as f:
                        for i in eval_metric.cal_score(eval_arg.get('metric')):
                            f.write(f"TASK: {i[0]} , {eval_pos}\n")
                            f.write(f"{i[1]}\n")
                            eds.writerows(i[2])

                    print("write score at:", outfile_name)

                # Echo the overall scores to the console as well.
                for i in eval_metric.cal_score(eval_arg.get('metric')):
                    print("TASK: ", i[0], eval_pos)
                    print(i[1])

            print(f"=== Execution time: {timedelta(seconds=(time.time() - start_time))} ===")
142 |
143 |
# Script entry point: run the evaluation CLI when invoked directly.
if __name__ == '__main__':
    main()
146 |
--------------------------------------------------------------------------------
/tfkit/task/__init__.py:
--------------------------------------------------------------------------------
"""Task package: auto-discover the task sub-packages (clas, clm, once, ...)."""
import os
import pkgutil

# iter_modules yields (finder, name, ispkg) tuples; expose each sub-module
# name so `from tfkit.task import *` picks up every task in this directory.
__all__ = [module_name for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(__file__)])]
4 |
--------------------------------------------------------------------------------
/tfkit/task/clas/__init__.py:
--------------------------------------------------------------------------------
1 | from .preprocessor import Preprocessor
2 | from .model import Model
3 |
--------------------------------------------------------------------------------
/tfkit/task/clas/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import torch
5 | from torch import nn
6 |
7 | from tfkit.utility.predictor import ClassificationPredictor
8 |
9 | dir_path = os.path.dirname(os.path.realpath(__file__))
10 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
11 |
12 | from torch import softmax, sigmoid
13 | from tfkit.task.clas import Preprocessor
14 | from tfkit.utility.loss import FocalLoss, BCEFocalLoss
15 |
16 |
class Model(nn.Module):
    """Multi-task sequence-classification head over a pretrained encoder.

    One linear classifier per task is kept in ``classifier_list``; ``tasks``
    maps task name -> index into that list.  Tasks whose name contains
    ``multi_label`` use sigmoid + BCE focal loss; all others use softmax +
    focal loss.
    """

    def __init__(self, tokenizer, pretrained, tasks_detail, maxlen=512, dropout=0.1, **kwargs):
        super().__init__()
        self.tokenizer = tokenizer
        self.pretrained = pretrained

        self.dropout = nn.Dropout(dropout)
        self.loss_fct = FocalLoss()
        self.loss_fct_mt = BCEFocalLoss()

        self.tasks = dict()
        self.tasks_detail = tasks_detail
        self.classifier_list = nn.ModuleList()
        # one classification head per task, sized to that task's label set
        for task, labels in tasks_detail.items():
            self.classifier_list.append(nn.Linear(self.pretrained.config.hidden_size, len(labels)))
            self.tasks[task] = len(self.classifier_list) - 1
        self.maxlen = maxlen

        predictor = ClassificationPredictor(self, Preprocessor)
        self.predictor = predictor
        self.predict = predictor.predict

    def get_all_task(self):
        """
        list all classification task
        :return: tasks list
        """
        return list(self.tasks.keys())

    def mean_pooling(self, model_output, attention_mask):
        """
        Mean Pooling - Take attention mask into account for correct averaging
        from https://github.com/UKPLab/sentence-transformers
        modify - mask from -1 to 0
        :param model_output: token-level hidden states (batch, seq, hidden)
        :param attention_mask: mask over tokens; negative entries are zeroed
        :return: mask-weighted mean over the sequence dimension
        """
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
        input_mask_expanded[input_mask_expanded < 0] = 0
        sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
        # clamp avoids division by zero for fully-masked rows
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def forward(self, batch_data, eval=False, **kwargs):
        """Classify each example with its task's head.

        Training (eval=False) returns the summed loss over the batch; eval
        returns a dict with per-example 'label_prob' and 'max_item'.
        """
        # convert input to correct data type; task names arrive utf-8 encoded
        tasks = batch_data['task']
        tasks = [bytes(t).decode(encoding="utf-8", errors="ignore") for t in tasks]
        inputs = torch.as_tensor(batch_data['input'])
        targets = torch.as_tensor(batch_data['target'])
        masks = torch.as_tensor(batch_data['mask'])
        # define model output
        # NOTE(review): 'prob_list' is declared but never filled here — confirm
        # whether downstream consumers expect it.
        result_dict = {
            'max_item': [],
            'prob_list': [],
            'label_prob': []
        }

        result_logits = []
        result_labels = []
        for p, (task, input_ids, mask) in enumerate(zip(tasks, inputs, masks)):
            task_id = self.tasks[task]
            task_labels = self.tasks_detail[task]
            output = self.pretrained(input_ids.unsqueeze(0), mask.unsqueeze(0))[0]
            pooled_output = self.dropout(self.mean_pooling(output, mask.unsqueeze(0)))
            classifier_output = self.classifier_list[task_id](pooled_output)
            reshaped_logit = classifier_output.view(-1, len(task_labels))  # 0 for cls position
            result_logits.append(reshaped_logit)
            if not eval:
                result_labels.append(targets[p])
            else:
                # multi-label: independent sigmoid per label; else softmax
                if 'multi_label' in task:
                    reshaped_logit = sigmoid(reshaped_logit)
                else:
                    reshaped_logit = softmax(reshaped_logit, dim=1)
                logit_prob = reshaped_logit[0].data.tolist()
                logit_label = dict(zip(task_labels, logit_prob))
                result_dict['label_prob'].append({task: logit_label})
                if 'multi_label' in task:
                    # 0.5 threshold per label
                    result_dict['max_item'].append({task: [k for k, v in logit_label.items() if v > 0.5]})
                else:
                    result_dict['max_item'].append({task: [task_labels[logit_prob.index(max(logit_prob))]]})

        if eval:
            outputs = result_dict
        else:
            loss = 0
            for logit, labels, task in zip(result_logits, result_labels, tasks):
                if 'multi_label' in task:
                    loss += self.loss_fct_mt(logit, labels.type_as(logit))
                else:
                    loss += self.loss_fct(logit, labels)
            outputs = loss

        return outputs
120 |
--------------------------------------------------------------------------------
/tfkit/task/clas/preprocessor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from sklearn.preprocessing import MultiLabelBinarizer
3 |
4 | from tfkit.utility import tok
5 | from tfkit.utility.data_filereader import get_multiclas_data_from_file
6 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
7 |
8 |
class Preprocessor(GeneralNLPPreprocessor):
    """Preprocessor for multi-class / multi-label classification data."""

    def read_file_to_data(self, path):
        """Load classification rows from the file at *path*."""
        return get_multiclas_data_from_file(path)

    def preprocess_component_convert_to_id(self, item, **param_dict):
        """Replace the tokenized input with its vocabulary ids."""
        item['input'] = self.tokenizer.convert_tokens_to_ids(item['input'])
        yield item

    def postprocess(self, item, tokenizer, maxlen, **kwargs):
        """Assemble the final tensor dict: task bytes, input ids framed by
        BOS/SEP, an all-ones attention mask, and the encoded target."""
        task_name = item['task']
        framed_ids = [tok.tok_begin_id(tokenizer)] + item['input'] + [tok.tok_sep_id(tokenizer)]
        row_dict = {
            'task': list(task_name.encode("utf-8")),
            'input': framed_ids,
            'mask': [1] * len(framed_ids),
            'target': [-1],  # placeholder when no label is supplied
        }
        if 'target' in item:
            labels = item['target']
            if 'multi_label' in task_name:
                # one-hot vector over the task's full label set
                binarizer = MultiLabelBinarizer(classes=item['task_dict'][task_name])
                row_dict['target'] = binarizer.fit_transform([labels])
            else:
                row_dict['target'] = [item['task_dict'][task_name].index(labels[0])]
        return {key: torch.tensor(value) for key, value in row_dict.items()}
36 |
--------------------------------------------------------------------------------
/tfkit/task/clm/__init__.py:
--------------------------------------------------------------------------------
1 | from .preprocessor import Preprocessor
2 | from .model import Model
3 |
--------------------------------------------------------------------------------
/tfkit/task/clm/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from tfkit.task.clm import Preprocessor
5 | from tfkit.utility.predictor import AutoRegressivePredictor
6 |
7 | dir_path = os.path.dirname(os.path.realpath(__file__))
8 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
9 |
10 | import torch
11 | from torch import nn
12 | from torch.nn.functional import softmax
13 |
14 |
class Model(nn.Module):
    """Causal-LM head: a linear vocabulary projection over a pretrained encoder."""

    def __init__(self, tokenizer, pretrained, maxlen=512, **kwargs):
        super().__init__()
        self.tokenizer = tokenizer
        self.pretrained = pretrained
        # vocab may have grown beyond the pretrained config (added tokens)
        self.vocab_size = max(self.pretrained.config.vocab_size, len(self.tokenizer))
        self.model = nn.Linear(self.pretrained.config.hidden_size, self.vocab_size)
        self.maxlen = maxlen
        predictor = AutoRegressivePredictor(self, Preprocessor)
        self.predictor = predictor
        self.predict = predictor.predict

    def clean_cache(self):
        """Drop cached encoder outputs / attention KV between predictions."""
        self.encoder_outputs = None
        self.past_key_values = None

    def forward(self, batch_data, eval=False, beamsearch=False, max_return=1, **kwargs):
        """Score the batch; return an eval dict (eval=True) or the CE training loss."""
        token_tensor = torch.as_tensor(batch_data['input'])
        attention_tensor = torch.as_tensor(batch_data['mask'])

        encoded = self.pretrained(token_tensor, attention_mask=attention_tensor)
        prediction_scores = self.model(encoded[0])

        if not eval:
            # training: CE against the precomputed shifted targets (-1 = ignored)
            target_tensor = torch.as_tensor(batch_data['target'])
            ce_loss = nn.CrossEntropyLoss(ignore_index=-1)  # -1 index = padding token
            return ce_loss(prediction_scores.view(-1, self.vocab_size), target_tensor.view(-1))

        # eval: inspect the next-token distribution at the decode position only
        result_dict = {}
        decode_pos = batch_data['start'][0]
        token_probs = softmax(prediction_scores[0][decode_pos], dim=-1).flatten()
        best_id = torch.argmax(token_probs, -1).item()
        result_dict['max_item'] = (self.tokenizer.convert_ids_to_tokens(best_id), token_probs[best_id].item())
        if max_return > 1:
            topk = torch.topk(token_probs, max_return)
            candidates = [(self.tokenizer.convert_ids_to_tokens(token_id), prob)
                          for prob, token_id in zip(topk.values.data.tolist(), topk.indices.data.tolist())]
            # NOTE(review): this slice takes the FIRST `max_return` vocab
            # probabilities, not the top-k ones — preserved as-is; confirm intent.
            result_dict['prob_list'] = token_probs.data.tolist()[:max_return]
            result_dict['label_prob'] = candidates
        return result_dict
63 |
--------------------------------------------------------------------------------
/tfkit/task/clm/preprocessor.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from tfkit.utility.data_filereader import get_gen_data_from_file
4 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
5 |
6 |
class Preprocessor(GeneralNLPPreprocessor):
    """Preprocessor for causal-LM (next-token generation) data."""

    def read_file_to_data(self, path):
        """Load (input, target) generation pairs from *path*."""
        return get_gen_data_from_file(path)

    def preprocess_component_convert_to_id(self, item, **param_dict):
        """Convert tokenized input / previous / target strings to vocabulary ids."""
        tokenized_input, target = item['input'], item.get('target', None)
        tokenized_target = self.tokenizer.tokenize(target) if target else None
        previous = item.get("previous", [])
        converted = {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
                     'previous': self.tokenizer.convert_tokens_to_ids(previous)}
        if tokenized_target is not None:
            converted['target'] = self.tokenizer.convert_tokens_to_ids(tokenized_target)
        yield converted

    def postprocess(self, item, tokenizer, maxlen, **kwargs):
        """Build the model input: ids, attention mask, decode start position and
        (for training) the shifted target sequence (-1 = ignored position).

        BUGFIX/cleanup: the previous version built a 0/1 mask in the target
        branch and then unconditionally overwrote it with all ones — the dead
        computation is removed; the resulting mask is unchanged (all ones).
        """
        t_input_id, previous = item['input'], item['previous']
        row_dict = {}
        if 'target' in item:
            target = item['target']
            # ignore loss on the prompt positions, supervise target + SEP
            t_target_id = [-1] * len(t_input_id) + target + [self.tok_sep_id]
            row_dict['start'] = [len(t_input_id)]
            # teacher forcing: feed BOS + target after the prompt
            t_input_id += [self.tok_bos_id] + target
            row_dict['target'] = t_target_id
        else:
            # inference: append SEP and any previously generated tokens
            t_input_id.extend([self.tok_sep_id] + previous)
            row_dict['start'] = [len(t_input_id) - 1]
        row_dict['input'] = t_input_id
        row_dict['mask'] = [1] * len(t_input_id)  # attend over every supplied token
        row_dict['target_pad'] = [-1]
        return {key: torch.tensor(value) for key, value in row_dict.items()}
46 |
--------------------------------------------------------------------------------
/tfkit/task/once/__init__.py:
--------------------------------------------------------------------------------
1 | from .preprocessor import Preprocessor
2 | from .model import Model
3 |
--------------------------------------------------------------------------------
/tfkit/task/once/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from collections import defaultdict
4 |
5 | from tfkit.task.once import Preprocessor
6 | from tfkit.utility.predictor import NonAutoRegressivePredictor
7 |
8 | dir_path = os.path.dirname(os.path.realpath(__file__))
9 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
10 |
11 | from torch.nn.functional import softmax
12 | from tfkit.utility.loss import *
13 | from tfkit.utility.tok import *
14 |
15 |
class Model(nn.Module):
    """One-shot (non-autoregressive) generation head: predict every target
    token in a single forward pass of the pretrained encoder."""

    def __init__(self, tokenizer, pretrained, maxlen=512, tasks_detail=None):
        super().__init__()
        self.tokenizer = tokenizer
        self.pretrained = pretrained
        # vocab may exceed the pretrained config when tokens were added
        self.vocab_size = max(self.pretrained.config.vocab_size, self.tokenizer.__len__())
        self.model = nn.Linear(self.pretrained.config.hidden_size, self.vocab_size)
        self.maxlen = maxlen

        predictor = NonAutoRegressivePredictor(self, Preprocessor)
        self.predictor = predictor
        self.predict = predictor.predict

    def clean_cache(self):
        """Reset cached encoder state between predictions."""
        self.encoder_outputs = None
        self.past_key_values = None

    def forward(self, batch_data, eval=False, max_return=1, **kwargs):
        """Training: CE loss over target positions (+ negative CE when a
        non-trivial ntarget is supplied).  Eval: greedily read positions from
        'start' until SEP or maxlen, tracking up to `max_return` candidates.
        """
        tokens_tensor = torch.as_tensor(batch_data['input'])
        mask_tensors = torch.as_tensor(batch_data['mask'])

        output = self.pretrained(tokens_tensor, attention_mask=mask_tensors)
        sequence_output = output[0]
        prediction_scores = self.model(sequence_output)

        if eval:
            result_dict = {
                'max_item': [],
                'label_prob': defaultdict(list),
                'prob_list': []
            }
            start = batch_data['start'][0]
            stop = False
            # BUGFIX: `[[]] * max_return` aliased ONE shared list for every
            # candidate, so each candidate accumulated every token; build
            # independent lists instead.
            topK_ids = [[] for _ in range(max_return)]
            topK_probs = [1] * max_return
            while start < self.maxlen and not stop:
                softmax_score = softmax(prediction_scores[0][start], dim=0)
                max_item_id = torch.argmax(softmax_score, -1).item()
                max_item_prob = softmax_score[max_item_id].item()
                if max_return > 1:
                    topK = torch.topk(softmax_score, max_return)
                    for k, (prob, tid) in enumerate(zip(topK.values.data.tolist(), topK.indices.data.tolist())):
                        topK_ids[k].append(tid)
                        topK_probs[k] *= prob
                else:
                    topK_ids[0].append(max_item_id)
                    topK_probs[0] *= max_item_prob

                if tok_sep_id(self.tokenizer) == max_item_id:
                    stop = True  # greedy best hit SEP: stop decoding
                start += 1
            result_dict['prob_list'] = topK_probs
            result_dict['label_prob'] = [[self.tokenizer.decode(ids), prob] for ids, prob in
                                         zip(topK_ids, topK_probs)]
            result_dict['max_item'] = [i[0] for i in result_dict['label_prob']]
            outputs = result_dict
        else:
            targets = batch_data['target']
            negative_targets = batch_data['ntarget']
            loss_tensors = torch.as_tensor(targets)
            negativeloss_tensors = torch.as_tensor(negative_targets)
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)  # -1 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size),
                                      loss_tensors.view(-1))
            # unlikelihood-style penalty when any real negative target exists
            if not torch.all(negativeloss_tensors.eq(-1)).item():
                negative_loss_fct = NegativeCElLoss()
                negative_loss = negative_loss_fct(prediction_scores.view(-1, self.vocab_size),
                                                  negativeloss_tensors.view(-1))
                masked_lm_loss += negative_loss
            outputs = masked_lm_loss

        return outputs
92 |
--------------------------------------------------------------------------------
/tfkit/task/once/preprocessor.py:
--------------------------------------------------------------------------------
1 | import tfkit.utility.tok as tok
2 | from tfkit.utility.data_filereader import get_gen_data_from_file
3 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
4 |
5 |
class Preprocessor(GeneralNLPPreprocessor):
    """Preprocessor for one-shot (non-autoregressive) generation data."""

    def read_file_to_data(self, path):
        """Load (input, target[, ntarget]) generation rows from *path*."""
        return get_gen_data_from_file(path)

    def set_global_parameters(self):
        # target side must be tokenized so ids can be produced below
        self.tokenize_target = True

    def preprocess_component_convert_to_id(self, item, likelihood=['none', 'pos', 'neg', 'both'], **param_dict):
        """Yield id-converted samples; when likelihood contains 'neg', also
        yield one extra sample per negative-target text.

        NOTE(review): mutable default for `likelihood` — never mutated here,
        so harmless, but worth confirming the intended default is 'none'
        (only element [0] is used when a list is passed).
        """
        likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood
        tokenized_input, tokenized_target, n_target = item['input'], item.get('target', None), item.get('ntarget', None)
        yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
               'target': self.tokenizer.convert_tokens_to_ids(tokenized_target)}
        if "neg" in likelihood:
            # formatting neg data in csv
            if n_target is None:
                # no explicit negative: fall back to SEP + the positive target text
                ntext_arr = [
                    tok.tok_sep(self.tokenizer) + self.tokenizer.convert_tokens_to_string(tokenized_target)]
            elif tok.tok_sep(self.tokenizer) in n_target:
                # several negatives separated by the SEP token
                ntext_arr = [ntext.strip() for ntext in n_target.split(tok.tok_sep(self.tokenizer))]
            else:
                ntext_arr = [n_target.strip()]
            for neg_text in ntext_arr:
                yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
                       'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
                       'ntarget': self.tokenizer.convert_tokens_to_ids(neg_text)}

    def postprocess(self, item, tokenizer, maxlen, **kwargs):
        """Pad the input to *maxlen* and build target / ntarget / length
        fields.  Returns a plain dict of lists (tensors are built downstream)."""
        tok_pad = tok.tok_pad_id(tokenizer)
        tok_bos = tok.tok_begin_id(tokenizer)  # unused below — kept as-is
        tok_sep = tok.tok_sep_id(tokenizer)
        tok_mask = tok.tok_mask_id(tokenizer)  # unused below — kept as-is

        row_dict = {}
        t_input_id = item['input']
        # attention mask: 1 over real tokens, 0 over padding
        encoder_mask_id = [1] * (len(t_input_id))
        encoder_mask_id.extend([0] * (maxlen - len(encoder_mask_id)))
        # the generation target occupies everything after the input
        target_start = len(t_input_id)
        target_end = maxlen
        target_length = target_end - target_start
        # NOTE: extend mutates item['input'] in place
        t_input_id.extend([tok_pad] * (maxlen - len(t_input_id)))
        if 'target' in item and item['target'] is not None:
            target = item['target'] + [tok_sep]
            target.extend([-1] * (maxlen - len(target)))  # -1 = ignored by the loss
            row_dict['target'] = target
        row_dict['ntarget'] = [-1] * maxlen
        if 'ntarget' in item and len(item['ntarget'].strip()) > 0:
            # NOTE(review): item['ntarget'] holds a list of ids at this point,
            # but .strip() is a str method — this branch looks like it would
            # raise; confirm the actual runtime type of 'ntarget'.
            tokenized_ntarget_id = item['ntarget']
            tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
            if len(tokenized_ntarget_id) <= maxlen:
                row_dict['ntarget'] = tokenized_ntarget_id

        input_length = min(maxlen, target_start * 3)  # heuristic cap — TODO confirm rationale
        row_dict['input'] = t_input_id
        row_dict['mask'] = encoder_mask_id
        row_dict['start'] = target_start
        row_dict['end'] = maxlen
        row_dict['input_length'] = input_length
        row_dict['target_length'] = target_length
        return row_dict
65 |
--------------------------------------------------------------------------------
/tfkit/task/oncectc/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import Model
2 | from tfkit.task.once.preprocessor import Preprocessor
3 |
--------------------------------------------------------------------------------
/tfkit/task/oncectc/model.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from collections import defaultdict
4 |
5 | from tfkit.task.once import Preprocessor
6 | from tfkit.utility.predictor import NonAutoRegressivePredictor
7 |
8 | dir_path = os.path.dirname(os.path.realpath(__file__))
9 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
10 |
11 | from torch.nn.functional import softmax
12 | from tfkit.utility.loss import *
13 | from tfkit.utility.tok import *
14 | from tfkit.utility.loss import SeqCTCLoss
15 |
16 |
class Model(nn.Module):
    """One-shot generation head trained with a CTC alignment loss in addition
    to position-wise cross-entropy.

    A dedicated blank token is appended to the tokenizer/embedding so CTC can
    emit "no output" frames.
    NOTE(review): the blank token is the empty string "" — confirm the
    tokenizer actually registers it as a distinct token.
    """

    def __init__(self, tokenizer, pretrained, maxlen=512, tasks_detail=None):
        super().__init__()
        self.tokenizer = tokenizer
        self.pretrained = pretrained
        self.maxlen = maxlen
        self.blank_token = ""
        self.tokenizer.add_tokens(self.blank_token)
        self.pretrained.resize_token_embeddings(len(tokenizer))
        self.blank_index = self.tokenizer.convert_tokens_to_ids([self.blank_token])[0]
        self.loss = SeqCTCLoss(blank_index=self.blank_index)
        self.vocab_size = max(self.pretrained.config.vocab_size, self.tokenizer.__len__())
        self.model = nn.Linear(self.pretrained.config.hidden_size, self.vocab_size)
        predictor = NonAutoRegressivePredictor(self, Preprocessor)
        self.predictor = predictor
        self.predict = predictor.predict

    def clean_cache(self):
        """Reset cached encoder state between predictions."""
        self.encoder_outputs = None
        self.past_key_values = None

    def forward(self, batch_data, eval=False, max_return=1, **kwargs):
        """Training returns CTC loss + CE loss (+ negative CE when ntarget is
        non-trivial); eval returns a dict of CTC-decoded candidates."""
        tokens_tensor = torch.as_tensor(batch_data['input'])
        mask_tensors = torch.as_tensor(batch_data['mask'])

        output = self.pretrained(tokens_tensor, attention_mask=mask_tensors)
        sequence_output = output[0]
        prediction_scores = self.model(sequence_output)
        batch_size = list(tokens_tensor.shape)[0]
        prediction_scores = prediction_scores.view(batch_size, -1, self.vocab_size)

        if eval:
            result_dict = {
                'max_item': [],
                'label_prob': defaultdict(list),
                'prob_list': []
            }
            # BUGFIX: `[[]] * max_return` aliased one shared list for every
            # candidate; build independent lists instead.
            topK_ids = [[] for _ in range(max_return)]
            topK_probs = [1] * max_return

            pscore = prediction_scores.detach().cpu()
            predicted_indexs = pscore.argmax(2).tolist()[0]
            predicted_tokens = self.tokenizer.convert_ids_to_tokens(predicted_indexs)
            # Greedy CTC decode: collapse consecutive duplicate frames, drop
            # blank/pad frames, stop at SEP.
            # BUGFIX: the previous dedup compared against a list that was never
            # appended to, so repeats were never collapsed; track the previous
            # frame's index explicitly.
            last_frame_index = None
            for pos, (predicted_index, predicted_token) in enumerate(zip(predicted_indexs, predicted_tokens)):
                is_repeat = predicted_index == last_frame_index
                last_frame_index = predicted_index
                if is_repeat:
                    continue
                if predicted_token == self.blank_token:
                    continue
                if predicted_token == tok_pad(self.tokenizer):
                    continue
                if predicted_token == tok_sep(self.tokenizer):
                    break

                softmax_score = softmax(prediction_scores[0][pos], dim=0)
                max_item_id = torch.argmax(softmax_score, -1).item()
                max_item_prob = softmax_score[max_item_id].item()
                if max_return > 1:
                    topK = torch.topk(softmax_score, max_return)
                    for k, (prob, tid) in enumerate(zip(topK.values.data.tolist(), topK.indices.data.tolist())):
                        topK_ids[k].append(tid)
                        topK_probs[k] *= prob
                else:
                    topK_ids[0].append(max_item_id)
                    topK_probs[0] *= max_item_prob

            result_dict['prob_list'] = topK_probs
            result_dict['label_prob'] = [[self.tokenizer.decode(ids), prob] for ids, prob in
                                         zip(topK_ids, topK_probs)]
            result_dict['max_item'] = [i[0] for i in result_dict['label_prob']]
            outputs = result_dict
        else:
            target_tensors = torch.as_tensor(batch_data['target'])
            negativeloss_tensors = torch.as_tensor(batch_data['ntarget'])
            input_length_tensors = torch.as_tensor(batch_data['input_length'])
            target_length_tensors = torch.as_tensor(batch_data['target_length'])

            # CTC alignment loss over the whole frame sequence
            ctc_lm_loss = self.loss(prediction_scores,
                                    input_length_tensors,
                                    target_tensors.view(batch_size, -1),
                                    target_length_tensors)

            # position-wise CE (-1 index = padding token, ignored)
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size),
                                      target_tensors.view(-1))
            # unlikelihood-style penalty when any real negative target exists
            if not torch.all(negativeloss_tensors.eq(-1)).item():
                negative_loss_fct = NegativeCElLoss()
                negative_loss = negative_loss_fct(prediction_scores.view(-1, self.vocab_size),
                                                  negativeloss_tensors.view(-1))
                masked_lm_loss += negative_loss
            outputs = ctc_lm_loss + masked_lm_loss

        return outputs
122 |
--------------------------------------------------------------------------------
/tfkit/task/qa/__init__.py:
--------------------------------------------------------------------------------
1 | from .preprocessor import Preprocessor
2 | from .model import Model
3 |
--------------------------------------------------------------------------------
/tfkit/task/qa/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from tfkit.utility.predictor import QuestionAnsweringPredictor
5 |
6 | dir_path = os.path.dirname(os.path.realpath(__file__))
7 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
8 |
9 | import torch
10 | import torch.nn as nn
11 | from torch.nn.functional import softmax
12 | from tfkit.task.qa.preprocessor import Preprocessor
13 |
14 |
class Model(nn.Module):
    """Extractive question-answering head.

    Projects each token's encoder hidden state to two logits (answer-span
    start / end) and trains with cross-entropy against the gold span.
    """

    def __init__(self, tokenizer, pretrained, maxlen=128, dropout=0.1, **kwargs):
        super().__init__()
        self.tokenizer = tokenizer
        self.pretrained = pretrained
        self.maxlen = maxlen

        # NOTE(review): this dropout layer is never applied in forward() —
        # confirm whether it should wrap the encoder output.
        self.dropout = nn.Dropout(dropout)
        # -1 target positions are ignored (padding / unanswerable)
        self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
        # two logits per token: span start and span end
        self.qa_classifier = nn.Linear(self.pretrained.config.hidden_size, 2)

        predictor = QuestionAnsweringPredictor(self, Preprocessor)
        self.predictor = predictor
        self.predict = predictor.predict

    def forward(self, batch_data, eval=False, **kwargs):
        """Score a batch.

        Training (eval=False) returns the averaged start/end CE loss; eval
        returns a dict with per-position probabilities ('label_prob_all') and
        the argmax span ('label_map').
        NOTE(review): 'target' is read unconditionally, so eval batches must
        still carry a target field — confirm the predictor always supplies it.
        """
        inputs = torch.as_tensor(batch_data['input'])
        masks = torch.as_tensor(batch_data['mask'])
        targets = torch.as_tensor(batch_data['target'])
        start_positions, end_positions = targets.split(1, dim=1)
        start_positions = start_positions.squeeze(1)
        end_positions = end_positions.squeeze(1)

        output = self.pretrained(inputs, attention_mask=masks)[0]
        logits = self.qa_classifier(output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if eval:
            result_dict = {
                'label_prob_all': [],
                'label_map': []
            }
            reshaped_start_logits = softmax(start_logits, dim=1)
            reshaped_end_logits = softmax(end_logits, dim=1)
            start_prob = reshaped_start_logits.data.tolist()[0]
            end_prob = reshaped_end_logits.data.tolist()[0]
            result_dict['label_prob_all'].append({'start': dict(zip(range(len(start_prob)), start_prob)),
                                                  'end': dict(zip(range(len(end_prob)), end_prob))})
            result_dict['label_map'].append({'start': start_prob.index(max(start_prob)),
                                             'end': end_prob.index(max(end_prob))})
            outputs = result_dict
        else:
            start_loss = self.loss_fct(start_logits, start_positions)
            end_loss = self.loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = total_loss

        return outputs
72 |
--------------------------------------------------------------------------------
/tfkit/task/qa/preprocessor.py:
--------------------------------------------------------------------------------
1 | import nlp2
2 | import tfkit.utility.tok as tok
3 | import torch
4 | from tfkit.utility.data_filereader import get_qa_data_from_file
5 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
6 |
7 |
class Preprocessor(GeneralNLPPreprocessor):
    """Preprocessor for extractive QA: converts character-level answer
    indices into token-level start/end positions."""

    def read_file_to_data(self, path):
        """Load QA rows (context+question, answer span indices) from *path*."""
        return get_qa_data_from_file(path)

    def preprocess_component_prepare_input(self, item):
        """Record, for each input character, the position of its FIRST subword
        token; position counting starts at 1 because CLS occupies 0."""
        mapping_index = []
        pos = 1  # cls as start 0
        input_text_list = nlp2.split_sentence_to_array(item['input'])
        for i in input_text_list:
            for _ in range(len(self.tokenizer.tokenize(i))):
                if _ < 1:
                    # only the first subword of a character gets a mapping entry
                    mapping_index.append({'char': i, 'pos': pos})
                pos += 1
        item['mapping_index'] = mapping_index
        return item

    def preprocess_component_convert_to_id(self, item, **param_dict):
        """Convert tokens to ids and shift the answer span from character
        indices to token indices, widening it by the extra subwords each
        preceding character produced."""
        input_text, target = item['input'], item.get('target', None)
        tokenized_input = [tok.tok_begin(self.tokenizer)] + input_text + [tok.tok_sep(self.tokenizer)]
        input_id = self.tokenizer.convert_tokens_to_ids(tokenized_input)
        # window bounds of the visible slice of the full context
        start_index = item['input_index'][0]
        end_index = item['input_index'][1]
        if target:
            item['target'] = [0, 0]  # default: no answer in this window
            target_start, target_end = target
            ori_start = target_start = int(target_start)
            ori_end = target_end = int(target_end)
            ori_ans = tokenized_input[ori_start:ori_end]  # kept for debugging — unused
            # make the span relative to the visible window
            target_start -= start_index
            target_end -= start_index
            # print("target_start", self.parameters['maxlen'],item['mapping_index'][target_start]['pos'],ori_end)
            # if item['mapping_index'][target_start]['pos'] > ori_end or target_start < 0 \
            #     or target_start > self.parameters['maxlen'] \
            #     or target_end >= self.parameters['maxlen'] - 2:
            #     target_start = 0
            #     target_end = 0
            # else:
            # each multi-subword character before the span shifts it right
            for map_pos, map_tok in enumerate(item['mapping_index'][start_index:]):
                if start_index < map_tok['pos'] <= end_index:
                    length = len(self.tokenizer.tokenize(map_tok['char']))
                    if map_pos < ori_start:
                        target_start += length - 1
                    if map_pos < ori_end:
                        target_end += length - 1
            item['target'] = [target_start + 1, target_end + 1]  # cls +1

        item['input'] = input_id
        item['mask'] = [1] * len(input_id)
        item['raw_input'] = tokenized_input
        yield item

    def postprocess(self, item, tokenizer, maxlen, **kwargs):
        """Pack input/mask (and the target span, when training) into tensors."""
        row_dict = {
            'input': item['input'],
            'mask': item['mask']
        }
        if 'target' in item:
            row_dict['target'] = item['target']
        return {key: torch.tensor(value) for key, value in row_dict.items()}
67 |
--------------------------------------------------------------------------------
/tfkit/task/seq2seq/__init__.py:
--------------------------------------------------------------------------------
1 | from .preprocessor import Preprocessor
2 | from .model import Model
3 |
--------------------------------------------------------------------------------
/tfkit/task/seq2seq/model.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | import torch
4 | from torch import nn
5 | from torch.nn.functional import softmax
6 | from transformers import AutoModel
7 | import torch.nn.functional as F
8 | from tfkit.task.seq2seq import Preprocessor
9 | from tfkit.utility.loss import NegativeCElLoss, SelfKDLoss
10 | from tfkit.utility.model import tie_encoder_decoder_weights
11 | from tfkit.utility.predictor import AutoRegressivePredictor
12 |
13 |
class Model(nn.Module):
    """Sequence-to-sequence generation head.

    Supports two backbone layouts:

    * ``pretrained`` is already an encoder-decoder model (has a ``decoder``
      attribute): it is used as-is and ``self.decoder_model`` stays ``None``.
    * ``pretrained`` is encoder-only: a decoder with cross-attention is built
      from a copy of the encoder config and weight-tied to the encoder.

    A bias-free linear layer (optionally initialised from the backbone's
    shared embedding) projects decoder hidden states to vocabulary logits.
    """

    def __init__(self, tokenizer, pretrained, maxlen=512, selfkd=False, **kwargs):
        """
        Args:
            tokenizer: huggingface tokenizer matching ``pretrained``.
            pretrained: huggingface backbone model.
            maxlen: maximum sequence length used at prediction time.
            selfkd: enable self knowledge-distillation between intermediate
                decoder layers and the final logits during training.
        """
        super().__init__()
        self.maxlen = maxlen
        self.tokenizer = tokenizer
        self.pretrained = pretrained
        self.selfkd = selfkd
        # Encoder hidden states of the most recent forward pass; consumed by
        # the back-translation term in ``calculate_loss``. (The original
        # never assigned this attribute, so the btarget loss raised
        # AttributeError.)
        self.encoder_hidden = None
        self.decoder_model, init_weight = self.initialize_decoder()
        # Added tokens can make the tokenizer larger than the config vocab.
        self.vocab_size = max(self.pretrained.config.vocab_size, len(self.tokenizer))
        self.model = nn.Linear(self.decoder_hidden_size, self.vocab_size, bias=False)
        if init_weight is not None:
            self.model.weight = init_weight
        self.predictor = AutoRegressivePredictor(self, Preprocessor)
        self.predict = self.predictor.predict

    def initialize_decoder(self):
        """Return ``(decoder_model, init_weight)`` for the configured backbone.

        ``decoder_model`` is ``None`` when ``pretrained`` already contains a
        decoder; ``init_weight`` is the shared embedding weight when one is
        available, used to initialise the output projection. Also sets
        ``self.decoder_hidden_size`` as a side effect.
        """
        init_weight = None

        if hasattr(self.pretrained, 'decoder'):
            # Encoder-decoder backbone: nothing extra to build.
            decoder_model = None
            self.decoder_hidden_size = self.pretrained.config.hidden_size
            if hasattr(self.pretrained, 'shared'):
                init_weight = copy.deepcopy(self.pretrained.shared.weight)
        else:
            # Encoder-only backbone: build a weight-tied decoder with
            # cross-attention enabled.
            decoder_config = copy.deepcopy(self.pretrained.config)
            decoder_config.is_decoder = True
            decoder_config.add_cross_attention = True
            decoder_model = AutoModel.from_config(decoder_config)
            tie_encoder_decoder_weights(self.pretrained, decoder_model, decoder_model.base_model_prefix)
            self.decoder_hidden_size = decoder_config.hidden_size

        return decoder_model, init_weight

    def forward(self, batch_data, eval=False, beamsearch=False, max_return=1, **kwargs):
        """Run one forward pass: loss tensor in train mode, dict in eval mode."""
        if self.decoder_model:
            prediction_output, prediction_all_hidden = self.decoder_forward(batch_data, eval)
        else:
            prediction_output, prediction_all_hidden = self.encoder_forward(batch_data, eval, beamsearch)

        prediction_scores = self.model(prediction_output)

        if eval:
            outputs = self.process_eval_output(prediction_scores, max_return)
        else:
            outputs = self.calculate_loss(batch_data, prediction_scores, prediction_all_hidden)
        return outputs

    def decoder_forward(self, batch_data, eval):
        """Forward pass for the separate-decoder layout (encoder-only backbone)."""
        input_tensors = torch.as_tensor(batch_data['input'])
        prev_tensors = torch.as_tensor(batch_data['prev'])
        encoder_mask_tensors = torch.as_tensor(batch_data['encoder_mask'])
        decoder_mask_tensors = torch.as_tensor(batch_data['decoder_mask'])

        # Bug fix: the encoder output used to be discarded (and skipped
        # entirely during eval), so the decoder's cross-attention never saw
        # the source sequence and ``self.encoder_hidden`` stayed undefined
        # for the back-translation loss. Feed the encoder states through.
        encoder_outputs = self.pretrained(input_tensors, attention_mask=encoder_mask_tensors)
        self.encoder_hidden = encoder_outputs[0]
        prediction = self.decoder_model(
            input_ids=prev_tensors,
            attention_mask=decoder_mask_tensors,
            encoder_hidden_states=self.encoder_hidden,
            encoder_attention_mask=encoder_mask_tensors,
            output_hidden_states=self.selfkd,
            use_cache=False,
            return_dict=True,
        )
        prediction_output = prediction['last_hidden_state']
        prediction_all_hidden = prediction.get('hidden_states')
        return prediction_output, prediction_all_hidden

    def encoder_forward(self, batch_data, eval, beamsearch):
        """Forward pass for an integrated encoder-decoder backbone."""
        input_tensors = torch.as_tensor(batch_data['input'])
        prev_tensors = torch.as_tensor(batch_data['prev'])
        encoder_mask_tensors = torch.as_tensor(batch_data['encoder_mask'])
        decoder_mask_tensors = torch.as_tensor(batch_data['decoder_mask'])

        prediction = self.pretrained(
            input_ids=input_tensors,
            attention_mask=encoder_mask_tensors,
            decoder_input_ids=prev_tensors,
            decoder_attention_mask=decoder_mask_tensors,
            output_hidden_states=self.selfkd,
            use_cache=False,
            return_dict=True
        )
        # Keep encoder states for the optional back-translation loss.
        self.encoder_hidden = prediction.get('encoder_last_hidden_state')
        prediction_output = prediction['last_hidden_state']
        prediction_all_hidden = prediction.get('decoder_hidden_states')
        return prediction_output, prediction_all_hidden

    def process_eval_output(self, prediction_scores, max_return):
        """Convert logits of the first decoding position into a result dict."""
        result_dict = {}
        softmax_score = softmax(prediction_scores[0][0], dim=0)
        max_item_id = torch.argmax(softmax_score, -1).item()
        max_item_prob = softmax_score[max_item_id].item()
        result_dict['max_item'] = (self.tokenizer.convert_ids_to_tokens(max_item_id), max_item_prob)

        if max_return > 1:
            topK = torch.topk(softmax_score, max_return)
            prob_result = [(self.tokenizer.convert_ids_to_tokens(tid), prob) for prob, tid in
                           zip(topK.values.data.tolist(), topK.indices.data.tolist())]
            # NOTE(review): this slices the first ``max_return`` vocabulary
            # ids, not the top-k probabilities — kept as-is for
            # compatibility; verify downstream consumers before changing.
            result_dict['prob_list'] = softmax_score.data.tolist()[:max_return]
            result_dict['label_prob'] = prob_result

        return result_dict

    def calculate_loss(self, batch_data, prediction_scores, prediction_all_hidden):
        """Cross-entropy LM loss plus optional self-KD, back-translation and
        negative-likelihood terms."""
        targets = batch_data['target']
        negative_targets = batch_data['ntarget']
        loss_tensors = torch.as_tensor(targets)
        loss_fct = nn.CrossEntropyLoss(ignore_index=-1)  # -1 index = padding token
        lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size),
                           loss_tensors.view(-1))

        if self.selfkd:
            # Distil every intermediate decoder layer towards the final logits.
            selfkdloss_fct = SelfKDLoss(ignore_index=-1)
            for decoder_hidden in prediction_all_hidden[:-1]:
                student = self.model(decoder_hidden)
                lm_loss += selfkdloss_fct(student.view(-1, self.vocab_size),
                                          prediction_scores.view(-1, self.vocab_size), loss_tensors.view(-1))

        if 'btarget' in batch_data:
            backtran_tensors = torch.as_tensor(batch_data['btarget'])
            # All -1 means "no back-translation target in this batch".
            if not torch.all(backtran_tensors.eq(-1)).item():
                backtran_predation = self.pretrained(
                    input_ids=backtran_tensors,
                    output_hidden_states=True,
                    return_dict=True
                )
                backtran_hidden = backtran_predation['encoder_last_hidden_state']
                backtran_loss = F.cosine_similarity(self.encoder_hidden, backtran_hidden).mean()
                lm_loss += backtran_loss

        negativeloss_tensors = torch.as_tensor(negative_targets)
        if not torch.all(negativeloss_tensors.eq(-1)).item():
            negative_loss_fct = NegativeCElLoss(ignore_index=-1)
            negative_loss = negative_loss_fct(prediction_scores.view(-1, self.vocab_size),
                                              negativeloss_tensors.view(-1))
            lm_loss += negative_loss

        return lm_loss
150 |
--------------------------------------------------------------------------------
/tfkit/task/seq2seq/preprocessor.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import tfkit.utility.tok as tok
4 | from tfkit.utility.data_filereader import get_gen_data_from_file
5 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
6 |
7 |
class Preprocessor(GeneralNLPPreprocessor):
    """Preprocessor for sequence-to-sequence generation data.

    Converts tokenized input/target rows into id dicts and pads/aligns them
    for the seq2seq model, including optional negative-likelihood
    (``ntarget``) and back-translation (``btarget``) targets.
    """

    def read_file_to_data(self, path):
        """Load generation rows (input/target pairs) from ``path``."""
        return get_gen_data_from_file(path)

    def set_global_parameters(self):
        # Targets are text and must be tokenized alongside the input.
        self.tokenize_target = True

    def preprocess_component_convert_to_id(self, item, likelihood=['none', 'pos', 'neg', 'both'], **param_dict):
        """Yield id dicts for one tokenized item.

        ``likelihood`` selects which auxiliary targets are emitted:
        'none' (plain LM target), 'neg' (adds ntarget rows), 'pos' (adds a
        whole-sentence-masked row) or 'both'. Items without a target yield
        inference-only rows; items with a ``btarget`` yield a
        back-translation row instead.
        """
        # The likelihood list default is the menu of options; callers pass a
        # single choice (possibly wrapped in a list by the CLI parser).
        likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood
        tokenized_input, tokenized_target, n_target, b_target = item['input'], \
                                                                item.get('target', None), \
                                                                item.get('ntarget', None), \
                                                                item.get('btarget', None)
        previous = item.get("previous", [])
        if tokenized_target is None:
            # Inference: no target available, just encode input + context.
            yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
                   'previous': self.tokenizer.convert_tokens_to_ids(previous)}
        elif b_target and len(b_target) > 0:
            # Back-translation row: btarget is raw text, hence ``encode``.
            yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
                   'previous': self.tokenizer.convert_tokens_to_ids(previous),
                   'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
                   'btarget': self.tokenizer.encode(b_target)}
        else:
            if "neg" in likelihood or 'both' in likelihood:
                # formatting neg data in csv
                if n_target is None:
                    # No explicit negative: use sep + the target text itself.
                    ntext_arr = [
                        tok.tok_sep(self.tokenizer) + self.tokenizer.convert_tokens_to_string(tokenized_target)]
                elif tok.tok_sep(self.tokenizer) in n_target:
                    # Multiple negatives separated by the sep token.
                    ntext_arr = [ntext.strip() for ntext in n_target.split(tok.tok_sep(self.tokenizer))]
                else:
                    ntext_arr = [n_target.strip()]
                # One training row per negative example.
                for neg_text in ntext_arr:
                    yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
                           'previous': self.tokenizer.convert_tokens_to_ids(previous),
                           'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
                           'ntarget': self.tokenizer.encode(neg_text)}
            else:
                # Plain LM row without auxiliary targets.
                yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
                       'previous': self.tokenizer.convert_tokens_to_ids(previous),
                       'target': self.tokenizer.convert_tokens_to_ids(tokenized_target)}

            # whole sentence masking
            if 'pos' in likelihood:
                # ``previous`` becomes a run of mask tokens as long as the
                # target, so the decoder predicts the whole sentence at once.
                yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
                       'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
                       'previous': self.tokenizer.convert_tokens_to_ids(
                           [tok.tok_mask(self.tokenizer)] * len(tokenized_target))}
            elif 'both' in likelihood:
                # ``ntext_arr`` is guaranteed defined here: 'both' also took
                # the neg branch above.
                for neg_text in ntext_arr:
                    yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
                           'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
                           'previous': self.tokenizer.convert_tokens_to_ids(
                               [tok.tok_mask(self.tokenizer)] * len(tokenized_target)),
                           'ntarget': self.tokenizer.encode(neg_text)}

    def postprocess(self, item, tokenizer, maxlen, **kwargs):
        """Build model-ready tensors (prev/target/masks) from an id dict."""
        t_input_id, previous = item['input'], item['previous']
        row_dict = {}
        if 'target' in item:
            target = item['target']
            tokenized_target_id = []
            if len(previous) == len(target):
                # Whole-sentence-masking row (previous is all mask tokens):
                # feed a full-length mask sequence to the decoder.
                tokenized_prev_id = [self.tok_mask_id] * maxlen
            else:
                # Teacher forcing: sep + shifted target as decoder input.
                tokenized_prev_id = [self.tok_sep_id] + target
            tokenized_target_id.extend(target + [self.tok_sep_id])
            row_dict['target'] = tokenized_target_id
            row_dict['target_pad'] = [-1]  # pad value for dynamic batching
            row_dict['prev'] = tokenized_prev_id
            # -1 everywhere = "no negative target" sentinel.
            row_dict['ntarget'] = [-1] * maxlen
            if 'ntarget' in item and len(item['ntarget']) > 0:
                tokenized_ntarget_id = item['ntarget']
                if len(tokenized_ntarget_id) <= maxlen:
                    row_dict['ntarget'] = tokenized_ntarget_id
            if 'btarget' in item and len(item['btarget']) > 0:
                row_dict['btarget'] = tokenizer.encode(item['btarget'])
        else:
            # Inference: decoder input is sep followed by generated context.
            tokenized_prev_id = [self.tok_sep_id]
            tokenized_prev_id.extend(previous)
            row_dict['prev'] = tokenized_prev_id

        row_dict['input'] = t_input_id
        row_dict['encoder_mask'] = [1] * len(t_input_id)
        row_dict['decoder_mask'] = [1] * len(tokenized_prev_id)
        return {key: torch.tensor(value) for key, value in row_dict.items()}
94 |
--------------------------------------------------------------------------------
/tfkit/task/tag/__init__.py:
--------------------------------------------------------------------------------
1 | from .preprocessor import Preprocessor
2 | from .model import Model
3 |
--------------------------------------------------------------------------------
/tfkit/task/tag/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from collections import Counter
4 |
5 | import torch
6 | from torch import nn
7 | from torch.nn.functional import softmax
8 |
9 | from tfkit.task.tag import Preprocessor
10 | from tfkit.utility.loss import FocalLoss
11 | from tfkit.utility.predictor import TaggingPredictor
12 |
13 | dir_path = os.path.dirname(os.path.realpath(__file__))
14 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
15 |
16 |
class Model(nn.Module):
    """Sequence-tagging head: pretrained encoder + dropout + linear classifier."""

    def __init__(self, tokenizer, pretrained, tasks_detail, maxlen=512, dropout=0.2, **kwargs):
        """
        Args:
            tokenizer: tokenizer paired with ``pretrained``.
            pretrained: huggingface encoder backbone.
            tasks_detail: mapping of task name to its label list; only the
                first entry is used.
            maxlen: maximum input length.
            dropout: dropout rate applied to the encoder output.
        """
        super().__init__()
        self.initialize_components(tokenizer, pretrained, tasks_detail, maxlen, dropout)

    def initialize_components(self, tokenizer, pretrained, tasks_detail, maxlen, dropout):
        """Create the tagging layers, loss function and predictor."""
        labels = list(tasks_detail.values())[0]  # single-task: first label set
        self.tokenizer = tokenizer
        self.pretrained = pretrained
        self.dropout = nn.Dropout(dropout)
        self.tagger = nn.Linear(self.pretrained.config.hidden_size, len(labels))
        self.labels = labels
        self.maxlen = maxlen
        self.loss_fct = FocalLoss()
        # (removed the original's no-op ``self.pretrained = self.pretrained``
        # and ``self.loss_fct = self.loss_fct`` self-assignments)

        predictor = TaggingPredictor(self, Preprocessor)
        self.predictor = predictor
        self.predict = predictor.predict

    def forward(self, batch_data, eval=False, separator=" ", **kwargs):
        """Return a loss tensor in train mode or a result dict in eval mode."""
        inputs = batch_data["input"]
        masks = batch_data["mask"]

        bert_output = self.compute_bert_output(inputs, masks)

        if eval:
            outputs = self.compute_eval_output(batch_data, bert_output)
        else:
            outputs = self.compute_loss_output(batch_data, bert_output)

        return outputs

    def compute_bert_output(self, inputs, masks):
        """Encode ``inputs`` and project to per-token label logits."""
        token_tensor = torch.as_tensor(inputs, dtype=torch.long)
        mask_tensors = torch.as_tensor(masks)
        bert_output = self.pretrained(token_tensor, attention_mask=mask_tensors)
        res = bert_output[0]
        pooled_output = self.dropout(res)
        reshaped_logits = self.tagger(pooled_output)

        return reshaped_logits

    def compute_eval_output(self, batch_data, reshaped_logits):
        """Map per-token logits back onto words, averaging sub-word pieces."""
        result_dict = {
            'label_prob_all': [],
            'label_map': []
        }

        ilogit = softmax(reshaped_logits[0], dim=1)
        result_labels = ilogit.data.tolist()
        start, end = batch_data['pos'][0]
        token_word_mapping = batch_data['token_word_mapping']

        # (renamed the loop index so it is no longer clobbered by the mapped
        # word position returned from ``compute_word_pos``)
        for token_pos, logit_prob in enumerate(result_labels[1:]):  # skip cls and sep
            if start + token_pos >= len(token_word_mapping):
                break

            word, word_pos = self.compute_word_pos(token_word_mapping, start, token_pos)
            self.update_result_dict(result_dict, logit_prob, word, word_pos)

        result_dict['token_word_mapping'] = token_word_mapping[start:end]

        return result_dict

    @staticmethod
    def compute_word_pos(token_word_mapping, start, pos):
        """Return the (word, word-level position) for a token offset."""
        word = token_word_mapping[start + pos]['word']
        pos = token_word_mapping[start + pos]['pos']

        return word, pos

    def update_result_dict(self, result_dict, logit_prob, word, pos):
        """Merge a token's label probabilities into the per-word results."""
        if len(result_dict['label_map']) > pos:
            # Another sub-word piece of an already-seen word.
            self.update_existing_result(result_dict, logit_prob, word, pos)
        else:
            self.append_new_result(result_dict, logit_prob, word)

    def update_existing_result(self, result_dict, logit_prob, word, pos):
        """Average the new piece's probabilities with the stored ones."""
        O = Counter(result_dict['label_prob_all'][-1][word])
        N = Counter(dict(zip(self.labels, logit_prob)))
        mean_prob = {k: v / 2 for k, v in (O + N).items()}
        result_dict['label_prob_all'][-1] = {word: mean_prob}
        result_dict['label_map'][-1] = {
            word: max(mean_prob, key=mean_prob.get)}

    def append_new_result(self, result_dict, logit_prob, word):
        """Record a newly-seen word with its argmax label and probabilities."""
        max_index = logit_prob.index(max(logit_prob))
        result_dict['label_map'].append({word: self.labels[max_index]})
        result_dict['label_prob_all'].append({word: dict(zip(self.labels, logit_prob))})

    def compute_loss_output(self, batch_data, reshaped_logits):
        """Focal loss between flattened logits and target label indices."""
        targets = batch_data["target"]
        target_tensor = torch.as_tensor(targets, dtype=torch.long)
        loss = self.loss_fct(reshaped_logits.view(-1, len(self.labels)), target_tensor.view(-1))

        return loss
116 |
--------------------------------------------------------------------------------
/tfkit/task/tag/preprocessor.py:
--------------------------------------------------------------------------------
1 | import tfkit.utility.tok as tok
2 | from tfkit.utility.data_filereader import get_tag_data_from_file
3 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
4 |
5 | get_data_from_file = get_tag_data_from_file
6 |
7 |
class Preprocessor(GeneralNLPPreprocessor):
    """Preprocessor for word-level sequence tagging data."""

    def read_file_to_data(self, path):
        """Load tagging examples from ``path``."""
        return get_tag_data_from_file(path)

    def preprocess(self, item, **param_dict):
        """Tokenize one example and align word-level labels to sub-word tokens.

        Returns a list of row dicts containing the input ids, the word/token
        position mappings, and (when a target is provided) one label per
        token including a duplicate for the prepended CLS token.
        """
        input_text, target = item['input'], item.get('target', None)
        separator = param_dict.get('separator', ' ')
        word_token_mapping = []
        token_word_mapping = []
        pos = 0

        for word_i, word in enumerate(input_text.split(separator)):
            tokenize_word = self.tokenizer.tokenize(word)
            for _ in range(len(tokenize_word)):
                if _ < 1:  # only record first token (one word one record)
                    word_token_mapping.append({'char': word, 'pos': pos, 'len': len(tokenize_word)})
                token_word_mapping.append({'tok': tokenize_word[_], 'word': word, 'pos': len(word_token_mapping) - 1})
                pos += 1

        t_input_list, t_pos_list = tok.handle_exceed(self.tokenizer, input_text, self.parameters['maxlen'] - 2,
                                                     mode=self.parameters.get('handle_exceed'),
                                                     keep_after_sep=False)
        preprocessed_data = []
        for t_input, t_pos in zip(t_input_list, t_pos_list):  # -1 for cls
            # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
            row_dict = dict()
            tokenized_input = [tok.tok_begin(self.tokenizer)] + t_input
            input_id = self.tokenizer.convert_tokens_to_ids(tokenized_input)

            if target is not None:
                # Expand each word-level label to cover all its sub-word tokens
                # that fall inside this slice.
                target_token = []
                for input_word, target_label in zip(word_token_mapping, target.split(separator)):
                    if t_pos[0] <= input_word['pos'] < t_pos[1]:
                        for _ in range(input_word['len']):
                            target_token += [target_label]

                # Duplicate the first label to cover the prepended CLS token.
                target_id = [target_token[0]] + target_token

                if len(input_id) != len(target_id):
                    # Misaligned example: dump diagnostics and skip it.
                    # (bug fix: the original printed ``input.split`` — the
                    # builtin — instead of ``input_text.split``, which raised
                    # AttributeError whenever this branch fired)
                    print(list(zip(input_text.split(separator), target.split(separator))))
                    print(self.tokenizer.decode(input_id))
                    print(input_id)
                    print(target_id)
                    print("input target len not equal ", len(input_id), len(target_id))
                    continue
                row_dict['target'] = target_id

            row_dict['input'] = input_id
            row_dict['word_token_mapping'] = word_token_mapping
            row_dict['token_word_mapping'] = token_word_mapping
            row_dict['end'] = len(input_id)
            row_dict['pos'] = t_pos
            preprocessed_data.append(row_dict)
        return preprocessed_data

    def postprocess(self, item, tokenizer, maxlen, **kwargs):
        """Pad ids/labels to ``maxlen`` and encode string labels as indices.

        (Removed leftover debug ``print`` calls; padding now consistently
        uses the ``maxlen`` parameter — the original mixed it with
        ``self.parameters['maxlen']``, which could leave ``input`` and
        ``mask`` with different lengths when the two values differed.)
        """
        labels = item['task_dict']
        mask_id = [1] * len(item['input'])
        mask_id.extend([0] * (maxlen - len(mask_id)))
        item['input'].extend([0] * (maxlen - len(item['input'])))
        row_dict = {
            'input': item['input'],
            'mask': mask_id,
            'pos': item['pos'],
        }
        if 'target' in item:
            target_id = [labels['tag'].index(i) for i in item['target']]
            # Prepend a label for the CLS position: "O" when the tag set has
            # one, otherwise repeat the first real label.
            if "O" in labels['tag']:
                target_id = [labels['tag'].index("O")] + target_id
            else:
                target_id = [target_id[0]] + target_id
            target_id.extend([0] * (maxlen - len(target_id)))
            row_dict['target'] = target_id

        return row_dict
87 |
--------------------------------------------------------------------------------
/tfkit/test/__init__.py:
--------------------------------------------------------------------------------
import os

# Repository root: two directory levels up from this test package.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../"))

# Bundled demo datasets used as training/eval fixtures across the test suite.
DATASET_DIR = os.path.join(ROOT_DIR, 'demo_data')
TAG_DATASET = os.path.join(DATASET_DIR, 'tag.csv')
CLAS_DATASET = os.path.join(DATASET_DIR, 'classification.csv')
GEN_DATASET = os.path.join(DATASET_DIR, 'generation.csv')
MASK_DATASET = os.path.join(DATASET_DIR, 'mask.csv')
MCQ_DATASET = os.path.join(DATASET_DIR, 'mcq.csv')
QA_DATASET = os.path.join(DATASET_DIR, 'qa.csv')
ADDTOK_DATASET = os.path.join(DATASET_DIR, 'unk_tok.csv')
NEWTOKEN_FILE = os.path.join(DATASET_DIR, 'tok_list.txt')

# Per-task output directories where the training tests save checkpoints.
MODEL_SAVE_DIR = os.path.join(ROOT_DIR, 'tfkit/test/cache/')
ADDTOKFREQ_SAVE_DIR = os.path.join(MODEL_SAVE_DIR, 'addtokfreq/')
ADDTOKFILE_SAVE_DIR = os.path.join(MODEL_SAVE_DIR, 'addtokfile/')
CLAS_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'clas/')
TAG_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'tag/')
TAGCRF_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'tagcrf/')
ONEBYONE_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'onebyone/')
CLM_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'clm/')
SEQ2SEQ_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'seq2seq/')
ONCE_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'once/')
ONCECTC_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'oncectc/')
MASK_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'mask/')
MCQ_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'mcq/')
QA_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'qa/')
MTTASK_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'mttask/')

# Checkpoint files produced by the training tests and consumed by the
# eval/dump tests (file name is the epoch number, e.g. ``2.pt``).
ONEBYONE_MODEL_PATH = os.path.join(ONEBYONE_MODEL_DIR, '2.pt')
ONCE_MODEL_PATH = os.path.join(ONCE_MODEL_DIR, '2.pt')
ONCECTC_MODEL_PATH = os.path.join(ONCECTC_MODEL_DIR, '1.pt')
SEQ2SEQ_MODEL_PATH = os.path.join(SEQ2SEQ_MODEL_DIR, '2.pt')
CLM_MODEL_PATH = os.path.join(CLM_MODEL_DIR, '2.pt')
CLAS_MODEL_PATH = os.path.join(CLAS_MODEL_DIR, '2.pt')
MASK_MODEL_PATH = os.path.join(MASK_MODEL_DIR, '2.pt')
MCQ_MODEL_PATH = os.path.join(MCQ_MODEL_DIR, '2.pt')
TAG_MODEL_PATH = os.path.join(TAG_MODEL_DIR, '2.pt')
QA_MODEL_PATH = os.path.join(QA_MODEL_DIR, '2.pt')
ADDTOKFREQ_MODEL_PATH = os.path.join(ADDTOKFREQ_SAVE_DIR, '2.pt')
ADDTOKFILE_MODEL_PATH = os.path.join(ADDTOKFILE_SAVE_DIR, '2.pt')
43 |
--------------------------------------------------------------------------------
/tfkit/test/task/test_task_model.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import tfkit
4 | from tfkit.utility.data_loader import pad_batch
5 | from torch import Tensor
6 | from transformers import BertTokenizer, AutoModel, AutoTokenizer
7 |
8 |
9 | class TestModel(unittest.TestCase):
10 |
11 |
12 | def testGenerationModel(self):
13 | input = "See you next time"
14 | maxlen = 32
15 | tokenizer = AutoTokenizer.from_pretrained('sshleifer/bart-tiny-random')
16 | pretrained = AutoModel.from_pretrained('sshleifer/bart-tiny-random')
17 | # tfkit.task.seq2seq, tfkit.task.once, tfkit.task.oncectc, tfkit.task.clm
18 | for gmodel in [tfkit.task.seq2seq, tfkit.task.once, tfkit.task.oncectc, tfkit.task.clm]:
19 | print(str(gmodel))
20 | model = gmodel.Model(tokenizer, pretrained, maxlen=maxlen)
21 | preprocessor = gmodel.Preprocessor(tokenizer, maxlen=maxlen, handle_exceed='start_slice', reserved_len=0)
22 | for preprocessed_item in preprocessor.preprocess(
23 | {'task': 'taskA', 'input': input}):
24 | print("preprocessed_item", preprocessed_item)
25 | feature = preprocessor.postprocess(preprocessed_item, tokenizer, maxlen=maxlen)
26 | feature = preprocessor.postprocess_batch(feature)
27 | print(model(feature, eval=True))
28 | self.assertTrue(isinstance(model(feature, eval=True), dict))
29 | model_dict = model(feature, eval=True)
30 | self.assertTrue('max_item' in model_dict)
31 |
32 | # greedy
33 | print("greedy")
34 | result, detail = model.predict(input=input)
35 | print(result, model_dict)
36 | self.assertTrue(len(result) == 1)
37 | self.assertTrue(isinstance(result, list))
38 | self.assertTrue(isinstance(detail, dict))
39 |
40 | # TopK
41 | result, detail = model.predict(input=input, decodenum=3, mode='topK', topK=3, filtersim=False)
42 | print("topK", result)
43 | self.assertTrue(len(result) == 3)
44 | self.assertTrue(isinstance(result, list))
45 | self.assertTrue(isinstance(detail, dict))
46 |
47 | # beamsearch
48 | result, detail = model.predict(input=input, decodenum=3)
49 | print("beamsaerch", len(result), result, model_dict)
50 | self.assertTrue(len(result) == 3)
51 | self.assertTrue(isinstance(result, list))
52 | self.assertTrue(isinstance(detail, dict))
53 |
54 | # TopP
55 | result, detail = model.predict(input=input, decodenum=3, mode='topP', topP=0.8)
56 | print("TopP", len(result), result, model_dict)
57 | self.assertTrue(len(result) == 3)
58 | self.assertTrue(isinstance(result, list))
59 | self.assertTrue(isinstance(detail, dict))
60 |
61 | # test exceed 512
62 | result, model_dict = model.predict(input="T " * 540)
63 | print("test exceed 512", len(result), result, model_dict)
64 | self.assertTrue(isinstance(result, list))
65 | self.assertTrue(isinstance(detail, dict))
66 | print("exceed max len", result)
67 |
68 | result, model_dict = model.predict(input="T " * 550, reserved_len=10)
69 | print(result)
70 | self.assertTrue(isinstance(result, list))
71 | self.assertTrue(isinstance(detail, dict))
72 | print("exceed max len with reserved len:", result)
73 |
74 | # def testClas(self):
75 | # input = "One hundred thirty-four patients suspected of having pancreas cancer successfully underwent gray scale ultrasound examination of the pancreas ."
76 | # target = "a"
77 | # tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
78 | # pretrained = AutoModel.from_pretrained('voidful/albert_chinese_tiny')
79 | # maxlen = 512
80 | # model = tfkit.task.clas.Model(tokenizer, pretrained, tasks_detail={"taskA": ["a", "b"]})
81 | # preprocessor = tfkit.task.clas.Preprocessor(tokenizer, maxlen=maxlen, handle_exceed='start_slice',
82 | # reserved_len=0)
83 | # for preprocessed_item in preprocessor.preprocess(
84 | # {'task': 'taskA', 'input': input, 'target': target, 'task_dict': {"taskA": ["a", "b"]}}):
85 | # feature = preprocessor.postprocess(preprocessed_item, tokenizer, maxlen=maxlen)
86 | # for k, v in feature.items():
87 | # feature[k] = [v, v]
88 | # print(feature)
89 | # # test train
90 | # print(model(feature))
91 | # self.assertTrue(isinstance(model(feature), Tensor))
92 | # # test eval
93 | # print(model(feature, eval=True))
94 | # model_dict = model(feature, eval=True)
95 | # print(model_dict)
96 | #
97 | # # test predict
98 | # tok_label = model.predict(task="taskA", input=input)
99 | # self.assertTrue(len(tok_label) == 2)
100 | # # test predict with top k 2
101 | # top_k_label, top_k_dict = model.predict(task="taskA", input=input, topK=2)
102 | # print("test predict with top k 2, ", top_k_label, top_k_dict)
103 | # self.assertTrue(len(top_k_label) == 2)
104 | #
105 | # # test exceed 512
106 | # for merge_strategy in ['entropy', 'count', 'prob']:
107 | # result, model_dict = model.predict(task="taskA", input=" ".join([str(i) for i in range(2000)]),
108 | # merge_strategy=merge_strategy, topK=2)
109 | # print(result, len(model_dict), model_dict)
110 | # self.assertTrue(isinstance(result, list))
111 | # self.assertTrue(len(result) == 2)
112 |
113 | # def testQA(self):
114 | # input = "梵 語 在 社 交 中 口 頭 使 用 , 並 且 在 早 期 古 典 梵 語 文 獻 的 發 展 中 維 持 口 頭 傳 統 。 在 印 度 , 書 寫 形 式 是 當 梵 語 發 展 成 俗 語 之 後 才 出 現 的 ; 在 書 寫 梵 語 的 時 候 , 書 寫 系 統 的 選 擇 受 抄 寫 者 所 處 地 域 的 影 響 。 同 樣 的 , 所 有 南 亞 的 主 要 書 寫 系 統 事 實 上 都 用 於 梵 語 文 稿 的 抄 寫 。 自 1 9 世 紀 晚 期 , 天 城 文 被 定 為 梵 語 的 標 準 書 寫 系 統 , 十 分 可 能 的 原 因 是 歐 洲 人 有 用 這 種 文 字 印 刷 梵 語 文 本 的 習 慣 。 最 早 的 已 知 梵 語 碑 刻 可 確 定 為 公 元 前 一 世 紀 。 它 們 採 用 了 最 初 用 於 俗 語 而 非 梵 語 的 婆 羅 米 文 。 第 一 個 書 寫 梵 語 的 證 據 , 出 現 在 晚 於 它 的 俗 語 的 書 寫 證 據 之 後 的 幾 個 世 紀 , 這 被 描 述 為 一 種 悖 論 。 在 梵 語 被 書 寫 下 來 的 時 候 , 它 首 先 用 於 行 政 、 文 學 或 科 學 類 的 文 本 。 宗 教 文 本 口 頭 傳 承 , 在 相 當 晚 的 時 候 才 「 不 情 願 」 地 被 書 寫 下 來 。 [Question] 最 初 梵 語 以 什 麼 書 寫 系 統 被 記 錄 下 來 ?"
115 | # target = [201, 205]
116 | # tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
117 | # pretrained = AutoModel.from_pretrained('voidful/albert_chinese_tiny')
118 | # model = tfkit.task.qa.Model(tokenizer, pretrained, maxlen=512)
119 | #
120 | # proc = tfkit.task.qa.Preprocessor(tokenizer, maxlen=512, handle_exceed='start_slice',
121 | # reserved_len=0)
122 | # for items in proc.preprocess({"input": input}):
123 | # raw_input = items['raw_input']
124 | # feature = proc.postprocess(items, tokenizer, 512)
125 | # for k, v in feature.items():
126 | # feature[k] = [v]
127 | #
128 | # # test train
129 | # print(model(feature))
130 | # self.assertTrue(isinstance(model(feature), Tensor))
131 | # # test eval
132 | # print(model(feature, eval=True))
133 | # model_dict = model(feature, eval=True)
134 | # self.assertTrue('label_prob_all' in model_dict)
135 | # self.assertTrue('label_map' in model_dict)
136 | #
137 | # # test predict
138 | # result, model_dict = model.predict(input=input)
139 | # print("model_dict", model_dict, input, result)
140 | # self.assertTrue('label_prob_all' in model_dict[0])
141 | # self.assertTrue('label_map' in model_dict[0])
142 | # self.assertTrue(len(result) == 1)
143 | #
144 | # # # test eval top k = 2
145 | # # top_k_label, top_k_dict = task.predict(input=input, topK=2)
146 | # # print("top_k_label", top_k_label)
147 | # # self.assertTrue(len(top_k_label) == 2)
148 | #
149 | # # test exceed 512
150 | # for merge_strategy in ['entropy', 'count', 'prob']:
151 | # result, model_dict = model.predict(input=" ".join([str(i) for i in range(550)]),
152 | # handle_exceed='start_slice',
153 | # merge_strategy=merge_strategy)
154 | # print(result, len(model_dict))
155 | # self.assertTrue(isinstance(result, list))
156 | #
157 | # def testTag(self):
158 | # tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_small')
159 | # pretrained = AutoModel.from_pretrained('voidful/albert_chinese_small')
160 | #
161 | # input = "在 歐 洲 , 梵 語 的 學 術 研 究 , 由 德 國 學 者 陸 特 和 漢 斯 雷 頓 開 創 。 後 來 威 廉 · 瓊 斯 發 現 印 歐 語 系 , 也 要 歸 功 於 對 梵 語 的 研 究 。 此 外 , 梵 語 研 究 , 也 對 西 方 文 字 學 及 歷 史 語 言 學 的 發 展 , 貢 獻 不 少 。 1 7 8 6 年 2 月 2 日 , 亞 洲 協 會 在 加 爾 各 答 舉 行 。 [SEP] 陸 特 和 漢 斯 雷 頓 開 創 了 哪 一 地 區 對 梵 語 的 學 術 研 究 FOB ?"
162 | # target = "O A A O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O"
163 | # label = ["O", "A"]
164 | #
165 | # model = tfkit.task.tag.Model(tokenizer=tokenizer, pretrained=pretrained, tasks_detail={"default": label})
166 | #
167 | # # test exceed 512
168 | # for merge_strategy in ['count']:
169 | # result, model_dict = model.predict(
170 | # input="""
171 | # Rundfadsfdsfsfning 明朝(1368年1月23日-1644年4月25日[註 1])是中國歷史上最後一個由漢族建立的大一統王朝,歷經十二世、十六位皇帝,國祚二百七十六年[參 4]。\n\n元朝末年政治腐敗,種族紛爭,天災不斷,民不聊生,民變暴動屢禁不止,平民朱元璋加入紅巾軍並在其中乘勢崛起,跟隨佔據濠州的郭子興。郭子興死後,朱元璋被當時反抗軍擁立的小明王韓林兒封為左副元帥,並率部眾先後攻占滁州、和州等地,並最終攻佔集慶(今江蘇南京),採取朱升所建議的「高築牆,廣積糧,緩稱王」的政策,以鞏固根據地,讓士兵屯田積糧減少百姓負擔,以示自己為仁義之師而避免受敵。1364年,朱元璋稱吳王,建立西吳政權。1368年,在掃滅陳友諒、張士誠和方國珍等群雄勢力後,朱元璋於當年農曆正月初四日登基稱帝,立國號為大明[參 5],定都應天府(今南京市),其轄區稱為京師,由因皇室姓朱,故又稱朱明,之後以「驅逐胡虜,恢復中華」[參 6]為號召北伐中原[參 7][參 8],並收回了燕雲十六州[參 9],結束蒙元在中國漢地的統治,統一天下。\n\n明初天下大定,經過朱元璋的休養生息,社會經濟得以恢復和發展,國力迅速恢復,史稱洪武之治。朱元璋去世後,其孫朱允炆即位,但其在靖難之役中敗於駐守燕京的朱元璋第四子朱棣,也自此失蹤。朱棣登基後遷都至順天府(今北京市),將北平布政司升為京師,原京師改稱南京[參 3]。成祖朱棣時期,開疆拓土,又派遣鄭和七下西洋,此後許多漢人遠赴海外,國勢達到頂峰,史稱永樂盛世。其後的仁宗和宣宗時期國家仍處於興盛時期,史稱仁宣之治[參 10]。英宗和代宗時期,遭遇土木之變,國力中衰,經于謙等人抗敵,最終解除國家危機。憲宗和孝宗相繼與民休息,孝宗則力行節儉,減免稅賦,百姓安居樂業,史稱弘治中興[參 11]。武宗時期爆發了南巡之爭和寧王之亂。世宗即位初,引發大禮議之爭,他清除宦官和權臣勢力後總攬朝綱,實現嘉靖中興,並於屯門海戰與西草灣之戰中擊退葡萄牙殖民侵略,任用胡宗憲和俞大猷等將領平定東南沿海的倭患。世宗駕崩後經過隆慶新政國力得到恢復,神宗前期任用張居正,推行萬曆新政,國家收入大增,商品經濟空前繁榮、科學巨匠迭出、社會風尚呈現出活潑開放的新鮮氣息,史稱萬曆中興[參 12]。後經過萬曆三大征平定內憂外患,粉碎豐臣秀吉攻占朝鮮進而入明的計劃,然而因為國本之爭,皇帝逐漸疏於朝政,史稱萬曆怠政,同時東林黨爭也帶來了明中期的政治混亂。\n\n萬曆一朝成為明朝由盛轉衰的轉折期[參 13]。光宗繼位不久因紅丸案暴斃,熹宗繼承大統改元天啟,天啟年間魏忠賢閹黨禍亂朝綱,至明思宗即位後剷除閹黨,但閹黨倒臺後,黨爭又起,政治腐敗以及連年天災[註 2][註 3],導致國力衰退,最終爆發大規模民變。1644年4月25日(舊曆三月十九),李自成所建立的大順軍攻破北京,思宗自縊於煤山,是為甲申之變。隨後吳三桂倒戈相向,滿族建立的滿清入主中原。明朝宗室於江南地區相繼成立南明諸政權,而原本反明的流寇在李自成等領袖死後亦加入南明陣營,這些政權被清朝統治者先後以「為君父報仇」為名各個殲滅,1662年,明朝宗室最後政權被剷除,永曆帝被俘後被殺,滿清又陸續擊敗各地反抗軍,以及攻取台灣、澎湖,1683年,奉大明為正朔的明鄭向清朝投降,漢族抗爭勢力方為清朝所消滅。[參 16]。\n\n明代的核心領土囊括漢地[註 4],東北到外興安嶺及黑龍江流域[參 19],後縮為遼河流域;初年北達戈壁沙漠一帶,後改為今長城;西北至新疆哈密,後改為嘉峪關;西南臨孟加拉灣[註 5],後折回約今雲南境;曾經在今中國東北、新疆東部及西藏等地設有羈縻機構[參 21]。不過,明朝是否實際統治了西藏國際上尚存在有一定的爭議[註 6]。明成祖時期曾短暫征服及統治安南[參 22],永樂二十二年(1424年),明朝國土面積達到極盛,在東南亞設置舊港宣慰司[註 7]等行政機構,加強對東南洋一帶的管理[參 23][參 24]。\n\n明代商品經濟繁榮,出現商業集鎮,而手工業及文化藝術呈現世俗化趨勢[參 25]。根據《明實錄》所載的人口峰值於成化十五年(1479年)達七千餘萬人[參 26],不過許多學者考慮到當時存在大量隱匿戶口,故認為明朝人口峰值實際上逾億[參 27],還有學者認為晚明人口峰值接近2億[註 8]。這一時期,其GDP總量所占的世界比例在中國古代史上也是最高的,1600年明朝GDP總量為960億美元,占世界經濟總量的29.2%,晚明中國人均GDP在600美元[註 
9]。\n\n明朝政治則是權力趨於集中,明太祖在誅殺胡惟庸後廢除傳統的丞相制,六部直接對皇帝負責,後來設置內閣;地方上由承宣布政使司、提刑按察使司、都指揮使司分掌權力,加強地方管理。仁宗、宣宗之後,文官治國的思想逐漸濃厚,行政權向內閣和六部轉移。同時還設有都察院等監察機構,為加強對全國臣民的監視,明太祖設立特務機構錦衣衛,明成祖設立東廠,明憲宗時再設西廠(後取消),明武宗又設內行廠(後取消),合稱「廠衛」。但明朝皇帝並非完全獨斷獨行,有許多事還必須經過經廷推、廷議、廷鞫程序,同時,能將原旨退還的給事中亦可對皇權形成制衡。[參 33]到了後期皇帝出現了怠政,宦官行使大權的陋習[參 3],儘管決策權始終集中在皇帝手中,然而政務大部分已經由內閣處理,此外,到了明代中晚期文官集團的集體意見足以與皇帝抗衡,在遇到事情決斷兩相僵持不下時,也容易產生一種類似於「憲政危機(英語:Constitutional crisis)」的情況,因此「名義上他是天子,實際上他受制於廷臣。」[參 34]但明朝皇權受制於廷臣主要是基於道德上而非法理上,因為明朝當時風氣普遍注重名節,受儒家教育的皇帝通常不願被冠以「昏君」之名。但雖然皇權受制衡,皇帝仍可任意動用皇權,例如明世宗「大禮議」事件最後以廷杖朝臣多人的方式結束[參 35],明神宗在國本之爭失利後也以長期拒絕參與政事向朝臣們示威[1][2][3]。\n\n有學者認為明代是繼漢唐之後的黃金時期,也被稱為最後一個可以和漢唐媲美的盛世[參 36]。清代張廷玉等修的官修《明史》評價明朝為「治隆唐宋」[註 10]、「遠邁漢唐」[參 37]。
172 | # """,
173 | # merge_strategy=merge_strategy, start_contain="B_",
174 | # end_contain="I_")
175 | # print(result)
176 | # self.assertTrue(isinstance(result, list))
177 |
178 | # proc = tfkit.task.tag.Preprocessor(tokenizer, maxlen=512, handle_exceed='start_slice',
179 | # reserved_len=0)
180 | # for items in proc.prepare_data({"input": input}):
181 | # raw_input = items['raw_input']
182 | # feature = proc.postprocess(items, tokenizer, 512)
183 | # for k, v in feature.items():
184 | # feature[k] = [v]
185 | # self.assertTrue(isinstance(model(feature), Tensor))
186 | # print(model(feature))
187 | # # test eval
188 | # model_dict = model(feature, eval=True)
189 | # self.assertTrue('label_prob_all' in model_dict)
190 | # self.assertTrue('label_map' in model_dict)
191 | # self.assertEqual(len(model_dict['label_map']), len(input.split(" ")))
192 | #
193 | # # test predict
194 | # result, model_dict = model.predict(input=input, start_contain="A", end_contain="A")
195 | # self.assertTrue('label_prob_all' in model_dict[0])
196 | # self.assertTrue('label_map' in model_dict[0])
197 | # print("result", result, len(result))
198 | # self.assertTrue(isinstance(result, list))
199 | #
200 | # # test exceed 512
201 | # for merge_strategy in ['minentropy', 'maxcount', 'maxprob']:
202 | # result, model_dict = model.predict(input=" ".join([str(i) for i in range(1000)]),
203 | # merge_strategy=merge_strategy, start_contain="A", end_contain="A")
204 | # print(result)
205 | # self.assertTrue(isinstance(result, list))
206 |
--------------------------------------------------------------------------------
/tfkit/test/test_atrain.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pytest
4 | from transformers import BertTokenizer, AutoModel
5 |
6 | import tfkit
7 | from tfkit.test import *
8 | from tfkit.utility.model import load_model_class
9 |
10 |
class TestTrain(unittest.TestCase):
    """End-to-end training tests covering both tfkit.train.main() and the tfkit-train CLI."""

    def testHelp(self):
        """`tfkit-train -h` should exit with status 0."""
        result = os.system('tfkit-train -h')
        assert (result == 0)

    def test_parser(self):
        """parse_train_args splits generic input args from model-specific args."""
        input_arg, model_arg = tfkit.train.parse_train_args(
            ['--task', 'once', '--train', 'train.csv', '--test', 'test.csv', '--config',
             'voidful/albert_chinese_tiny'])
        print(input_arg, model_arg)
        self.assertTrue(input_arg.get('task') == ['once'])
        self.assertTrue(isinstance(input_arg.get('train'), list))

        # Model-specific flags (e.g. --likelihood) must land in model_arg.
        input_arg, model_arg = tfkit.train.parse_train_args(
            ['--task', 'once', '--train', 'train.csv', '--test', 'test.csv', '--config',
             'voidful/albert_chinese_tiny', '--likelihood', 'pos'])
        print(input_arg, model_arg)
        self.assertTrue(model_arg.get('likelihood') == 'pos')
        self.assertTrue(isinstance(input_arg.get('train'), list))

    def test_optimizer(self):
        """optimizer() returns a usable (optimizer, scheduler) pair for a model."""
        model_class = load_model_class('clas')
        tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
        pretrained = AutoModel.from_pretrained('voidful/albert_chinese_tiny')
        model = model_class.Model(tokenizer=tokenizer, pretrained=pretrained, tasks_detail={"taskA": ["a", "b"]},
                                  maxlen=128)
        optim, scheduler = tfkit.train.optimizer(model, lr=0.1, total_step=10)
        print(optim, scheduler)
        optim.zero_grad()
        scheduler.step()

    def testMultiTask(self):
        """Training two tasks (once + clm) on two datasets in one run."""
        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', MTTASK_MODEL_DIR, '--train', CLAS_DATASET, GEN_DATASET,
             '--lr', '5e-5', '--test', CLAS_DATASET, GEN_DATASET, '--task', 'once', 'clm', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '50'])
        result = os.system(
            'tfkit-train --batch 2 --epoch 2 --savedir ' + MTTASK_MODEL_DIR + ' --train ' + CLAS_DATASET + ' ' + GEN_DATASET + ' --lr 5e-5 --test ' + CLAS_DATASET + ' ' + GEN_DATASET + ' --task once clm --config voidful/albert_chinese_tiny --maxlen 50')
        self.assertTrue(result == 0)

    def testGenOnce(self):
        """Train a one-shot generation model via API and CLI."""
        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', ONCE_MODEL_DIR, '--train',
             GEN_DATASET, '--lr', '5e-5', '--test', GEN_DATASET, '--task', 'once', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '50'])
        result = os.system(
            'tfkit-train --batch 2 --epoch 2 --savedir ' + ONCE_MODEL_DIR + ' --train ' + GEN_DATASET + ' --test ' + GEN_DATASET + ' --task once --config voidful/albert_chinese_tiny --maxlen 50')
        self.assertTrue(result == 0)

    def testGenOnceCTC(self):
        """Train a one-shot CTC generation model via API and CLI."""
        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', ONCECTC_MODEL_DIR, '--train',
             GEN_DATASET, '--lr', '3e-4', '--test', GEN_DATASET, '--task', 'oncectc', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '50'])
        # Fixed: the CLI run previously saved into ONCE_MODEL_DIR (copy-paste from
        # testGenOnce), clobbering the plain `once` model; use ONCECTC_MODEL_DIR.
        result = os.system(
            'tfkit-train --batch 2 --epoch 2 --savedir ' + ONCECTC_MODEL_DIR + ' --train ' + GEN_DATASET + ' --test ' + GEN_DATASET + ' --task oncectc --config voidful/albert_chinese_tiny --maxlen 50')
        self.assertTrue(result == 0)

    def testGenSeq2Seq(self):
        """Train seq2seq models, with and without positive-likelihood loss."""
        # result = os.system(
        #     'tfkit-train --batch 2 --epoch 1 --savedir ' + SEQ2SEQ_MODEL_DIR + ' --train ' + GEN_DATASET + ' --test ' + GEN_DATASET + ' --task seq2seq --config prajjwal1/bert-small --maxlen 50 --selfkd True')
        # self.assertTrue(result == 0)
        tfkit.train.main(
            ['--batch', '1', '--epoch', '1', '--savedir', SEQ2SEQ_MODEL_DIR, '--train',
             GEN_DATASET, '--lr', '5e-4', '--test', GEN_DATASET, '--task', 'seq2seq', '--config',
             'prajjwal1/bert-small', '--maxlen', '20'])
        tfkit.train.main(
            ['--batch', '2', '--epoch', '2', '--savedir', SEQ2SEQ_MODEL_DIR, '--train',
             GEN_DATASET, '--lr', '5e-4', '--test', GEN_DATASET, '--task', 'seq2seq', '--config',
             'prajjwal1/bert-small', '--maxlen', '20', '--likelihood', 'pos'])

    def testGenCLM(self):
        """Train a causal-LM model via CLI and API."""
        result = os.system(
            'tfkit-train --batch 2 --epoch 1 --savedir ' + CLM_MODEL_DIR + ' --train ' + GEN_DATASET + ' --test ' + GEN_DATASET + ' --task clm --config prajjwal1/bert-small --maxlen 50')
        self.assertTrue(result == 0)
        tfkit.train.main(
            ['--batch', '2', '--epoch', '2', '--savedir', CLM_MODEL_DIR, '--train',
             GEN_DATASET, '--lr', '5e-4', '--test', GEN_DATASET, '--task', 'clm', '--config',
             'prajjwal1/bert-small', '--maxlen', '20'])

    def testAddTokenFile(self):
        """Training with --add_tokens_file should extend the vocabulary and still succeed."""
        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', ADDTOKFILE_SAVE_DIR, '--train',
             GEN_DATASET, '--lr', '5e-5', '--test', ADDTOK_DATASET, '--task', 'clm', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '100', '--add_tokens_file', NEWTOKEN_FILE])
        result = os.system(
            f'tfkit-train --batch 2 --add_tokens_file {NEWTOKEN_FILE} --savedir {ADDTOKFILE_SAVE_DIR} --epoch 2 --train {ADDTOK_DATASET} --test {ADDTOK_DATASET} --task clm --config voidful/albert_chinese_tiny --maxlen 50')
        self.assertTrue(result == 0)

    def testResume(self):
        """A checkpoint written by one run can be resumed by a second run."""
        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', ONCE_MODEL_DIR, '--train',
             GEN_DATASET, '--lr', '5e-5', '--test', GEN_DATASET, '--task', 'once', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '50', '--tag', 'testresume'])

        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', ONCE_MODEL_DIR, '--train',
             GEN_DATASET, '--lr', '5e-5', '--test', GEN_DATASET, '--task', 'once', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '50', '--resume', os.path.join(ONCE_MODEL_DIR, "1.pt")])

    def testResumeMultiModel(self):
        """A multi-task checkpoint can be resumed for all tasks or a single tagged task."""
        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', MTTASK_MODEL_DIR, '--train', CLAS_DATASET, GEN_DATASET,
             '--lr', '5e-5', '--test', CLAS_DATASET, GEN_DATASET, '--task', 'once', 'clm', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '50', '--tag', 'once', 'clm'])
        # resume to train all task
        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', MTTASK_MODEL_DIR, '--train', CLAS_DATASET, GEN_DATASET,
             '--lr', '5e-5', '--test', CLAS_DATASET, GEN_DATASET, '--task', 'once', 'clm', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '50', '--tag', 'once', 'clm', '--resume',
             os.path.join(MTTASK_MODEL_DIR, "1.pt")])
        # resume to train only one task
        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', MTTASK_MODEL_DIR, '--train',
             GEN_DATASET, '--lr', '5e-5', '--test', GEN_DATASET, '--task', 'clm', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '50', '--resume', os.path.join(MTTASK_MODEL_DIR, "1.pt"),
             '--tag', 'clm'])

    @pytest.mark.skip()
    def testLoggerwandb(self):
        """Skipped: requires a wandb account/network access."""
        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', ONCE_MODEL_DIR, '--train',
             GEN_DATASET, '--lr', '5e-5', '--test', GEN_DATASET, '--task', 'once', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '50', '--wandb'])

    def testClas(self):
        """Train a classification model via API and CLI."""
        tfkit.train.main(
            ['--batch', '2', '--epoch', '1', '--savedir', CLAS_MODEL_DIR, '--train',
             CLAS_DATASET, '--lr', '5e-5', '--test', CLAS_DATASET, '--task', 'clas', '--config',
             'voidful/albert_chinese_tiny', '--maxlen', '50'])
        result = os.system(
            'tfkit-train --batch 2 --epoch 2 --savedir ' + CLAS_MODEL_DIR + ' --train ' + CLAS_DATASET + ' --test ' + CLAS_DATASET + ' --task clas --config voidful/albert_chinese_tiny --maxlen 50')
        self.assertTrue(result == 0)

    # def testQA(self):
    #     tfkit.train.main(
    #         ['--batch', '2', '--epoch', '1', '--savedir', QA_MODEL_DIR, '--train',
    #          QA_DATASET, '--lr', '5e-5', '--test', QA_DATASET, '--task', 'qa', '--config',
    #          'voidful/albert_chinese_tiny', '--maxlen', '512', '--handle_exceed', 'start_slice'])
    #     result = os.system(
    #         'tfkit-train --batch 2 --epoch 2 --savedir ' + QA_MODEL_DIR + ' --train ' + QA_DATASET + ' --test ' + QA_DATASET + ' --task qa --config voidful/albert_chinese_tiny --maxlen 512 --handle_exceed start_slice')
    #     self.assertTrue(result == 0)
    #
    # def testTag(self):
    #     tfkit.train.main(
    #         ['--batch', '2', '--epoch', '1', '--savedir', TAG_MODEL_DIR, '--train',
    #          TAG_DATASET, '--lr', '5e-5', '--test', TAG_DATASET, '--task', 'tag', '--config',
    #          'voidful/albert_chinese_tiny', '--maxlen', '512', '--handle_exceed', 'slide'])
    #     result = os.system(
    #         'tfkit-train --batch 2 --epoch 2 --savedir ' + TAG_MODEL_DIR + ' --train ' + TAG_DATASET + ' --test ' + TAG_DATASET + ' --task tag --config voidful/albert_chinese_tiny --maxlen 50 --handle_exceed slide')
    #     self.assertTrue(result == 0)
163 |
--------------------------------------------------------------------------------
/tfkit/test/test_package.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from transformers import AutoTokenizer
4 |
5 | import tfkit
6 | import os
7 |
class TestPackage(unittest.TestCase):
    """Smoke tests: the tfkit package and its sub-packages import cleanly."""

    def testImport(self):
        """Touching sub-modules must not raise."""
        package_dir = os.path.dirname(tfkit.__file__)
        print(package_dir)
        tfkit.task
        tfkit.utility
15 |
--------------------------------------------------------------------------------
/tfkit/test/test_zeval.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import tfkit
4 | from tfkit.test import *
5 |
6 |
class TestEval(unittest.TestCase):
    """End-to-end tests for tfkit.eval.main() and the tfkit-eval CLI."""

    def testHelp(self):
        """`tfkit-eval -h` exits with status 0."""
        exit_code = os.system('tfkit-eval -h')
        self.assertTrue(exit_code == 0)

    def test_parser(self):
        """parse_eval_args splits eval options from model-specific options."""
        parser, _ = tfkit.eval.parse_eval_args(
            ['--model', 'once', '--metric', 'emf1', '--valid', 'test.csv', '--print'])
        print(parser)
        self.assertTrue(parser.get('model') == ['once'])

        eval_parser, model_parser = tfkit.eval.parse_eval_args(
            ['--model', 'once', '--metric', 'emf1', '--valid', 'test.csv', '--print', '--decodenum', '2'])
        self.assertTrue(eval_parser.get('model') == ['once'])
        self.assertTrue(model_parser.get('decodenum') == '2')

    def testEvalGen(self):
        """Evaluate a generation model with EM/F1 via API and CLI."""
        tfkit.eval.main(
            ['--model', ONCE_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
        exit_code = os.system(
            f'tfkit-eval --model {ONCE_MODEL_PATH} --valid {GEN_DATASET} --metric emf1 --print')
        self.assertTrue(exit_code == 0)

    def testEvalGenOnce(self):
        """Evaluate the `once` model with EM/F1 via API and CLI."""
        tfkit.eval.main(
            ['--model', ONCE_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
        exit_code = os.system(
            f'tfkit-eval --model {ONCE_MODEL_PATH} --valid {GEN_DATASET} --metric emf1 --print')
        self.assertTrue(exit_code == 0)

    def testEvalGenOnceCTC(self):
        """Evaluate the `oncectc` model with EM/F1 via API and CLI."""
        tfkit.eval.main(
            ['--model', ONCECTC_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
        exit_code = os.system(
            f'tfkit-eval --model {ONCECTC_MODEL_PATH} --valid {GEN_DATASET} --metric emf1 --print')
        self.assertTrue(exit_code == 0)

    def testEvalSeq2Seq(self):
        """Evaluate seq2seq, including multi-candidate decoding (--decodenum)."""
        tfkit.eval.main(
            ['--model', SEQ2SEQ_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print',
             '--decodenum', '2'])
        tfkit.eval.main(
            ['--model', SEQ2SEQ_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
        exit_code = os.system(
            f'tfkit-eval --model {SEQ2SEQ_MODEL_PATH} --valid {GEN_DATASET} --metric emf1 --print')
        self.assertTrue(exit_code == 0)

    def testEvalCLM(self):
        """Evaluate the causal-LM model with EM/F1 via API and CLI."""
        tfkit.eval.main(
            ['--model', CLM_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
        exit_code = os.system(
            f'tfkit-eval --model {CLM_MODEL_PATH} --valid {GEN_DATASET} --metric emf1 --print')
        self.assertTrue(exit_code == 0)

    def testEvalAddedTokenModel(self):
        """A model trained with extra tokens evaluates cleanly from the CLI."""
        exit_code = os.system(
            f'tfkit-eval --model {ADDTOKFILE_MODEL_PATH} --valid {ADDTOK_DATASET} --metric emf1 --print')
        self.assertTrue(exit_code == 0)

    def testEvalClassify(self):
        """Evaluate the classification model with the `clas` metric."""
        tfkit.eval.main(
            ['--model', CLAS_MODEL_PATH, '--valid', CLAS_DATASET, '--metric', 'clas', '--print'])
        exit_code = os.system(
            f'tfkit-eval --model {CLAS_MODEL_PATH} --valid {CLAS_DATASET} --metric clas --print')
        self.assertTrue(exit_code == 0)

    # def testEvalQA(self):
    #     tfkit.eval.main(
    #         ['--model', QA_MODEL_PATH, '--valid', QA_DATASET, '--metric', 'emf1', '--print'])
    #     result = os.system(
    #         'tfkit-eval --model ' + QA_MODEL_PATH + ' --valid ' + QA_DATASET + ' --metric emf1 --print')
    #     self.assertTrue(result == 0)
    #
    # def testEvalTag(self):
    #     tfkit.eval.main(
    #         ['--model', TAG_MODEL_PATH, '--valid', TAG_DATASET, '--metric', 'clas', '--print'])
    #     result = os.system(
    #         'tfkit-eval --model ' + TAG_MODEL_PATH + ' --valid ' + TAG_DATASET + ' --metric clas --print')
    #     self.assertTrue(result == 0)
--------------------------------------------------------------------------------
/tfkit/test/test_zzdump.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from tfkit.test import *
3 | import os
4 |
5 | import tfkit
6 |
7 |
class TestEval(unittest.TestCase):
    """Tests for tfkit-dump (model export).

    NOTE(review): the class name looks copy-pasted from test_zeval.py;
    `TestDump` would be clearer, but renaming would change test discovery.
    """
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../"))
    MODEL_SAVE_PATH = os.path.join(ROOT_DIR, 'tfkit/test/cache/')

    def testHelp(self):
        """`tfkit-dump -h` exits with status 0."""
        exit_code = os.system('tfkit-dump -h')
        assert (exit_code == 0)

    def test_parser(self):
        """parse_dump_args maps CLI flags to a dict."""
        parsed = tfkit.dump.parse_dump_args(['--model', 'a', '--dumpdir', 'b'])
        self.assertTrue(parsed.get('model') == 'a')
        self.assertTrue(parsed.get('dumpdir') == 'b')

    def testDump(self):
        """Dumping a trained CLM model works via API and CLI."""
        dump_dir = './cache/dump'
        tfkit.dump.main(["--model", CLM_MODEL_PATH, '--dumpdir', dump_dir])
        exit_code = os.system(
            'tfkit-dump --model ' + CLM_MODEL_PATH + ' --dumpdir ' + dump_dir)
        self.assertTrue(exit_code == 0)
27 |
--------------------------------------------------------------------------------
/tfkit/test/utility/test_utility_data_filereader.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from tfkit.test import *
4 | from tfkit.utility.data_filereader import *
5 |
6 |
class TestDataFile(unittest.TestCase):
    """Each file reader yields data chunks and returns its task→label dict via StopIteration.value."""

    def test_get_x_data_from_file(self):
        readers = [get_gen_data_from_file(GEN_DATASET),
                   get_qa_data_from_file(QA_DATASET),
                   get_tag_data_from_file(TAG_DATASET),
                   get_clas_data_from_file(CLAS_DATASET),
                   get_multiclas_data_from_file(CLAS_DATASET)]
        for data_iter in readers:
            # Drain the generator; its return value arrives on StopIteration.
            while True:
                try:
                    chunk = next(data_iter)
                except StopIteration as stop:
                    task_label_dict = stop.value
                    break
                print(chunk)
            print(task_label_dict)
            for task_name, labels in task_label_dict.items():
                print(task_name, labels)
                self.assertTrue(isinstance(labels, list))
25 |
--------------------------------------------------------------------------------
/tfkit/test/utility/test_utility_data_loader.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import torch
4 |
5 | from tfkit.utility.data_loader import pad_batch
6 |
7 |
class TestUtilityDataLoader(unittest.TestCase):
    """pad_batch should right-pad every tensor in a batch to a common length with zeros."""

    def test_batch_reduce_pad(self):
        batch = [{'input': torch.tensor([1, 2, 3])},
                 {'input': torch.tensor([3, 4])},
                 {'input': torch.tensor([5])}]
        padded = pad_batch(batch)
        # All entries share the same (padded) length afterwards.
        self.assertEqual(len(padded[0]['input']), len(padded[1]['input']))
        print(padded)
        self.assertCountEqual(padded[0]['input'], [1, 2, 3])
        self.assertCountEqual(padded[1]['input'], [3, 4, 0])
19 |
--------------------------------------------------------------------------------
/tfkit/test/utility/test_utility_data_processor.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from tfkit.test import *
4 | from tfkit.utility.data_filereader import *
5 |
6 |
class TestDataPreprocess(unittest.TestCase):
    """Readers yield chunks and return label dicts.

    NOTE(review): this duplicates TestDataFile in
    test_utility_data_filereader.py — consider consolidating.
    """

    def test_get_x_data_from_file(self):
        readers = [get_gen_data_from_file(GEN_DATASET),
                   get_qa_data_from_file(QA_DATASET),
                   get_tag_data_from_file(TAG_DATASET),
                   get_clas_data_from_file(CLAS_DATASET),
                   get_multiclas_data_from_file(CLAS_DATASET)]
        for data_iter in readers:
            # Drain the generator; its return value arrives on StopIteration.
            while True:
                try:
                    chunk = next(data_iter)
                except StopIteration as stop:
                    task_label_dict = stop.value
                    break
                print(chunk)
            print(task_label_dict)
            for task_name, labels in task_label_dict.items():
                print(task_name, labels)
                self.assertTrue(isinstance(labels, list))
25 |
--------------------------------------------------------------------------------
/tfkit/test/utility/test_utility_logger.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import sys
3 | import os
4 |
5 | from tfkit.utility.logger import Logger
6 |
7 | dir_path = os.path.dirname(os.path.realpath(__file__))
8 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
9 |
10 | import unittest
11 | import tfkit
12 |
13 |
class TestLogger(unittest.TestCase):
    """Tests for tfkit's file-backed Logger."""
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../"))
    MODEL_SAVE_PATH = os.path.join(ROOT_DIR, './test/cache/')

    def test_write_log(self):
        """write_log appends the message as the log file's last line."""
        logger = Logger(savedir=self.MODEL_SAVE_PATH)
        logger.write_log("test")
        with open(logger.logfilepath, 'r') as log_file:
            last_line = log_file.read().splitlines()[-1]
        print(last_line)
        self.assertEqual(last_line, "test")

    def test_write_metric(self):
        """write_metric appends a CSV row of stringified values."""
        logger = Logger(savedir=self.MODEL_SAVE_PATH)
        logger.write_metric("test", 1, 0)
        with open(logger.metricfilepath, 'r') as metric_file:
            last_row = list(csv.reader(metric_file))[-1]
        self.assertEqual(last_row, ["test", '1', '0'])
33 |
--------------------------------------------------------------------------------
/tfkit/test/utility/test_utility_loss.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import torch
5 | from torch import nn
6 | from torch.autograd import Variable
7 |
8 | dir_path = os.path.dirname(os.path.realpath(__file__))
9 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
10 |
11 | import unittest
12 | import tfkit
13 |
14 |
class TestLoss(unittest.TestCase):
    """Compare tfkit's custom loss functions against torch's built-in criteria."""

    # Shared fixtures: logits for two samples over three classes.
    # Target index -1 marks a position the loss should ignore.
    outputs = Variable(torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]]), requires_grad=False)
    targets = Variable(torch.Tensor([1, 1]).long(), requires_grad=False)
    alln_targets = Variable(torch.Tensor([-1, -1]).long(), requires_grad=False)
    onen_targets = Variable(torch.Tensor([1, -1]).long(), requires_grad=False)

    def testLabelSmoothingCrossEntropy(self):
        """Label smoothing always yields a larger loss than plain cross-entropy."""
        logits = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]])
        labels = torch.Tensor([1, 1]).long()
        onen_labels = torch.Tensor([1, -1]).long()

        baseline = nn.CrossEntropyLoss(ignore_index=-1)
        smoothed = tfkit.utility.loss.LabelSmoothingLoss(3, ignore_index=-1)

        self.assertTrue(baseline(logits, labels).item() <
                        smoothed(logits, labels).item())
        self.assertTrue(baseline(logits, onen_labels).item() <
                        smoothed(logits, onen_labels).item())

        baseline = nn.CrossEntropyLoss()
        smoothed = tfkit.utility.loss.LabelSmoothingLoss(3)
        self.assertTrue(baseline(logits, labels).item() <
                        smoothed(logits, labels).item())

        # reduction='none' keeps one loss value per sample.
        smoothed = tfkit.utility.loss.LabelSmoothingLoss(3, reduction='none')
        print(smoothed(self.outputs, self.targets))
        self.assertTrue(list(smoothed(self.outputs, self.targets).shape) == [2])

    def testDiceLoss(self):
        """Dice loss stays below 1 here; fully-ignored targets push it toward 1."""
        dice = tfkit.utility.loss.DiceLoss(ignore_index=-1)
        self.assertTrue(0.8 < dice(self.outputs, self.targets).item() < 1)
        self.assertTrue(0.99 < dice(self.outputs, self.alln_targets).item() <= 1)
        self.assertTrue(0.8 < dice(self.outputs, self.onen_targets).item() < 1)

        dice = tfkit.utility.loss.DiceLoss(reduction='none')
        print(dice(self.outputs, self.targets))
        self.assertTrue(list(dice(self.outputs, self.targets).shape) == [2])

    def testLossDrop(self):
        """Illustrates per-token loss aggregation (prints only).

        NOTE(review): this test has no assertions — consider asserting the
        two means agree.
        """
        logits = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]])
        labels = torch.Tensor([1, 1]).long()
        mean_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
        per_token_loss_fct = nn.CrossEntropyLoss(reduction='none', ignore_index=-1)  # -1 index = padding token
        token_loss = per_token_loss_fct(logits, labels)
        token_loss = token_loss.view(-1, len(labels))  # view by batch size
        token_loss = token_loss.sum(dim=0)
        token_loss = token_loss.mean()
        print(token_loss.mean(), mean_loss_fct(logits, labels).mean())

    def testBCEFocalLoss(self):
        """Focal weighting reduces the loss relative to plain BCE on these inputs."""
        probs = torch.Tensor([[0, 1, 0], [0.2, 0, 0]])
        labels = torch.Tensor([[0, 1, 0], [1, 0, 0]])
        baseline = nn.BCELoss()
        focal = tfkit.utility.loss.BCEFocalLoss()
        self.assertTrue(baseline(probs, labels).item() >
                        focal(probs, labels).item())

    def testNegativeCElLoss(self):
        """Negative CE loss exceeds standard CE on the same inputs."""
        logits = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]])
        labels = torch.Tensor([1, 1]).long()
        onen_labels = torch.Tensor([1, -1]).long()

        baseline = nn.CrossEntropyLoss(ignore_index=-1)
        negative = tfkit.utility.loss.NegativeCElLoss()
        self.assertTrue(
            baseline(logits, labels).item() < negative(logits, self.targets).item())
        self.assertTrue(baseline(logits, onen_labels).item() < negative(logits, onen_labels).item())

    def testFocalLoss(self):
        """With gamma=0 focal loss matches CE; gamma=1 shrinks it."""
        baseline = nn.CrossEntropyLoss(ignore_index=-1)
        focal = tfkit.utility.loss.FocalLoss(gamma=0)
        for target_batch in (self.targets, self.alln_targets, self.onen_targets):
            self.assertAlmostEqual(baseline(self.outputs, target_batch).item(),
                                   focal(self.outputs, target_batch).item())

        focal = tfkit.utility.loss.FocalLoss(gamma=1)
        self.assertTrue(baseline(self.outputs, self.targets) > focal(self.outputs, self.targets))
        self.assertTrue(baseline(self.outputs, self.alln_targets).item() - focal(self.outputs,
                                                                                 self.alln_targets).item() < 1)
        self.assertTrue(baseline(self.outputs, self.onen_targets) > focal(self.outputs, self.onen_targets))
100 |
--------------------------------------------------------------------------------
/tfkit/test/utility/test_utility_model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from tfkit.utility.model import list_all_model, load_model_class, load_predict_parameter, load_trained_model
5 |
6 | dir_path = os.path.dirname(os.path.realpath(__file__))
7 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
8 |
9 | import unittest
10 | from transformers import BertTokenizer, AutoModel
11 |
12 |
class TestModelLoader(unittest.TestCase):
    """Tests for model listing/loading helpers in tfkit.utility.model."""
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../../"))
    MODEL_SAVE_PATH = os.path.join(ROOT_DIR, 'tfkit/test/cache/')

    def test_list_all_model(self):
        """list_all_model returns the available tasks as a list."""
        available = list_all_model()
        self.assertTrue(isinstance(available, list))

    def test_load_model_class(self):
        """Known task names resolve to their model modules without raising."""
        for task_name in ('clas', 'once'):
            load_model_class(task_name)

    def test_load_predict_parameter(self):
        """load_predict_parameter exposes the predict() signature defaults."""
        model_class = load_model_class('clas')
        # Build a tiny classification model on top of a pre-trained encoder.
        tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
        pretrained = AutoModel.from_pretrained('voidful/albert_chinese_tiny')
        model = model_class.Model(tokenizer=tokenizer, pretrained=pretrained, tasks_detail={"taskA": ["a", "b"]},
                                  maxlen=128)
        clas_param = load_predict_parameter(model)
        print("clas_param", clas_param)
        for expected_key in ('input', 'topK', 'task', 'handle_exceed'):
            self.assertTrue(expected_key in clas_param)
        self.assertTrue(isinstance(clas_param['handle_exceed'], str))

    # def test_load_trained_model(self):
    #     model_path = os.path.join(self.MODEL_SAVE_PATH, '1.pt')
    #     model, model_type, model_class, model_info, preprocessor = load_trained_model(model_path)
    #     print(model)
    #     print(model_type)
    #     print(model_class)
    #     print(model_info)
    #     print(model.predict)
    #     print(model.predict(input="a"))
49 |
--------------------------------------------------------------------------------
/tfkit/test/utility/test_utility_tok.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 |
4 | dir_path = os.path.dirname(os.path.realpath(__file__))
5 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
6 |
7 | import unittest
8 | import tfkit
9 | from transformers import AutoTokenizer, BertTokenizer
10 |
11 |
class TestTok(unittest.TestCase):
    """Tests for tokenizer helper utilities in tfkit.utility.tok."""
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../../"))
    DATASET_DIR = os.path.join(ROOT_DIR, 'demo_data')

    def testTokBert(self):
        """Special-token helpers on a BERT-style tokenizer.

        Renamed from ``testTok``: two methods shared that name, so Python kept
        only the later definition and this test was silently never executed.
        """
        tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
        begin = tfkit.utility.tok.tok_begin(tokenizer)
        self.assertEqual(begin, "[CLS]")
        sep = tfkit.utility.tok.tok_sep(tokenizer)
        self.assertEqual(sep, "[SEP]")
        mask = tfkit.utility.tok.tok_mask(tokenizer)
        self.assertEqual(mask, "[MASK]")
        pad = tfkit.utility.tok.tok_pad(tokenizer)
        self.assertEqual(pad, "[PAD]")

    def testTok(self):
        """Special-token helpers on a RoBERTa-style tokenizer."""
        tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
        begin = tfkit.utility.tok.tok_begin(tokenizer)
        self.assertEqual(begin, "")
        sep = tfkit.utility.tok.tok_sep(tokenizer)
        self.assertEqual(sep, "")
        mask = tfkit.utility.tok.tok_mask(tokenizer)
        self.assertEqual(mask, "")
        pad = tfkit.utility.tok.tok_pad(tokenizer)
        self.assertEqual(pad, "")

    def testGetXUnkToken(self):
        """Unknown-token extraction: empty file list yields nothing; real data yields tokens."""
        tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
        result = tfkit.utility.tok.get_topP_unk_token(tokenizer, file_paths=[], topP=0.5)
        self.assertFalse(result)
        result = tfkit.utility.tok.get_freqK_unk_token(tokenizer, file_paths=[], freqK=10)
        self.assertFalse(result)
        result = tfkit.utility.tok.get_freqK_unk_token(tokenizer, file_paths=[self.DATASET_DIR + '/unk_tok.csv'],
                                                       freqK=1)
        self.assertTrue(len(result) > 0)
        result = tfkit.utility.tok.get_topP_unk_token(tokenizer, file_paths=[self.DATASET_DIR + '/unk_tok.csv'],
                                                      topP=0.9)
        self.assertTrue(len(result) > 0)

    def testHandleExceed(self):
        """handle_exceed trims/splits over-length input according to mode."""
        tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
        seq = " ".join([str(_) for _ in range(100)])
        maxlen = 50
        for mode in ['noop', 'remove', 'slide', 'start_slice', 'end_slice']:
            rlt, _ = tfkit.utility.tok.handle_exceed(tokenizer, seq, maxlen, mode=mode)
            if mode == 'remove':
                # Over-length input is dropped entirely.
                self.assertTrue(len(rlt) == 0)
            if mode == 'slide':
                # Sliding window produces multiple segments.
                self.assertTrue(len(rlt) > 1)
            for i in rlt:
                print(i)
                if mode != 'noop':
                    self.assertTrue(len(i) == 50)
65 |
--------------------------------------------------------------------------------
/tfkit/utility/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voidful/TFkit/5942b86e9132703ae4f328ba3d199c322b8cd1e4/tfkit/utility/__init__.py
--------------------------------------------------------------------------------
/tfkit/utility/data_filereader.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from collections import defaultdict
3 |
4 | import nlp2
5 |
6 |
# Silence library warnings (e.g. from sklearn) by replacing warnings.warn
# with a no-op for the lifetime of this module's import.
def warn(*args, **kwargs):
    """No-op stand-in for warnings.warn; accepts and ignores any arguments."""
    pass


import warnings

warnings.warn = warn
15 |
16 | from tqdm.auto import tqdm
17 |
18 | from tfkit.utility import tok
19 |
20 |
def get_multiclas_data_from_file(fpath):
    """Yield chunks of multi-task classification examples from a CSV file.

    The first column is the input text; every remaining column is one
    classification target.  A target cell containing ``tok.UNIVERSAL_SEP``
    marks the dataset as multi-label.  The generator's return value
    (exposed via ``StopIteration.value``) maps task name -> sorted labels.
    """
    task_label_dict = defaultdict(list)
    with open(fpath, 'r') as infile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames
        headers = ['input'] + ['target_' + str(i) for i in range(len(fieldnames) - 1)]

    # Pass 1: detect multi-label data.  Stop scanning as soon as the
    # separator is seen once (the original inner `break` only left the row
    # loop and kept reading the whole file for nothing).
    # NOTE(review): only column 1 is inspected here, while later passes look
    # at every target column — confirm that is intended.
    is_multi_label = ""
    for rows in nlp2.read_csv_chunk(fpath, ','):
        for row in rows:
            if tok.UNIVERSAL_SEP in row[1]:
                is_multi_label = "_multi_label"
                break
        if is_multi_label:
            break

    # Pass 2: build the per-task label vocabulary (deduplicated, kept sorted).
    for rows in nlp2.read_csv_chunk(fpath, ','):
        for row in rows:
            start_pos = 1
            for pos, item in enumerate(row[start_pos:]):
                pos += start_pos
                task = headers[0] + "_" + headers[pos] + is_multi_label
                item = item.strip()
                candidates = item.split(tok.UNIVERSAL_SEP) if tok.UNIVERSAL_SEP in item else [item]
                for label in candidates:
                    if label not in task_label_dict[task]:
                        task_label_dict[task].append(label)
                task_label_dict[task].sort()

    # Pass 3: yield the actual examples chunk by chunk.
    for rows in nlp2.read_csv_chunk(fpath, ','):
        chunk = []
        for row in rows:
            start_pos = 1
            for pos, item in enumerate(row[start_pos:]):
                pos += start_pos
                task = headers[0] + "_" + headers[pos] + is_multi_label
                item = item.strip()
                targets = item.split(tok.UNIVERSAL_SEP) if tok.UNIVERSAL_SEP in item else [item]
                targets = [task_label_dict[task][task_label_dict[task].index(target)] for target in targets]
                input = row[0]
                chunk.append({"task": task, "input": input, "target": targets})
        yield chunk
    return task_label_dict
64 |
65 |
def get_clas_data_from_file(fpath):
    """Yield chunks of single-task classification examples from a CSV file.

    Column 0 is the input text, column 1 the class label.  Targets are
    emitted as indices into the growing label list, which is returned via
    ``StopIteration.value`` as ``{'clas': [labels...]}``.
    """
    task = 'clas'
    task_label_dict = defaultdict(list)
    task_label_dict[task] = []
    labels = task_label_dict[task]  # alias: both names refer to the same list
    for rows in nlp2.read_csv_chunk(fpath, ','):
        chunk = []
        for row in rows:
            text, label = row[0], row[1]
            if label not in labels:
                labels.append(label)
            chunk.append({"task": task, "input": text, "target": labels.index(label)})
        yield chunk
    return task_label_dict
80 |
81 |
def get_gen_data_from_file(fpath):
    """Yield chunks of text-generation pairs.

    Columns: input, target, and an optional negative target (third column).
    Rows where either input or target is blank are skipped.
    """
    task_label_dict = defaultdict(list)
    task = 'gen'
    task_label_dict[task] = []
    print("Reading data from file...")
    for rows in nlp2.read_csv_chunk(fpath, ','):
        chunk = []
        for row in rows:
            source = str(row[0]).strip()
            target = str(row[1]).strip()
            negative = str(row[2]).strip() if len(row) > 2 else None
            if source and target:
                chunk.append({"task": task, "input": source, "target": target, "ntarget": negative})
        yield chunk
    return task_label_dict
98 |
99 |
def get_qa_data_from_file(fpath):
    """Yield chunks of QA rows: context text plus answer-span start/end."""
    task_label_dict = defaultdict(list)
    task = 'qa'
    task_label_dict[task] = []
    for rows in nlp2.read_csv_chunk(fpath, ','):
        # each row is expected to be exactly (context, start, end)
        yield [{"task": task, "input": context, "target": [start, end]}
               for context, start, end in rows]
    return task_label_dict
111 |
112 |
def get_tag_data_from_file(fpath, text_index: int = 0, label_index: int = 1, separator=" "):
    """Yield chunks of sequence-tagging rows.

    A first pass over the file builds the sorted tag inventory; a second pass
    yields the examples.  The inventory is returned via StopIteration.value.
    """
    task_label_dict = defaultdict(list)
    task = 'tag'
    tag_inventory = []
    for rows in nlp2.read_csv_chunk(fpath, ','):
        for row in rows:
            for candidate in row[1].split(separator):
                if candidate not in tag_inventory and candidate.strip():
                    tag_inventory.append(candidate)
    tag_inventory.sort()
    task_label_dict[task] = tag_inventory

    for rows in nlp2.read_csv_chunk(fpath, ','):
        yield [{"task": task, "input": row[text_index].strip(),
                "target": row[label_index].strip(), 'separator': separator}
               for row in rows]
    return task_label_dict
132 |
133 |
def get_tag_data_from_file_col(fpath, text_index: int = 0, label_index: int = 1, separator=" ", **kwargs):
    """Read column-formatted (CoNLL-style) tagging data.

    Blank lines delimit sentences.  Yields ``(tasks, task, text, [labels])``
    per sentence.  Fixes two issues of the original: the file was read twice,
    and a trailing sentence was silently dropped when the file did not end
    with a blank line.

    :param fpath: path to the column-formatted file
    :param text_index: column holding the token text
    :param label_index: column holding the tag
    :param separator: column separator (also used to re-join tokens)
    """
    tasks = defaultdict(list)
    task = 'default'
    labels = []
    with open(fpath, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    # first pass: collect the sorted label inventory
    for line in tqdm(lines):
        cols = line.split(separator)
        if len(cols) > 1 and len(cols[label_index]) > 0 and cols[label_index] not in labels:
            labels.append(cols[label_index])
            labels.sort()
    tasks[task] = labels

    # second pass: accumulate token/label strings until a blank line flushes them
    x, y = "", ""
    for line in tqdm(lines):
        cols = line.split(separator)
        if len(cols) == 1:
            yield tasks, task, x.strip(), [y.strip()]
            x, y = "", ""
        else:
            if len(cols[text_index]) > 0:
                # spaces inside a token would break the separator format
                x += cols[text_index].replace(" ", "_") + separator
                y += cols[label_index].replace(" ", "_") + separator
    if x:
        # flush a trailing sentence not followed by a blank line
        yield tasks, task, x.strip(), [y.strip()]
159 |
--------------------------------------------------------------------------------
/tfkit/utility/data_loader.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import torch
3 | from torch import nn
4 | from torch.utils import data
5 |
6 |
def index_of(in_list, val):
    """
    get token index in list, return -1 when it is not in the list
    :rtype: int
    :param in_list: query list
    :param val: query target
    :return: position index, or -1 when absent
    """
    # EAFP: a failed .index raises ValueError for missing values
    try:
        return in_list.index(val)
    except ValueError:
        return -1
19 |
20 |
def pad_batch(batch):
    """
    reduce batch data shape by padding every sample only up to the batch max
    Keys whose values already share one length (or are plain ints) are left
    unpadded; a companion "<key>_pad" entry supplies the fill value.
    :param batch: list of dict, with key input and target as model input and target
    :return: list of dict (same dicts, mutated in place, values as numpy arrays)
    """
    for key in list(batch[0].keys()):
        lengths = [1 if isinstance(sample[key], int) else len(sample[key]) for sample in batch]
        if len(set(lengths)) > 1:  # mixed lengths -> pad to the common max
            target_len = max(lengths)
            pad_key = f"{key}_pad"
            for sample in batch:
                fill_value = sample[pad_key][0] if pad_key in sample else 0
                pad_op = nn.ConstantPad1d((0, target_len - len(sample[key])), fill_value)
                sample[key] = pad_op(sample[key])
    for sample in batch:
        for key, value in sample.items():
            sample[key] = numpy.asarray(value)
    return batch
44 |
45 |
def dataloader_collate(batch):
    """
    DataLoader collate_fn: shrink padding to the batch max, then apply the
    stock default collate to stack samples into tensors.
    :param batch: list of dict
    :return: collated batch
    """
    reduced = pad_batch(batch)
    return torch.utils.data._utils.collate.default_collate(reduced)
54 |
--------------------------------------------------------------------------------
/tfkit/utility/data_processor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from numpy import uint16
4 |
5 | from tfkit.utility import tok
6 |
7 |
class GeneralNLPPreprocessor:
    """
    The design of NLPPreprocessor is to handle a pure text input,
    perform preprocessing on it based on model constraints,
    and return token ids as output.

    This class is applied before model training, splitting and preparing
    the data for model input.
    """

    def __init__(self, tokenizer, maxlen=512, handle_exceed='slide', reserved_len=0, uint16_save=False,
                 kwargs=None):
        """
        :param tokenizer: HuggingFace-style tokenizer
        :param maxlen: maximum sequence length budget
        :param handle_exceed: strategy passed to tok.handle_exceed for long input
        :param reserved_len: tokens reserved out of the maxlen budget
        :param uint16_save: store id lists as numpy uint16 to halve memory
        :param kwargs: extra parameters merged into self.parameters
        """
        # avoid the shared mutable-default dict; None behaves like {} for callers
        kwargs = {} if kwargs is None else kwargs
        self.tokenizer = tokenizer
        self.uint16_save = uint16_save
        self.parameters = {**{'tokenizer': tokenizer, 'maxlen': maxlen, 'handle_exceed': handle_exceed,
                              'reserved_len': reserved_len}, **kwargs}
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tok_pad_id = tok.tok_pad_id(tokenizer)
        self.tok_bos_id = tok.tok_begin_id(tokenizer)
        self.tok_sep_id = tok.tok_sep_id(tokenizer)
        self.tok_mask_id = tok.tok_mask_id(tokenizer)

    def read_file_to_data(self, filepath):
        # the original `assert 'plz override this funciton'` was a no-op
        # (a non-empty string is always truthy); fail loudly instead
        raise NotImplementedError("subclasses must override read_file_to_data")

    def set_global_parameters(self):
        # default: targets are plain strings, not re-tokenized per slice
        self.tokenize_target = False

    def preprocess(self, item):
        """Turn one raw example dict into a list of id-converted feature dicts."""
        self.set_global_parameters()
        preprocessed_data = []
        item = self.preprocess_component_prepare_input(item)
        # target may be None during evaluation
        t_input_list, t_target_list, t_input_index, t_target_index = self.preprocess_component_split_into_list(
            item['input'],
            item.get('target'))
        for t_input, t_target, input_index, target_index in zip(t_input_list,
                                                                t_target_list,
                                                                t_input_index,
                                                                t_target_index):
            slice_length = self.parameters['maxlen'] - self.parameters.get('reserved_len') - 3
            item['input'] = [tok.tok_begin(self.tokenizer)] + t_input[:slice_length]
            item['input_index'] = input_index
            item['target_index'] = target_index
            if len(t_target) > 0:
                item['target'] = t_target
            for convert_feature_input_dict in self.preprocess_component_convert_to_id(item):
                if self.uint16_save:
                    data_item = {k: np.array(v, dtype=uint16) if isinstance(v, list) else v for k, v in
                                 convert_feature_input_dict.items()}
                else:
                    data_item = convert_feature_input_dict
                preprocessed_data.append(data_item)
        return preprocessed_data

    def preprocess_component_prepare_input(self, item):
        """Split off a 'previous' context part when input embeds UNIVERSAL_SEP."""
        if tok.UNIVERSAL_SEP in item['input']:
            part = item['input'].split(tok.UNIVERSAL_SEP)
            item['previous'] = self.tokenizer.tokenize(part[-1])
            item['input'] = "".join(part[:-1])
        return item

    def preprocess_component_split_into_list(self, input_text, target_text=None):
        """Split the input (and optionally target) into maxlen-bounded slices.

        All four returned lists have the same length so they zip cleanly.
        """
        t_input_list, t_input_index = tok.handle_exceed(self.tokenizer, input_text,
                                                        maxlen=self.parameters['maxlen'] - 3,
                                                        mode=self.parameters.get('handle_exceed'))
        if self.tokenize_target and target_text:
            t_target_list, t_target_index = tok.handle_exceed(self.tokenizer, target_text,
                                                              maxlen=self.parameters['maxlen'] - 3,
                                                              mode=self.parameters.get('handle_exceed'))
        elif target_text:
            # replicate the target once per input slice
            # (was `[target_text * len(...)]`, which repeated the *string* and
            # produced a single-element list, truncating the downstream zip)
            t_target_list = [target_text] * len(t_input_list)
            t_target_index = [[0]] * len(t_input_list)
        else:
            # no target (evaluation): one empty target per input slice
            t_target_list = [''] * len(t_input_list)
            t_target_index = [[0]] * len(t_input_list)
        return t_input_list, t_target_list, t_input_index, t_target_index

    def preprocess_component_convert_to_id(self, item):
        """Map every token-list value of the item to token ids."""
        yield {k: self.tokenizer.convert_tokens_to_ids(v) if isinstance(v, list) else v for k, v in item.items()}

    def postprocess(self, item, tokenizer, maxlen, **kwargs):
        """Convert list values to tensors for a single dataset item."""
        return {key: torch.tensor(value) for key, value in item.items() if isinstance(value, list)}

    def postprocess_batch(self, feature_dict, **kwargs):
        """Add a batch dimension and move every feature to the active device."""
        return {key: torch.unsqueeze(torch.tensor(value), 0).to(self.device) for key, value in feature_dict.items()}
93 |
94 |
class GeneralCVPreprocessor:
    """Minimal preprocessor for computer-vision input: raw items pass through
    preprocess untouched; the feature extractor runs at postprocess time."""

    def __init__(self, feature_extractor, kwargs=None):
        """
        :param feature_extractor: callable mapping a raw image to model features
        :param kwargs: extra parameters merged into self.parameters
        """
        # avoid the shared mutable-default dict
        kwargs = {} if kwargs is None else kwargs
        self.feature_extractor = feature_extractor
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.parameters = {**{'feature_extractor': feature_extractor}, **kwargs}

    def read_file_to_data(self, filepath):
        # the original `assert '...'` was a truthy-string no-op; fail loudly
        raise NotImplementedError("subclasses must override read_file_to_data")

    def preprocess(self, item):
        """Identity split: one raw item maps to a one-element list."""
        return [item]

    def postprocess(self, item, **kwargs):
        """Extract features from the input and tensorize every value."""
        item['input'] = self.feature_extractor(item['input'])
        return {key: torch.tensor(value) for key, value in item.items()}
112 |
113 |
class GeneralSpeechPreprocessor:
    """Minimal preprocessor for speech input: raw items pass through
    preprocess untouched; the feature extractor runs at postprocess time."""

    def __init__(self, feature_extractor, kwargs=None):
        """
        :param feature_extractor: callable mapping raw audio to model features
        :param kwargs: extra parameters merged into self.parameters
        """
        # avoid the shared mutable-default dict
        kwargs = {} if kwargs is None else kwargs
        self.feature_extractor = feature_extractor
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.parameters = {**{'feature_extractor': feature_extractor}, **kwargs}

    def read_file_to_data(self, filepath):
        # the original `assert '...'` was a truthy-string no-op; fail loudly
        raise NotImplementedError("subclasses must override read_file_to_data")

    def preprocess(self, item):
        """Identity split: one raw item maps to a one-element list."""
        return [item]

    def postprocess(self, item, **kwargs):
        """Extract features from the input and tensorize every value."""
        item['input'] = self.feature_extractor(item['input'])
        return {key: torch.tensor(value) for key, value in item.items()}
--------------------------------------------------------------------------------
/tfkit/utility/dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | from collections import defaultdict
3 | from random import choice
4 |
5 | import joblib
6 | import nlp2
7 | from torch.utils import data
8 | from tqdm.contrib.concurrent import process_map
9 |
10 |
def get_dataset(file_path, task_class, tokenizer, parameter):
    """Build a TFKitDataset for the given task.

    :param file_path: path to the raw data file
    :param task_class: task module exposing a Preprocessor class
    :param tokenizer: tokenizer handed to the preprocessor
    :param parameter: dict of preprocessing arguments
    :return: TFKitDataset instance
    """
    # the unused `nlp2.Panel()` instantiation and the dead commented-out
    # panel wiring were removed; behavior is unchanged
    return TFKitDataset(fpath=file_path, tokenizer=tokenizer,
                        preprocessor=task_class.Preprocessor,
                        preprocessing_arg=parameter)
25 |
26 |
class TFKitDataset(data.Dataset):
    """Map-style dataset that preprocesses a raw data file through a task
    preprocessor, with optional joblib caching of the preprocessed samples."""

    def __init__(self, fpath, tokenizer, preprocessor, preprocessing_arg={}):
        # NOTE(review): cache key includes file path + tokenizer name but not
        # the preprocessing arguments — a stale cache may be reused when only
        # the arguments change; verify before relying on `cache=True`.
        cache_path = fpath + "_" + tokenizer.name_or_path.replace("/", "_") + ".cache"
        self.task_dict = {}
        self.preprocessor = preprocessor(tokenizer, kwargs=preprocessing_arg)
        self.tokenizer = tokenizer
        if os.path.isfile(cache_path) and preprocessing_arg.get('cache', False):
            # cache hit: restore samples, length and task dict from disk
            with open(cache_path, "rb") as fo:
                outdata = joblib.load(fo)
                sample = outdata['sample']
                length = outdata['length']
                self.task_dict = outdata['task']
        else:
            print(f"Start preprocessing...")
            # column-store layout: sample[feature_key] -> list over all examples
            sample = defaultdict(list)
            length = 0
            # read_file_to_data is a generator yielding row chunks; its final
            # `return` value (the task/label dict) arrives via StopIteration
            get_data_item = self.preprocessor.read_file_to_data(fpath)
            while True:
                try:
                    # preprocess each chunk in parallel worker processes
                    for items in process_map(self.preprocessor.preprocess, next(get_data_item),
                                             chunksize=1000):
                        for i in items:
                            length += 1
                            for k, v in i.items():
                                sample[k].append(v)
                    print(f"loaded {length} data.")
                except StopIteration as e:
                    # generator exhausted: e.value carries the task dict
                    tasks = e.value
                    break
            self.task_dict = tasks
            print(f"There are {length} datas after preprocessing.")
            if preprocessing_arg.get('cache', False):
                with open(cache_path, 'wb') as fo:
                    outdata = {'sample': sample, 'task': self.task_dict, 'length': length}
                    joblib.dump(outdata, fo)
        self.length = length
        self.sample = sample
        self.task = self.task_dict

    def increase_with_sampling(self, total):
        # upsample by appending random existing values until `total` is reached;
        # columns are sampled independently, so feature alignment across keys
        # is not preserved for the appended rows
        for _ in range(total - self.length):
            for key in self.sample.keys():
                self.sample[key].append(choice(self.sample[key]))

    def __len__(self):
        # NOTE(review): increase_with_sampling does not update self.length,
        # so upsampled rows are invisible to __len__ — confirm intent
        return self.length

    def __getitem__(self, idx):
        # assemble one example from the column store and let the task
        # preprocessor turn it into model-ready tensors
        return self.preprocessor.postprocess(
            {**{'task_dict': self.task_dict}, **{key: self.sample[key][idx] for key in self.sample.keys()}},
            self.tokenizer,
            maxlen=self.preprocessor.parameters['maxlen'])
79 |
--------------------------------------------------------------------------------
/tfkit/utility/eval_metric.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import re
3 | import string
4 | from collections import Counter
5 | from collections import defaultdict
6 |
7 | import editdistance as ed
8 | from tqdm.auto import tqdm
9 |
10 | from tfkit.utility import tok
11 |
12 |
13 | def _normalize_answer(s, task='emf1'):
14 | """Lower text and remove punctuation, articles and extra whitespace."""
15 |
16 | def remove_articles(text):
17 | if len(text) > 1:
18 | return re.sub(r'\b(a|an|the)\b', ' ', text)
19 | else:
20 | return text
21 |
22 | def white_space_fix(text):
23 | return ' '.join(text.split())
24 |
25 | def remove_punc(text):
26 | exclude = set(string.punctuation)
27 | return ''.join(ch for ch in text if ch not in exclude)
28 |
29 | def lower(text):
30 | return text.lower()
31 |
32 | if task == 'emf1':
33 | return white_space_fix(remove_articles(remove_punc(lower(s))))
34 | else:
35 | return white_space_fix((remove_punc(lower(s))))
36 |
37 |
def _f1_score(prediction, ground_truth):
    """Token-level F1 between the normalized prediction and ground truth."""
    pred_tokens = _normalize_answer(prediction).split()
    gold_tokens = _normalize_answer(ground_truth).split()
    # multiset intersection counts shared tokens with multiplicity
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0
    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)
49 |
50 |
def _cer(groundtruth, hypothesis):
    """Character error rate: total edit distance over total reference length."""
    total_edits = 0
    total_chars = 0
    for hyp, ref in zip(hypothesis, groundtruth):
        total_edits += float(ed.eval(hyp.lower(), ref.lower()))
        total_chars += len(ref)
    return total_edits / total_chars
58 |
59 |
def _wer(groundtruth, hypothesis):
    """Word error rate: edit distance over word lists, normalized by
    the total number of reference words."""
    total_edits = 0
    total_words = 0
    for hyp, ref in zip(hypothesis, groundtruth):
        hyp_words = hyp.lower().split(' ')
        ref_words = ref.lower().split(' ')
        total_edits += float(ed.eval(hyp_words, ref_words))
        total_words += len(ref_words)
    return total_edits / total_words
69 |
70 |
class EvalMetric:
    """Accumulates (input, prediction, target) records per task and computes
    evaluation scores: EM/F1, WER/CER, NLG metrics, or classification reports."""

    def __init__(self, tokenizer, normalize_text=True):
        # tasks[task_name][field_name] -> list of per-example values
        self.tasks = defaultdict(lambda: defaultdict(list))
        self.tokenizer = tokenizer
        # target_list[task_name][target_string] -> occurrence count
        self.target_list = defaultdict(lambda: defaultdict(int))
        self.normalize_text = normalize_text

    def tokenize_text(self, text):
        """Round-trip text through the tokenizer and optionally normalize it
        (separators to spaces, punctuation spaced out, lower-cased)."""
        text = self.tokenizer.decode(self.tokenizer.encode(text, add_special_tokens=False))
        if self.normalize_text:
            text = text.replace(tok.tok_sep(self.tokenizer), " ")
            # return _normalize_answer(text, task='others') # remove punctuation
            # keep punctuation, but surround it with spaces so it splits cleanly
            text = "".join(
                (char if char.isalpha() or char == " " else " " + char + " ") for char in text)  # separate punctuation
            text = ' '.join(text.split()).lower().strip()  # remove extra blank
        return text

    def add_record(self, ori_input, ori_predicted, ori_target, task='default'):
        """Normalize and store one evaluation record for `task`.

        ori_input / ori_predicted / ori_target may each be a str or a list;
        a str target may hold several references joined by tok.UNIVERSAL_SEP.
        """
        input = predicted = target = ""
        # aliasing one empty list to four names is safe only because every
        # branch below rebinds (never mutates) the shared list
        input_list = predicted_list = ori_predicted_list = target_list = []

        if isinstance(ori_input, str):
            input = self.tokenize_text(ori_input.strip())
            input_list = [input]
        if isinstance(ori_input, list):
            input_list = copy.copy(ori_input)
            for i, t in enumerate(ori_input):
                input_list[i] = self.tokenize_text(t.strip())
            input = " ".join(input_list)

        if isinstance(ori_predicted, str):
            predicted = self.tokenize_text(ori_predicted)
            predicted_list = [predicted]
            ori_predicted_list = [ori_predicted]
        if isinstance(ori_predicted, list):
            predicted_list = copy.copy(ori_predicted)
            ori_predicted_list = copy.copy(ori_predicted)
            for i, t in enumerate(ori_predicted):
                # nested-list predictions (e.g. tagging) are blanked out here
                if not isinstance(t, list):
                    predicted_list[i] = self.tokenize_text(t.strip())
                    ori_predicted_list[i] = t
                else:
                    predicted_list[i] = ''
                    ori_predicted_list[i] = ''
            predicted = " ".join(predicted_list)
        if isinstance(ori_target, str):
            target_list = []
            if tok.UNIVERSAL_SEP in ori_target:
                # multiple acceptable references in a single string
                target = ori_target
                target_list.extend([self.tokenize_text(st.strip()) for st in ori_target.split(tok.UNIVERSAL_SEP)])
            else:
                target = self.tokenize_text(ori_target.strip())
                target_list.append(target)
        elif isinstance(ori_target, list):
            for i, t in enumerate(ori_target):
                # NOTE(review): this branch calls .strip() on a *list*, which
                # would raise AttributeError — the condition looks inverted
                # (`not isinstance`?); verify against callers before fixing
                if isinstance(t, list):
                    ori_target[i] = self.tokenize_text(t.strip())

            target_list = ori_target

        # tally reference labels for the classification report later
        for t in target_list:
            self.target_list[task][t] += 1

        self.tasks[task]['input'].append(input)
        self.tasks[task]['input_list'].append(input_list)
        self.tasks[task]['predicted'].append(predicted)
        self.tasks[task]['predicted_list'].append(predicted_list)
        self.tasks[task]['target'].append(target)
        self.tasks[task]['target_list'].append(target_list)
        self.tasks[task]['ori_input'].append(ori_input)
        self.tasks[task]['ori_predicted'].append(ori_predicted)
        self.tasks[task]['ori_predicted_list'].append(ori_predicted_list)
        self.tasks[task]['ori_target'].append(ori_target)

    def get_record(self, task='default'):
        """Return the raw accumulated record lists for `task`."""
        return self.tasks[task]

    def cal_score(self, metric):
        """Yield (task_name, aggregate_result, per-example data_score) for each
        task, computed with the requested metric family ('emf1', 'er', 'nlg',
        or 'clas')."""
        data_score = []
        for task_name, task in self.tasks.items():
            print("Task : " + task_name + " report ")
            if "emf1" in metric:
                em = 0
                total = 0
                f1 = 0
                for pos, predict in enumerate(task['predicted']):
                    em_list = []
                    f1_list = []
                    # score against every acceptable reference, keep the best
                    for target in task['target_list'][pos]:
                        if _normalize_answer(str(predict)) == _normalize_answer(str(target)) and len(
                                _normalize_answer(str(predict))) > 0 or len(str(predict)) == len(str(target)) == 0:
                            em_score = 1
                            f1_score = 1
                        else:
                            em_score = 0
                            f1_score = _f1_score(str(predict), str(target))
                        em_list.append(em_score)
                        f1_list.append(f1_score)
                    em += max(em_list)
                    f1 += max(f1_list)
                    data_score.append([predict, task['target_list'][pos][em_list.index(max(em_list))],
                                       {'em': max(em_list), 'f1': max(f1_list)}])
                    total += 1
                # `total or not total` is total when > 0, else 1 (True) —
                # avoids division by zero on an empty task
                result = {"EM": em / (total or not total), "F1": f1 / (total or not total)}
                data_score = sorted(data_score, key=lambda i: i[2]['em'], reverse=True)
            if "er" in metric:
                predicts = []
                targets = []
                for pos, predict in enumerate(task['predicted']):
                    wer_list = []
                    cer_list = []
                    # per-example: best (lowest) error rate over all references
                    for target in task['target_list'][pos]:
                        if len(target) > 0 and len(predict) > 0:
                            wer_list.append(100 * _wer([target], [predict]))
                            cer_list.append(100 * _cer([target], [predict]))
                        else:
                            # empty side => worst-case 100% error
                            wer_list.append(100)
                            cer_list.append(100)
                    wer = min(wer_list)
                    cer = min(cer_list)
                    target = task['target_list'][pos][wer_list.index(wer)]
                    predicts.append(predict)
                    targets.append(target)
                    data_score.append([predict, target, {'wer': wer, 'cer': cer}])

                # corpus-level rates over the selected best references;
                # `target` here is the last example's pick (empty-check guard)
                wer = 100 * _wer(targets, predicts) if len(target) > 0 else 100
                cer = 100 * _cer(targets, predicts) if len(target) > 0 else 100
                result = {"WER": wer, "CER": cer}
                data_score = sorted(data_score, key=lambda i: i[2]['wer'], reverse=False)
            if "nlg" in metric:
                try:
                    from nlgeval import NLGEval
                except ImportError:
                    print(
                        "nlg-eval package not install, plz install it: pip install git+https://github.com/voidful/nlg-eval.git ; nlg-eval --setup ./nlg-eval-data/")
                    raise
                nlgeval = NLGEval(no_skipthoughts=True, no_glove=True, metrics_to_omit=["METEOR"])

                target_list = task['target_list']
                predicted = task['predicted']
                # pad every reference list to the same number of candidates so
                # the list can be transposed for corpus-level scoring
                for idx, tl in enumerate(target_list):
                    max_candidate = max([len(i) for i in target_list])
                    if max_candidate - len(tl) > 0:
                        target_list[idx].extend([""] * (max_candidate - len(tl)))

                for t, p in tqdm(zip(target_list, predicted), total=len(target_list)):
                    data_score.append([p, t, nlgeval.compute_metrics(ref_list=list(map(list, zip(t))), hyp_list=[p])])
                result = nlgeval.compute_metrics(ref_list=list(map(list, zip(*task['target_list']))),  # transpose
                                                 hyp_list=predicted)
                data_score = sorted(data_score, key=lambda i: i[2]['ROUGE_L'])
            if "clas" in metric:
                from sklearn.metrics import classification_report
                from sklearn.preprocessing import MultiLabelBinarizer
                from sklearn.metrics import precision_recall_fscore_support
                target_key = [t for t in self.target_list[task_name].keys() if len(t) > 0]
                mlb = MultiLabelBinarizer().fit([target_key])
                # remove all blank target
                task['target_list'] = [[j for j in sub if len(j) > 0] for sub in task['target_list']]
                # modify for tagging result
                if isinstance(task['ori_predicted_list'][0][0], list):
                    # flatten per-token tag lists into single-label examples
                    target_list = sum([[[j] for j in sub] for sub in task['target_list']], [])
                    predicted = sum([[[j] for j in sub] for sub in task['ori_predicted_list']], [])
                    if len(target_list) != len(predicted):
                        # pad missing predictions with blanks to align lengths
                        diff = len(task['target_list']) - len(task['ori_predicted_list'])
                        predicted.extend([['']] * diff)
                else:
                    target_list = task['target_list']
                    predicted = task['ori_predicted_list']

                for p, t in zip(predicted, target_list):
                    score = dict(zip(["precision", "recall", "fbeta_score", "support"],
                                     precision_recall_fscore_support(mlb.transform([t]), mlb.transform([p]),
                                                                     average='weighted')))
                    data_score.append([p, t, score])
                result = classification_report(
                    mlb.transform(target_list),
                    mlb.transform(predicted),
                    target_names=list(mlb.classes_))
                data_score = sorted(data_score, key=lambda i: i[2]['fbeta_score'])
            yield (task_name, result, data_score)
253 |
--------------------------------------------------------------------------------
/tfkit/utility/logger.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | import json
4 |
5 |
class Logger:
    """Unified training logger: plain-text message log plus a CSV metric file,
    with optional mirroring to TensorBoard and Weights & Biases."""

    def __init__(self, savedir, logfilename="message.log", metricfilename="metric.log", tensorboard=False, wandb=False,
                 print_fn=print):
        """
        :param savedir: directory that receives both log files
        :param logfilename: free-text log file name
        :param metricfilename: CSV metric log file name
        :param tensorboard: enable a TensorBoard SummaryWriter
        :param wandb: enable a Weights & Biases run
        :param print_fn: callable used to echo log lines (default: print)
        """
        self.savedir = savedir
        self.logfilepath = os.path.join(savedir, logfilename)
        self.metricfilepath = os.path.join(savedir, metricfilename)
        self.tensorboard_writer = None
        self.wandb_writer = None
        self.print_fn = print_fn
        if tensorboard:
            from torch.utils.tensorboard import SummaryWriter
            self.tensorboard_writer = SummaryWriter()
        if wandb:
            # aliased import so the module does not shadow the boolean flag
            import wandb as wandb_module
            project_name = savedir.replace("/", "_")
            self.wandb_writer = wandb_module.init(project=project_name)

    def write_config(self, config_dict):
        """Record the run configuration to every enabled sink."""
        if self.wandb_writer:
            self.wandb_writer.config.update(config_dict)
        if self.tensorboard_writer:
            # add_hparams requires a metric dict as its second argument;
            # the original single-argument call raised a TypeError
            self.tensorboard_writer.add_hparams(config_dict, {})

        with open(self.metricfilepath, "a", encoding='utf8') as log_file:
            writer = csv.writer(log_file)
            writer.writerow([json.dumps(config_dict)])

    def write_log(self, *args):
        """Append a space-joined message line to the log file and echo it."""
        line = ' '.join([str(a) for a in args])
        with open(self.logfilepath, "a", encoding='utf8') as log_file:
            log_file.write(line + '\n')
        self.print_fn(line)

    def write_metric(self, tag, scalar_value, global_step):
        """Append one (tag, value, step) row and mirror it to TB / W&B."""
        if self.wandb_writer:
            self.wandb_writer.log({tag: scalar_value, "global_step": global_step})
        if self.tensorboard_writer:
            self.tensorboard_writer.add_scalar(tag, scalar_value, global_step)
        with open(self.metricfilepath, "a", encoding='utf8') as log_file:
            writer = csv.writer(log_file)
            writer.writerow([tag, scalar_value, global_step])
48 |
--------------------------------------------------------------------------------
/tfkit/utility/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 |
6 |
class BCEFocalLoss(nn.Module):
    """Focal variant of binary cross entropy over raw logits: easy examples
    (high p_t) are down-weighted by (1 - p_t)^gamma."""

    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma

    def forward(self, input, target):
        raw_bce = F.binary_cross_entropy_with_logits(input, target, reduction='none')
        # p_t recovered via exp(-BCE); avoids NaN when the probability is 0
        prob_correct = torch.exp(-raw_bce)
        focal_weight = (1 - prob_correct) ** self.gamma
        return (focal_weight * raw_bce).mean()
17 |
18 |
class FocalLoss(nn.Module):
    """Multi-class focal loss built from softmax + NLL; the (1 - p_t)^gamma
    factor down-weights well-classified examples."""

    def __init__(self, gamma=2, ignore_index=-1):
        super().__init__()
        self.gamma = gamma
        self.softmax = nn.Softmax(dim=1)
        self.nll = nn.NLLLoss(ignore_index=ignore_index)

    def forward(self, input, target):
        probs = self.softmax(input)
        log_probs = torch.log(probs)
        # detached probabilities: the focal weight carries no gradient
        pt = Variable(log_probs.data.exp())
        focal_weight = (1 - pt) ** self.gamma
        return self.nll(focal_weight * log_probs, target)
31 |
32 |
33 | class SeqCTCLoss(nn.Module):
34 | def __init__(self, blank_index):
35 | super(SeqCTCLoss, self).__init__()
36 | self.blank_index = blank_index
37 |
38 | def forward(self, logits, input_lengths, targets, target_lengths):
39 | # lengths : (batch_size, )
40 | # log_logits : (T, batch_size, n_class), this kind of shape is required for ctc_loss
41 | # log_logits = logits + (logit_mask.unsqueeze(-1) + 1e-45).log()
42 | log_logits = logits.log_softmax(-1).transpose(0, 1)
43 | loss = F.ctc_loss(log_logits,
44 | targets,
45 | input_lengths,
46 | target_lengths,
47 | blank=self.blank_index,
48 | reduction='mean',
49 | zero_infinity=True)
50 | return loss
51 |
52 |
53 | class SelfKDLoss(nn.Module):
54 |
55 | def __init__(self, alpha=0.1, temperature=2,ignore_index=-1):
56 | super(SelfKDLoss, self).__init__()
57 | self.alpha = alpha
58 | self.temperature = temperature
59 | self.ignore_index = ignore_index
60 |
61 | def forward(self, outputs, teacher_outputs, labels):
62 | loss = nn.KLDivLoss()(F.log_softmax(outputs / self.temperature, dim=-1),
63 | F.softmax(teacher_outputs / self.temperature, dim=-1)) * (
64 | self.alpha * self.temperature * self.temperature) + F.cross_entropy(outputs, labels,ignore_index=self.ignore_index,) * (
65 | 1. - self.alpha)
66 | return loss
67 |
68 |
class DiceLoss(nn.Module):
    """From 'Dice Loss for Data-imbalanced NLP Tasks'.

    Fixes over the original: the target tensor is cloned instead of being
    mutated in the caller's hands, a no-effect `torch.gather` call was
    removed, and the mask honors the configured `ignore_index` instead of a
    hard-coded -1.
    """

    def __init__(self, ignore_index=None, reduction='mean'):
        super().__init__()
        self.ignore_index = ignore_index
        self.reduction = reduction

    def forward(self, y_pred, y_true):
        """
        :param y_pred: (N, C) raw logits
        :param y_true: (N,) class indices
        :return: scalar mean loss, or the flat per-example loss vector
        """
        y_pred = torch.softmax(y_pred, dim=1)
        if self.ignore_index is not None:
            mask = y_true == self.ignore_index
            # clone so the caller's tensor is not modified in place
            filtered_target = y_true.clone()
            filtered_target[mask] = 0
            pred_prob = torch.gather(y_pred, dim=1, index=filtered_target.unsqueeze(1))
            # ignored rows get probability 0 (their dice term becomes 1,
            # matching the original's row-zeroing behavior)
            pred_prob[mask.unsqueeze(1)] = 0
        else:
            pred_prob = torch.gather(y_pred, dim=1, index=y_true.unsqueeze(1))
        dsc_i = 1 - ((1 - pred_prob) * pred_prob) / ((1 - pred_prob) * pred_prob + 1)
        if self.reduction == 'mean':
            return dsc_i.mean()
        else:
            return dsc_i.view(-1)
92 |
93 |
class NegativeCElLoss(nn.Module):
    """Negative-label cross entropy: applies NLL to log(1 - softmax), so the
    model is pushed *away* from the given (negative) target class."""

    def __init__(self, ignore_index=-1, reduction='mean'):
        super().__init__()
        self.softmax = nn.Softmax(dim=1)
        self.alpha = 1
        self.nll = nn.NLLLoss(ignore_index=ignore_index, reduction=reduction)

    def forward(self, input, target):
        # probability of NOT the class; clamp keeps the log finite
        complement = torch.clamp(1.0 - self.softmax(input), min=1e-32)
        return self.nll(torch.log(complement) * self.alpha, target)
105 |
106 |
class LabelSmoothingLoss(nn.Module):
    """Cross entropy against a smoothed one-hot distribution: the true class
    gets (1 - smoothing) and the remaining mass is spread over the others.

    Fix over the original: the ignore mask compares against the configured
    `ignore_index` instead of a hard-coded -1.
    """

    def __init__(self, classes, smoothing=0.1, dim=-1, ignore_index=None, reduction='mean'):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim
        self.reduction = reduction
        self.ignore_index = ignore_index

    def forward(self, pred, target):
        """
        :param pred: (N, C) raw logits
        :param target: (N,) class indices
        :return: scalar mean loss, or per-example losses when reduction != 'mean'
        """
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # smoothed one-hot target distribution
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            if self.ignore_index is not None:
                mask = target == self.ignore_index
                filtered_target = target.clone()
                filtered_target[mask] = 0
                true_dist.scatter_(1, filtered_target.unsqueeze(1), self.confidence)
                mask = mask.unsqueeze(1).expand(pred.data.size())
                # ignored rows contribute zero loss
                true_dist[mask] = 0
            else:
                true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        if self.reduction == 'mean':
            return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))
        else:
            return torch.sum(-true_dist * pred, dim=self.dim)
135 |
--------------------------------------------------------------------------------
/tfkit/utility/model.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import importlib
3 | import os
4 | from typing import List
5 |
6 | import inquirer
7 | import nlp2
8 | import torch
9 | from torch import nn
10 | from transformers import AutoTokenizer, AutoModel
11 |
12 |
def list_all_model(ignore_list=[]):
    """List every task subdirectory under tfkit/task, skipping __pycache__
    entries and anything in `ignore_list`."""
    task_root = os.path.abspath(__file__ + "/../../") + '/task'
    return [name for name in os.listdir(task_root)
            if os.path.isdir(os.path.join(task_root, name))
            and '__pycache__' not in name
            and name not in ignore_list]
18 |
19 |
def load_predict_parameter(model, model_arg={}, enable_arg_panel=False):
    """use inquirer panel to let user input task parameter or just use default value"""
    # the interactive panel is disabled unless explicitly enabled
    panel_disabled = not enable_arg_panel
    return nlp2.function_argument_panel(model.predictor.wrap_input, model_arg,
                                        disable_input_panel=panel_disabled,
                                        func_parent=model,
                                        ignore_empty=True)
26 |
27 |
def load_model_class(model_name):
    """Dynamically import and return the tfkit.task.<model_name> module."""
    return importlib.import_module(f'.{model_name}', 'tfkit.task')
30 |
31 |
def load_pretrained_model(pretrained_config, model_type):
    """Load a HuggingFace backbone; causal-LM tasks need decoder mode on."""
    backbone = AutoModel.from_pretrained(pretrained_config)
    if 'clm' in model_type:
        backbone.config.is_decoder = True
    return backbone
37 |
38 |
def load_pretrained_tokenizer(pretrained_config):
    """Load the tokenizer matching the given pretrained model name/config."""
    return AutoTokenizer.from_pretrained(pretrained_config)
42 |
43 |
def resize_pretrain_tok(pretrained, tokenizer):
    """Resize the model's embedding matrix when it disagrees with the tokenizer's vocab size."""
    vocab_len = len(tokenizer)
    if vocab_len != pretrained.config.vocab_size:
        pretrained.resize_token_embeddings(vocab_len)
    return pretrained, tokenizer
48 |
49 |
def add_tokens_to_pretrain(pretrained, tokenizer, add_tokens, sample_init=False):
    """Register extra tokens on *tokenizer* and resize *pretrained* to match.

    Args:
        pretrained: huggingface model whose embedding matrix is resized.
        tokenizer: matching tokenizer that receives the new tokens.
        add_tokens: list of token strings to add; None/empty is a no-op.
        sample_init: when True, initialize the new embedding rows by copying
            rows of existing tokens (starting at index 100) instead of the
            default random initialization.

    Returns:
        The (pretrained, tokenizer) pair, updated in place.
    """
    if not add_tokens:
        # Nothing to add (e.g. a checkpoint saved without extra tokens passes
        # None here); the old code crashed on tokenizer.add_tokens(None).
        return pretrained, tokenizer
    # NOTE(review): vocab_size excludes previously added tokens on some
    # tokenizers — assumed here to mark the first new embedding row; confirm.
    origin_vocab_size = tokenizer.vocab_size
    print("===ADD TOKEN===")
    num_added_toks = tokenizer.add_tokens(add_tokens)
    print('We have added', num_added_toks, 'tokens')
    pretrained.resize_token_embeddings(len(tokenizer))
    if sample_init:
        input_embedding = pretrained.get_input_embeddings()
        state_dict_weight = input_embedding.state_dict()['weight']
        # Seed the new rows with embeddings of existing tokens 100..100+n.
        state_dict_weight[origin_vocab_size:len(tokenizer)] = copy.copy(
            state_dict_weight[100:100 + num_added_toks])
        pretrained.set_input_embeddings(input_embedding)
    print("===============")
    return pretrained, tokenizer
64 |
65 |
def load_trained_model(model_path, pretrained_config=None, tag=None):
    """Load a tfkit checkpoint produced by ``save_model``.

    Args:
        model_path: path to the saved ``.pt`` checkpoint.
        pretrained_config: optional pretrained model name/path overriding the
            config stored inside the checkpoint.
        tag: for multi-task checkpoints, the task tag to load; when None and
            several tags exist, the user is prompted interactively.

    Returns:
        Tuple of (model, task_type, model_class, model_info, preprocessor).
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    torchpack = torch.load(model_path, map_location=device)

    # everything except the (large) weights is treated as printable metadata
    model_info = {key: torchpack[key] for key in torchpack.keys() if 'state_dict' not in key and 'models' not in key}
    print("===task info===")
    [print(k, v[:10], "...") if isinstance(v, list) and len(v) > 10 else print(k, v) for k, v in model_info.items()]
    print('===============')

    if 'tags' in torchpack and len(torchpack['tags']) > 1:
        if tag is None:
            print("Pick which models to use in multi-task models")
            inquirer_res = inquirer.prompt(
                [inquirer.List('tag', message="Select task", choices=torchpack['tags'])])
            tag = inquirer_res['tag']
        type_ind = torchpack['tags'].index(tag)
    else:
        type_ind = 0
    print("loading saved task")

    # gather loading parameters; fall back to legacy key names ('bert',
    # 'model_state_dict') written by older tfkit versions
    maxlen = torchpack['maxlen']
    if pretrained_config is not None:
        config = pretrained_config
    else:
        config = torchpack['model_config'] if 'model_config' in torchpack else torchpack['bert']
    model_types = [torchpack['type']] if not isinstance(torchpack['type'], list) else torchpack['type']
    models_state = torchpack['models'] if 'models' in torchpack else [torchpack['model_state_dict']]
    task_type = model_types[type_ind]  # renamed from `type` to avoid shadowing the builtin
    add_tokens = torchpack.get('add_tokens')
    # rebuild tokenizer and backbone
    tokenizer = AutoTokenizer.from_pretrained(config)
    pretrained = AutoModel.from_pretrained(config)

    # BUG FIX: only extend the vocabulary when the checkpoint actually
    # recorded extra tokens; the old code passed None into add_tokens().
    if add_tokens:
        pretrained, tokenizer = add_tokens_to_pretrain(pretrained, tokenizer, add_tokens)

    model_class = load_model_class(task_type)
    task_detail = {}
    if 'task-label' in torchpack:
        task_detail = torchpack['task-label']
    elif 'label' in torchpack:
        task_detail = {'label': torchpack['label']}

    model = model_class.Model(tokenizer=tokenizer, pretrained=pretrained, tasks_detail=task_detail,
                              maxlen=maxlen)
    model.load_state_dict(models_state[type_ind], strict=False)
    model = model.to(device)

    preprocessor = model_class.Preprocessor(tokenizer)

    print("finish loading")
    return model, task_type, model_class, model_info, preprocessor
120 |
121 |
def save_model(models, input_arg, models_tag, epoch, fname, logger, accelerator, add_tokens=None):
    """Serialize trained task models plus their metadata to ``<fname>.pt``.

    Args:
        models: list of task models to save.
        input_arg: dict of training arguments ('config', 'task', 'maxlen', ...).
        models_tag: list of tags identifying each model.
        epoch: epoch number recorded in the checkpoint.
        fname: output path without the ``.pt`` extension.
        logger: logger exposing ``write_log``.
        accelerator: huggingface accelerate object used to gather state dicts.
        add_tokens: extra tokens that were added to the tokenizer, if any.
    """
    accelerator.wait_for_everyone()
    # Renamed from `save_model` — the old local dict shadowed this function.
    checkpoint = {
        'models': [accelerator.get_state_dict(m) for m in models],
        'model_config': input_arg.get('config'),
        'add_tokens': add_tokens,
        'tags': models_tag,
        'type': input_arg.get('task'),
        'maxlen': input_arg.get('maxlen'),
        'epoch': epoch
    }

    # tag/clas tasks carry their label maps so prediction can rebuild them
    for ind, m in enumerate(input_arg.get('task')):
        if 'tag' in m:
            checkpoint['label'] = models[ind].labels
        if "clas" in m:
            checkpoint['task-label'] = models[ind].tasks_detail

    torch.save(checkpoint, f"{fname}.pt")
    logger.write_log(f"weights were saved to {fname}.pt")
142 |
143 |
def tie_encoder_decoder_weights(encoder, decoder, base_model_prefix):
    """Make *encoder* share (tie) its parameters with *decoder*.

    Recursively walks both module trees in parallel; wherever the structures
    match, the encoder's ``weight``/``bias`` attributes are re-pointed at the
    decoder's parameters so the two modules share storage. Encoder submodules
    with no decoder counterpart are collected and reported at the end.
    Adapted from huggingface transformers' encoder-decoder weight tying.

    Args:
        encoder: nn.Module whose parameters become references to the decoder's.
        decoder: nn.Module supplying the shared parameters.
        base_model_prefix: name prefix used in the reported weight paths.
    """
    uninitialized_encoder_weights: List[str] = []
    if decoder.__class__ != encoder.__class__:
        print(
            f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
        )

    def tie_encoder_to_decoder_recursively(
        decoder_pointer: nn.Module,
        encoder_pointer: nn.Module,
        module_name: str,
        uninitialized_encoder_weights: List[str],
        depth=0,
    ):
        assert isinstance(decoder_pointer, nn.Module) and isinstance(
            encoder_pointer, nn.Module
        ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
        # Leaf case: a module owning a weight — tie weight (and bias) directly.
        if hasattr(decoder_pointer, "weight"):
            assert hasattr(encoder_pointer, "weight")
            encoder_pointer.weight = decoder_pointer.weight
            if hasattr(decoder_pointer, "bias"):
                assert hasattr(encoder_pointer, "bias")
                encoder_pointer.bias = decoder_pointer.bias
            return

        encoder_modules = encoder_pointer._modules
        decoder_modules = decoder_pointer._modules
        if len(decoder_modules) > 0:
            assert (
                len(encoder_modules) > 0
            ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"

            all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()])
            encoder_layer_pos = 0
            for name, module in decoder_modules.items():
                if name.isdigit():
                    # Numeric names index layer lists; keep encoder/decoder
                    # layer indices aligned even when the decoder has extras.
                    encoder_name = str(int(name) + encoder_layer_pos)
                    decoder_name = name
                    if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len(
                        encoder_modules
                    ) != len(decoder_modules):
                        # this can happen if the name corresponds to the position in a list module list of layers
                        # in this case the decoder has added a cross-attention that the encoder does not have
                        # thus skip this step and subtract one layer pos from encoder
                        encoder_layer_pos -= 1
                        continue
                elif name not in encoder_modules:
                    continue
                elif depth > 500:
                    raise ValueError(
                        "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your task."
                    )
                else:
                    decoder_name = encoder_name = name
                tie_encoder_to_decoder_recursively(
                    decoder_modules[decoder_name],
                    encoder_modules[encoder_name],
                    module_name + "/" + name,
                    uninitialized_encoder_weights,
                    depth=depth + 1,
                )
                all_encoder_weights.remove(module_name + "/" + encoder_name)

            # anything left over had no decoder counterpart, hence stays untied
            uninitialized_encoder_weights += list(all_encoder_weights)

    # tie weights recursively
    tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights)
    if len(uninitialized_encoder_weights) > 0:
        print(
            f"The following encoder weights were not tied to the decoder {uninitialized_encoder_weights}"
        )
    else:
        print("All encoder weights tied to the decoder")
217 |
--------------------------------------------------------------------------------
/tfkit/utility/tok.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | import nlp2
4 | from tqdm import tqdm
5 | from transformers import AutoTokenizer
6 |
7 | UNIVERSAL_SEP = "///"
8 |
9 |
def tok_begin(tokenizer):
    """Return the tokenizer's beginning-of-sequence token.

    Prefers ``bos_token``, then ``cls_token``; falls back to the literal
    string 'cls' when the tokenizer defines neither.
    """
    special = tokenizer.special_tokens_map
    if special.get('bos_token') is not None:
        return special.get('bos_token')
    elif special.get('cls_token') is not None:
        # BUG FIX: the original looked the cls token up but never returned
        # it, so BERT-style tokenizers always fell through to 'cls'.
        return special.get('cls_token')
    return 'cls'
16 |
17 |
def tok_begin_id(tokenizer):
    """Return the vocabulary id of the begin-of-sequence token."""
    begin_token = tok_begin(tokenizer)
    return tokenizer.convert_tokens_to_ids(begin_token)
20 |
21 |
def tok_sep(tokenizer):
    """Return the separator/end token: sep_token, else eos_token, else 'sep'."""
    special_map = tokenizer.special_tokens_map
    sep = special_map.get('sep_token')
    if sep is not None:
        return sep
    eos = special_map.get('eos_token')
    if eos is not None:
        return eos
    return 'sep'
28 |
29 |
def tok_sep_id(tokenizer):
    """Return the vocabulary id of the separator/end token."""
    sep_token = tok_sep(tokenizer)
    return tokenizer.convert_tokens_to_ids(sep_token)
32 |
33 |
def tok_mask(tokenizer):
    """Return the mask token, or the literal 'msk' if the tokenizer has none."""
    mask = tokenizer.special_tokens_map.get('mask_token')
    # truthiness (not `is None`) on purpose: mirrors the original contract
    return mask if mask else 'msk'
38 |
39 |
def tok_mask_id(tokenizer):
    """Return the vocabulary id of the mask token."""
    mask_token = tok_mask(tokenizer)
    return tokenizer.convert_tokens_to_ids(mask_token)
42 |
43 |
def tok_pad(tokenizer):
    """Return the padding token, or the literal 'pad' when none is defined."""
    pad = tokenizer.special_tokens_map.get('pad_token')
    # truthiness (not `is None`) on purpose: mirrors the original contract
    return pad or 'pad'
48 |
49 |
def tok_pad_id(tokenizer):
    """Return the vocabulary id of the padding token."""
    pad_token = tok_pad(tokenizer)
    return tokenizer.convert_tokens_to_ids(pad_token)
52 |
53 |
def get_all_tok_from_config(config):
    """Return every token string in the vocabulary of the tokenizer for *config*."""
    vocab = AutoTokenizer.from_pretrained(config).get_vocab()
    return list(vocab.keys())
57 |
58 |
def handle_exceed(tokenizer, seq, maxlen, mode=['noop', 'remove', 'slide', 'start_slice', 'end_slice'],
                  keep_after_sep=True):
    """Trim or split *seq* so its tokenization fits within *maxlen* tokens.

    Modes: 'noop' keeps everything; 'remove' drops the sequence entirely when
    it is too long; 'slide' yields sliding windows; 'start_slice'/'end_slice'
    keep the first/last tokens. When *keep_after_sep* is True, everything
    after the first separator token is preserved verbatim in every window.

    Returns a pair ``(list_of_token_lists, list_of_[start, end]_ranges)``.
    """
    # Pre-tokenized input passes through untouched.
    if isinstance(seq, list):
        return seq, [[len(seq)]]
    if isinstance(mode, list):
        mode = mode[0]
    sep_tok = tok_sep(tokenizer)
    segments = seq.split(sep_tok)
    # Tail = separator plus everything after it, optionally kept intact.
    if len(segments) > 1 and keep_after_sep:
        tail = [sep_tok] + tokenizer.tokenize(sep_tok.join(segments[1:]))
    else:
        tail = []
    head = tokenizer.tokenize(segments[0])
    if mode == 'noop':
        merged = head + tail
        return [merged], [[0, len(merged)]]
    if mode == 'remove':
        merged = head + tail
        if len(merged) <= maxlen:
            return [merged], [[0, len(merged)]]
        return [], [[0, 0]]
    if mode == 'slide':
        return nlp2.sliding_windows(head, maxlen - len(tail), append_seq=tail)
    if mode == 'start_slice':
        kept = head[:maxlen - len(tail)]
        kept.extend(tail)
        return [kept], [[0, maxlen - len(tail)]]
    if mode == 'end_slice':
        offset = len(head) + len(tail) - maxlen
        kept = head[offset:]
        kept.extend(tail)
        return [kept], [[max(0, offset), len(head)]]
87 |
88 |
def get_topP_unk_token(tokenizer, file_paths: list, topP: float):
    """Return tokens from the given files that *tokenizer* maps to UNK.

    Counts, in encounter order, every split token whose tokenization contains
    the UNK token, then returns a prefix of that ordered list. Note the list
    is insertion-ordered, not sorted by frequency.

    Args:
        tokenizer: huggingface tokenizer (relies on its private ``_unk_token``).
        file_paths: list of text file paths to scan.
        topP: proportion controlling how many collected tokens are returned.

    Returns:
        List of token strings producing UNK.
    """
    unk_count_dict = OrderedDict()
    for path in file_paths:
        for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
            for tok in nlp2.split_sentence_to_array(input_sent):
                if tokenizer._unk_token in tokenizer.tokenize(tok):
                    unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
    # NOTE(review): the `* 100` makes top_range exceed the dict size whenever
    # topP > 0.01, so every key is returned — possibly intended as a
    # percentage conversion, but it looks like a scaling bug; confirm intent.
    top_range = int((len(unk_count_dict) + 1) * topP * 100)
    return list(unk_count_dict.keys())[:top_range]
98 |
99 |
def get_freqK_unk_token(tokenizer, file_paths: list, freqK: int):
    """Collect tokens that *tokenizer* maps to UNK at least *freqK* times.

    Scans every line of every file, splits it into tokens, and counts those
    whose tokenization contains the UNK token; returns tokens whose count
    reaches the threshold, in first-seen order.
    """
    unk_counts = OrderedDict()
    for file_path in file_paths:
        for sentence in tqdm(nlp2.read_files_yield_lines(file_path)):
            for token in nlp2.split_sentence_to_array(sentence):
                if tokenizer._unk_token in tokenizer.tokenize(token):
                    unk_counts[token] = unk_counts.get(token, 0) + 1
    return [token for token, count in unk_counts.items() if count >= freqK]
108 |
--------------------------------------------------------------------------------