├── .github
│   └── workflows
│       └── python-package.yml
├── .gitignore
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.md
├── demo_data
│   ├── classification.csv
│   ├── generation.csv
│   ├── mask.csv
│   ├── mcq.csv
│   ├── qa.csv
│   ├── tag.csv
│   ├── tok_list.txt
│   └── unk_tok.csv
├── docs
│   ├── benchmark.md
│   ├── img
│   │   ├── flow.png
│   │   ├── tfkit-icon.png
│   │   └── tfkit.png
│   ├── index.md
│   ├── installation.md
│   ├── models.md
│   ├── structure.md
│   └── tasks.md
├── mkdocs.yml
├── requirements.txt
├── setup.py
└── tfkit
    ├── __init__.py
    ├── dump.py
    ├── eval.py
    ├── task
    │   ├── __init__.py
    │   ├── clas
    │   │   ├── __init__.py
    │   │   ├── model.py
    │   │   └── preprocessor.py
    │   ├── clm
    │   │   ├── __init__.py
    │   │   ├── model.py
    │   │   └── preprocessor.py
    │   ├── once
    │   │   ├── __init__.py
    │   │   ├── model.py
    │   │   └── preprocessor.py
    │   ├── oncectc
    │   │   ├── __init__.py
    │   │   └── model.py
    │   ├── qa
    │   │   ├── __init__.py
    │   │   ├── model.py
    │   │   └── preprocessor.py
    │   ├── seq2seq
    │   │   ├── __init__.py
    │   │   ├── model.py
    │   │   └── preprocessor.py
    │   └── tag
    │       ├── __init__.py
    │       ├── model.py
    │       └── preprocessor.py
    ├── test
    │   ├── __init__.py
    │   ├── task
    │   │   └── test_task_model.py
    │   ├── test_atrain.py
    │   ├── test_package.py
    │   ├── test_zeval.py
    │   ├── test_zzdump.py
    │   └── utility
    │       ├── test_utility_data_filereader.py
    │       ├── test_utility_data_loader.py
    │       ├── test_utility_data_processor.py
    │       ├── test_utility_dataset.py
    │       ├── test_utility_eval_metric.py
    │       ├── test_utility_logger.py
    │       ├── test_utility_loss.py
    │       ├── test_utility_model.py
    │       └── test_utility_tok.py
    ├── train.py
    └── utility
        ├── __init__.py
        ├── data_filereader.py
        ├── data_loader.py
        ├── data_processor.py
        ├── dataset.py
        ├── eval_metric.py
        ├── logger.py
        ├── loss.py
        ├── model.py
        ├── predictor.py
        └── tok.py

/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 | 
4 | name: Python package
5 | 
6 | on:
7 |   push:
8 |     branches: [ master ]
9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: [ 3.9 ]
19 | 
20 |     steps:
21 |       - uses: actions/checkout@v2
22 |       - name: Set up Python ${{ matrix.python-version }}
23 |         uses: actions/setup-python@v2
24 |         with:
25 |           python-version: ${{ matrix.python-version }}
26 |       - uses: actions/cache@v2
27 |         with:
28 |           path: ~/.cache/pip
29 |           key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
30 |           restore-keys: |
31 |             ${{ runner.os }}-pip-
32 |       - name: Install dependencies
33 |         run: |
34 |           python -m pip install --upgrade pip
35 |           pip install flake8 pytest
36 |           pip install -r requirements.txt
37 |           pip install .
38 |       - name: Lint with flake8
39 |         run: |
40 |           # stop the build if there are Python syntax errors or undefined names
41 |           flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics 42 | - name: Test with pytest 43 | run: | 44 | pytest 45 | - name: Generate coverage report 46 | run: | 47 | pip install pytest-cov 48 | pytest --cov=./ --cov-report=xml 49 | - name: Upload coverage to Codecov 50 | uses: codecov/codecov-action@v1 51 | with: 52 | fail_ci_if_error: false 53 | verbose: false 54 | - name: Build 55 | run: | 56 | python setup.py install -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | .AppleDouble 4 | .LSOverride 5 | 6 | # Icon must end with two \r 7 | Icon 8 | 9 | # Thumbnails 10 | ._* 11 | 12 | # Files that might appear in the root of a volume 13 | .DocumentRevisions-V100 14 | .fseventsd 15 | .Spotlight-V100 16 | .TemporaryItems 17 | .Trashes 18 | .VolumeIcon.icns 19 | .com.apple.timemachine.donotpresent 20 | 21 | # Directories potentially created on remote AFP share 22 | .AppleDB 23 | .AppleDesktop 24 | Network Trash Folder 25 | Temporary Items 26 | .apdisk 27 | 28 | # IntelliJ project files 29 | .idea 30 | *.iml 31 | out 32 | gen### Example user template template 33 | ### Example user template 34 | 35 | # IntelliJ project files 36 | .idea 37 | *.iml 38 | out 39 | gen### Python template 40 | # Byte-compiled / optimized / DLL files 41 | __pycache__/ 42 | *.py[cod] 43 | *$py.class 44 | 45 | # C extensions 46 | *.so 47 | 48 | # Distribution / packaging 49 | .Python 50 | build/ 51 | develop-eggs/ 52 | dist/ 53 | downloads/ 54 | eggs/ 55 | .eggs/ 56 | lib/ 57 | lib64/ 58 | parts/ 59 | sdist/ 60 | var/ 61 | wheels/ 62 | *.egg-info/ 63 | .installed.cfg 64 | *.egg 65 | MANIFEST 66 | 67 | # PyInstaller 68 | # Usually these files are written by a python script from a template 69 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
70 | *.manifest 71 | *.spec 72 | 73 | # Installer logs 74 | pip-log.txt 75 | pip-delete-this-directory.txt 76 | 77 | # Unit test / coverage reports 78 | htmlcov/ 79 | .tox/ 80 | .coverage 81 | .coverage.* 82 | .cache 83 | nosetests.xml 84 | coverage.xml 85 | *.cover 86 | .hypothesis/ 87 | .pytest_cache/ 88 | 89 | # Translations 90 | *.mo 91 | *.pot 92 | 93 | # Django stuff: 94 | *.log 95 | local_settings.py 96 | db.sqlite3 97 | 98 | # Flask stuff: 99 | instance/ 100 | .webassets-cache 101 | 102 | # Scrapy stuff: 103 | .scrapy 104 | 105 | # Sphinx documentation 106 | docs/_build/ 107 | 108 | # PyBuilder 109 | target/ 110 | 111 | # Jupyter Notebook 112 | .ipynb_checkpoints 113 | 114 | # pyenv 115 | .python-version 116 | 117 | # celery beat schedule file 118 | celerybeat-schedule 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # how2 143 | .how2 144 | how2 145 | /how2 146 | 147 | # test cache 148 | ./tfkit/test/cache 149 | /tfkit/test/cache 150 | tfkit/test/cache 151 | 152 | # test cache 153 | ./tfkit/test/runs 154 | /tfkit/test/runs 155 | tfkit/test/runs 156 | 157 | ./tfkit/test/wandb 158 | /tfkit/test/wandb 159 | tfkit/test/wandb 160 | 161 | # cache 162 | ./cache 163 | cache 164 | /cache 165 | 166 | # mypy 167 | .mypy_cache/ 168 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to tfkit 2 | We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's: 3 | 4 | - Reporting a bug 5 | - Discussing the current state of the code 6 | - Submitting a fix 7 | - Proposing new features 8 | - Becoming a maintainer 9 | 10 | ## We Develop with Github 11 | We use github to host code, to track issues and feature requests, as well as accept pull requests. 12 | 13 | ## We Use [Github Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests 14 | Pull requests are the best way to propose changes to the codebase (we use [Github Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests: 15 | 16 | 1. Fork the repo and create your branch from `master`. 17 | 2. If you've added code that should be tested, add tests. 18 | 3. If you've changed APIs, update the documentation. 19 | 4. Ensure the test suite passes. 20 | 5. Make sure your code lints. 21 | 6. Issue that pull request! 22 | 23 | ## Any contributions you make will be under the Apache 2.0 Software License 24 | In short, when you submit code changes, your submissions are understood to be under the same [Apache 2.0 License](https://choosealicense.com/licenses/apache-2.0/) that covers the project. Feel free to contact the maintainers if that's a concern. 25 | 26 | ## Report bugs using Github's [issues](https://github.com/voidful/tfkit/issues) 27 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](); it's that easy! 28 | 29 | ## Write bug reports with detail, background, and sample code 30 | **Great Bug Reports** tend to have: 31 | 32 | - A quick summary and/or background 33 | - Steps to reproduce 34 | - Be specific! 
35 | - Give sample code if you can. 36 | - What you expected would happen 37 | - What actually happens 38 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) 39 | 40 | People *love* thorough bug reports. I'm not even kidding. 41 | 42 | ## License 43 | By contributing, you agree that your contributions will be licensed under its Apache 2.0 License. 44 | 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-devel 2 | 3 | ENV LANG=C.UTF-8 4 | WORKDIR /workspace/ 5 | COPY ./ /workspace/ 6 | 7 | # install basics 8 | RUN apt-get update -y 9 | RUN apt-get install -y git curl htop wget tmux 10 | 11 | # install python deps 12 | RUN pip install -r /workspace/requirements.txt 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright voidful 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [tfkit logo]
9 | [badges: PyPI · Download · Last Commit · CodeFactor · Visitor]

27 | 
28 | ## What is it
29 | TFKit is a toolkit mainly for language generation.
30 | It applies transformer models to many tasks within one all-in-one framework.
31 | Switching tasks requires only a small change of config.
32 | 
33 | ## Supported Tasks
34 | With transformer models such as BERT/ALBERT/T5/BART:
35 | | | |
36 | |-|-|
37 | | Text Generation | :memo: seq2seq language model |
38 | | Text Generation | :pen: causal language model |
39 | | Text Generation | :printer: once generation model / once generation model with ctc loss |
40 | | Text Generation | :pencil: onebyone generation model |
41 | 
42 | # Getting Started
43 | Learn more from the [documentation](https://voidful.github.io/TFkit/).
44 | 
45 | ## How To Use
46 | 
47 | ### Step 0: Install
48 | Install from the GitHub repository (refactor-dataset branch):
49 | ```bash
50 | pip install git+https://github.com/voidful/TFkit.git@refactor-dataset
51 | ```
52 | 
53 | ### Step 1: Prepare dataset in csv format
54 | [Task format](https://voidful.tech/TFkit/tasks/)
55 | ```
56 | input, target
57 | ```
58 | 
59 | ### Step 2: Train model
60 | ```bash
61 | tfkit-train \
62 | --task clas \
63 | --config xlm-roberta-base \
64 | --train training_data.csv \
65 | --test testing_data.csv \
66 | --lr 4e-5 \
67 | --maxlen 384 \
68 | --epoch 10 \
69 | --savedir roberta_sentiment_classifier
70 | ```
71 | 
72 | ### Step 3: Evaluate
73 | ```bash
74 | tfkit-eval \
75 | --model roberta_sentiment_classifier/1.pt \
76 | --metric clas \
77 | --valid testing_data.csv
78 | ```
79 | 
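Evaluation can also be driven from Python instead of the shell. Below is a minimal sketch under one assumption: that `tfkit.eval.main` accepts a CLI-style argument list, as its `main(arg=None)` signature (and the same pattern in `tfkit/dump.py`) suggests; the paths reuse the example above.

```python
# Sketch only -- assumes tfkit.eval.main() accepts an argument list,
# mirroring the main(arg=None) pattern used across tfkit's entry points.
import tfkit.eval

tfkit.eval.main([
    "--model", "roberta_sentiment_classifier/1.pt",  # checkpoint from the training example
    "--metric", "clas",
    "--valid", "testing_data.csv",
])
```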
80 | ## Advanced features
81 | 
82 | Multi-task training
83 | 
84 | ```bash
85 | tfkit-train \
86 | --task clas clas \
87 | --config xlm-roberta-base \
88 | --train training_data_taskA.csv training_data_taskB.csv \
89 | --test testing_data_taskA.csv testing_data_taskB.csv \
90 | --lr 4e-5 \
91 | --maxlen 384 \
92 | --epoch 10 \
93 | --savedir roberta_sentiment_classifier_multi_task
94 | ```
95 | 
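The multi-task run above can likewise be launched from Python. This is a sketch under two assumptions: that `tfkit.train.main` accepts an argument list the way `tfkit.eval.main` and `tfkit.dump.main` do, and that each `--task` entry is paired positionally with the corresponding `--train`/`--test` file, as the example suggests.

```python
# Sketch only -- assumptions: tfkit.train.main() takes a CLI-style list, and
# --task entries pair positionally with --train/--test files.
import tfkit.train

tfkit.train.main([
    "--task", "clas", "clas",
    "--config", "xlm-roberta-base",
    "--train", "training_data_taskA.csv", "training_data_taskB.csv",
    "--test", "testing_data_taskA.csv", "testing_data_taskB.csv",
    "--lr", "4e-5",
    "--maxlen", "384",
    "--epoch", "10",
    "--savedir", "roberta_sentiment_classifier_multi_task",
])
```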
96 | 
97 | ## Not maintained tasks
98 | Due to time constraints, the following tasks are temporarily not supported:
99 | | | |
100 | |-|-|
101 | | Classification | :label: multi-class and multi-label classification |
102 | | Question Answering | :page_with_curl: extractive qa |
103 | | Question Answering | :radio_button: multiple-choice qa |
104 | | Tagging | :eye_speech_bubble: sequence level tagging / sequence level with crf |
105 | | Self-supervised Learning | :diving_mask: mask language model |
106 | 
107 | ## Supplement
108 | - [transformers models list](https://huggingface.co/models): you can find any pretrained model here
109 | - [nlprep](https://github.com/voidful/NLPrep): download and preprocess data in one line
110 | - [nlp2go](https://github.com/voidful/nlp2go): create a demo API as quickly as possible.
111 | 
112 | 
113 | ## Contributing
114 | Thanks for your interest. There are many ways to contribute to this project. Get started [here](https://github.com/voidful/tfkit/blob/master/CONTRIBUTING.md).
115 | 
116 | ## License ![PyPI - License](https://img.shields.io/github/license/voidful/tfkit)
117 | 
118 | * [License](https://github.com/voidful/tfkit/blob/master/LICENSE)
119 | 
120 | ## Icons reference
121 | Icons modified from Freepik from www.flaticon.com
122 | Icons modified from Nikita Golubev from www.flaticon.com
123 | 
--------------------------------------------------------------------------------
/demo_data/classification.csv:
--------------------------------------------------------------------------------
1 | We report two cases of pseudoporphyria caused by naproxen and oxaprozin.,Related///METHODS
2 | Calotropis procera (ushaar) keratitis.,Not-Related
3 | Fixed drug eruption is associated with many drugs but this is the first such report with omeprazole.,Related///CONCLUSION
--------------------------------------------------------------------------------
/demo_data/generation.csv:
--------------------------------------------------------------------------------
1 | "Dan's parents were overweight . Dan was overweight as well . The doctors told his parents it was unhealthy . His parents understood and decided to make a change .","They got themselves and Dan on a diet ."
2 | "Jane was working at a diner . Suddenly , a customer barged up to the counter . He began yelling about how long his food was taking . /// Jane didn't know how to react .","Luckily , her coworker intervened and calmed the man down ."
3 | Peter was a truck driver . He was running a little behind on schedule . Peter decided to run past the weigh station . He was stopped by a cop .,"Peter ended up running late and getting a fine ."
--------------------------------------------------------------------------------
/demo_data/mask.csv:
--------------------------------------------------------------------------------
1 | "i go to [MASK] by [MASK]","school bus"
2 | "how did i [MASK] [MASK]","get here"
--------------------------------------------------------------------------------
/demo_data/mcq.csv:
--------------------------------------------------------------------------------
1 | "I 'm sure many of you have seen Star Wars , Jurassic Park , Multiplicity , or many of the other movies that describe cloning . Most of what you see in these movies is false . What you do n't know is that cloning could be dangerous , to the clone and to our society as a whole . I think human cloning is wrong mainly for four reasons . What about identity ? Humans are promised the right to their own personalities . 
What would happen if we ignore those rights by giving them someone else 's genetic identity ? True , Cloning may prevent people from possessing their identities . Also , these is a large power struggle here . Cloning means a degree of power and controls over another person 's physical identity and that ignores their rights and their only personalities . The person doing the cloning would have more power than any parent would have . Cloning would also deal with killing embryos . You might not have known , but Dolly , the sheep that was cloned in 1996 , was one of over 200 sheep embryos and hers was the only embryo that survived . The rest died or were thrown away . Imagine if the failure rate was that high when we started to clone humans . cloning means running the risk of wasting too much effort Cloning someone , at this present time , would be extremely dangerous to the birth mother and the clone . In studies done on cows , 4 out of 12 birth mothers died . There is a very high failure rate , which is shown in the cloning of Dolly . Even if you had a few good embryos , failures have been noticeable in animal tests . So , should we work ahead in the world of cloning ? I say no . the risks are greater than the benefits . It 's dangerous to the clone and to the birth mother . We would be killing human lives in the process . It would also be a violation of the clone 's right to its own genetic identity and personality . According to the article , what is the author 's opinion about identity ? [MASK] People 's identity is completely determined by their genes . [MASK] Government has the rights to confirm people 's identities . [MASK] Cloning itself gives parents great power over identity . [MASK] Cloning may prevent people from possessing their identities .",3 2 | "I 'm sure many of you have seen Star Wars , Jurassic Park , Multiplicity , or many of the other movies that describe cloning . Most of what you see in these movies is false . What you do n't know is that cloning could be dangerous , to the clone and to our society as a whole . I think human cloning is wrong mainly for four reasons . What about identity ? Humans are promised the right to their own personalities . What would happen if we ignore those rights by giving them someone else 's genetic identity ? True , Cloning may prevent people from possessing their identities . Also , these is a large power struggle here . Cloning means a degree of power and controls over another person 's physical identity and that ignores their rights and their only personalities . The person doing the cloning would have more power than any parent would have . Cloning would also deal with killing embryos . You might not have known , but Dolly , the sheep that was cloned in 1996 , was one of over 200 sheep embryos and hers was the only embryo that survived . The rest died or were thrown away . Imagine if the failure rate was that high when we started to clone humans . cloning means running the risk of wasting too much effort Cloning someone , at this present time , would be extremely dangerous to the birth mother and the clone . In studies done on cows , 4 out of 12 birth mothers died . There is a very high failure rate , which is shown in the cloning of Dolly . Even if you had a few good embryos , failures have been noticeable in animal tests . So , should we work ahead in the world of cloning ? I say no . the risks are greater than the benefits . It 's dangerous to the clone and to the birth mother . We would be killing human lives in the process . 
It would also be a violation of the clone 's right to its own genetic identity and personality . According to Paragraph 4 , which is right ? [MASK] cloning means running the risk of wasting too much effort [MASK] numbers of baby animals are likely to be created by cloning [MASK] human cloning is much more difficult than animal cloning [MASK] there are 200 sheep successfully cloned .",0 3 | "I 'm sure many of you have seen Star Wars , Jurassic Park , Multiplicity , or many of the other movies that describe cloning . Most of what you see in these movies is false . What you do n't know is that cloning could be dangerous , to the clone and to our society as a whole . I think human cloning is wrong mainly for four reasons . What about identity ? Humans are promised the right to their own personalities . What would happen if we ignore those rights by giving them someone else 's genetic identity ? True , Cloning may prevent people from possessing their identities . Also , these is a large power struggle here . Cloning means a degree of power and controls over another person 's physical identity and that ignores their rights and their only personalities . The person doing the cloning would have more power than any parent would have . Cloning would also deal with killing embryos . You might not have known , but Dolly , the sheep that was cloned in 1996 , was one of over 200 sheep embryos and hers was the only embryo that survived . The rest died or were thrown away . Imagine if the failure rate was that high when we started to clone humans . cloning means running the risk of wasting too much effort Cloning someone , at this present time , would be extremely dangerous to the birth mother and the clone . In studies done on cows , 4 out of 12 birth mothers died . There is a very high failure rate , which is shown in the cloning of Dolly . Even if you had a few good embryos , failures have been noticeable in animal tests . So , should we work ahead in the world of cloning ? I say no . the risks are greater than the benefits . It 's dangerous to the clone and to the birth mother . We would be killing human lives in the process . It would also be a violation of the clone 's right to its own genetic identity and personality . What is the best title of the passage ? [MASK] What Is Human Cloning [MASK] How Does Human Cloning Happen [MASK] Human Cloning Is Wrong [MASK] Discussion On Human Cloning",2 -------------------------------------------------------------------------------- /demo_data/qa.csv: -------------------------------------------------------------------------------- 1 | "Beyoncé announced a hiatus from her music career in January 2010, heeding her mother's advice, ""to live life, to be inspired by things again"". During the break she and her father parted ways as business partners. Beyoncé's musical break lasted nine months and saw her visit multiple European cities, the Great Wall of China, the Egyptian pyramids, Australia, English music festivals and various museums and ballet performances. What did Beyoncé announce in January 2010?", 18,25 2 | "Beyoncé announced a hiatus from her music career in January 2010, heeding her mother's advice, ""to live life, to be inspired by things again"". During the break she and her father parted ways as business partners. Beyoncé's musical break lasted nine months and saw her visit multiple European cities, the Great Wall of China, the Egyptian pyramids, Australia, English music festivals and various museums and ballet performances. 
Who suggested the hiatus for Beyoncé?", 74,84 3 | "Beyoncé announced a hiatus from her music career in January 2010, heeding her mother's advice, ""to live life, to be inspired by things again"". During the break she and her father parted ways as business partners. Beyoncé's musical break lasted nine months and saw her visit multiple European cities, the Great Wall of China, the Egyptian pyramids, Australia, English music festivals and various museums and ballet performances. In what year did Beyonce have her hiatus?", 60,64 -------------------------------------------------------------------------------- /demo_data/tag.csv: -------------------------------------------------------------------------------- 1 | "在 歐 洲 , 梵 語 的 學 術 研 究 , 由 德 國 學 者 陸 特 和 漢 斯 雷 頓 開 創 。 後 來 威 廉 · 瓊 斯 發 現 印 歐 語 系 , 也 要 歸 功 於 對 梵 語 的 研 究 。 此 外 , 梵 語 研 究 , 也 對 西 方 文 字 學 及 歷 史 語 言 學 的 發 展 , 貢 獻 不 少 。 1 7 8 6 年 2 月 2 日 , 亞 洲 協 會 在 加 爾 各 答 舉 行 。 陸 特 和 漢 斯 雷 頓 開 創 了 哪 一 地 區 對 梵 語 的 學 術 研 究 ?",O A A O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 2 | "1 7 8 6 年 2 月 2 日 , 亞 洲 協 會 在 加 爾 各 答 舉 行 。 會 中 , 威 廉 · 瓊 斯 發 表 了 下 面 這 段 著 名 的 言 論 : 「 梵 語 儘 管 非 常 古 老 , 構 造 卻 精 妙 絕 倫 : 比 希 臘 語 還 完 美 , 比 拉 丁 語 還 豐 富 , 精 緻 之 處 同 時 勝 過 此 兩 者 , 但 在 動 詞 詞 根 和 語 法 形 式 上 , 又 跟 此 兩 者 無 比 相 似 , 不 可 能 是 巧 合 的 結 果 。 這 三 種 語 言 太 相 似 了 , 使 任 何 同 時 稽 考 三 者 的 語 文 學 家 都 不 得 不 相 信 三 者 同 出 一 源 , 出 自 一 種 可 能 已 經 消 逝 的 語 言 。 陸 特 和 漢 斯 雷 頓 開 創 了 哪 一 地 區 對 梵 語 的 學 術 研 究 ?",O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 3 | "這 三 種 語 言 太 相 似 了 , 使 任 何 同 時 稽 考 三 者 的 語 文 學 家 都 不 得 不 相 信 三 者 同 出 一 源 , 出 自 一 種 可 能 已 經 消 逝 的 語 言 。 基 於 相 似 的 原 因 , 儘 管 缺 少 同 樣 有 力 的 證 據 , 我 們 可 以 推 想 哥 德 語 和 凱 爾 特 語 , 雖 然 混 入 了 迥 然 不 同 的 語 彙 , 也 與 梵 語 有 著 相 同 的 起 源 ; 而 古 波 斯 語 可 能 也 是 這 一 語 系 的 子 裔 。 」 陸 特 和 漢 斯 雷 頓 開 創 了 哪 一 地 區 對 梵 語 的 學 術 研 究 ?",O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 4 | "在 歐 洲 , 梵 語 的 學 術 研 究 , 由 德 國 學 者 陸 特 和 漢 斯 雷 頓 開 創 。 後 來 威 廉 · 瓊 斯 發 現 印 歐 語 系 , 也 要 歸 功 於 對 梵 語 的 研 究 。 此 外 , 梵 語 研 究 , 也 對 西 方 文 字 學 及 歷 史 語 言 學 的 發 展 , 貢 獻 不 少 。 1 7 8 6 年 2 月 2 日 , 亞 洲 協 會 在 加 爾 各 答 舉 行 。 印 歐 語 系 因 為 哪 一 門 語 言 而 被 發 現 ?",O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O A A O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 5 | 實 驗 室,LOA LOB LOC 6 | 溫 者 必 良 , 自 古 而 然 。,O O O O O O O O O O 7 | 狼 煙 逝 去 , 幽 夢 醒 來 。,B_Thing I_Thing O O O O O O O O -------------------------------------------------------------------------------- /demo_data/tok_list.txt: -------------------------------------------------------------------------------- 1 | 闕 2 | :mbk1: 3 | >gg< -------------------------------------------------------------------------------- 
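The start/end numbers in qa.csv above are character offsets into the input passage. A minimal sketch (illustrative, not part of the repo) of how such a span lines up with the text; whether tfkit treats the end offset as inclusive or exclusive is decided by its qa preprocessor, so the slice below only shows the idea:

```python
# Illustrative only: locating the answer span for qa.csv row 2 ("Who suggested
# the hiatus for Beyoncé?") from its character offsets. The exact
# inclusive/exclusive convention is an assumption; tfkit's qa preprocessor
# (tfkit/task/qa/preprocessor.py) is the source of truth.
passage = (
    "Beyoncé announced a hiatus from her music career in January 2010, "
    "heeding her mother's advice, \"to live life, to be inspired by things again\"."
)
start, end = 74, 84
print(passage[start:end])  # -> "her mother"
```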
/demo_data/unk_tok.csv:
--------------------------------------------------------------------------------
1 | 紫府東風放夜時。步蓮穠李伴人歸,五更鐘動笙歌散,十里月明燈火稀。
2 | 香苒苒,夢依依。天涯寒盡減春衣,鳳凰城闕知何處,寥落星河一雁飛。
--------------------------------------------------------------------------------
/docs/benchmark.md:
--------------------------------------------------------------------------------
1 | ## DRCD
2 | ### Test
3 | | model | EM | F1 |
4 | | :----:|:----: |:----: |
5 | | albert-small | 74.45% | 86.08% |
6 | | electra-small | 76.64% | 87.49% |
7 | | albert-base | 80.17% | 89.87% |
8 | 
9 | ### Dev
10 | | model | EM | F1 |
11 | | :----:|:----: |:----: |
12 | | albert-small | 73.70% | 85.33% |
13 | | electra-small | 77.61% | 87.33% |
14 | | albert-base | 80.52% | 89.92% |
15 | 
--------------------------------------------------------------------------------
/docs/img/flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voidful/TFkit/5942b86e9132703ae4f328ba3d199c322b8cd1e4/docs/img/flow.png
--------------------------------------------------------------------------------
/docs/img/tfkit-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voidful/TFkit/5942b86e9132703ae4f328ba3d199c322b8cd1e4/docs/img/tfkit-icon.png
--------------------------------------------------------------------------------
/docs/img/tfkit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/voidful/TFkit/5942b86e9132703ae4f328ba3d199c322b8cd1e4/docs/img/tfkit.png
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | 

2 | [tfkit logo]
9 | [badges: PyPI · Download · Build · Last Commit]

21 | 
22 | ## Getting started
23 | 
24 | ### Installing via pip
25 | ```bash
26 | pip install tfkit
27 | ```
28 | 
29 | * You can use tfkit for model training and evaluation with `tfkit-train` and `tfkit-eval`.
30 | 
31 | ### Running TFKit on the task you want
32 | 
33 | ### First step - prepare your dataset
34 | The key to combining different tasks is to give every task the same data format.
35 | 
36 | **notice**
37 | 
38 | * All data is in csv format - tfkit uses **csv** for every task; normally it has two columns, where the first column is the model input and the second column is the model output.
39 | * Plain text with no tokenization - there is no need to tokenize the text before training or to redo tokenization later; tfkit handles it for you.
40 | * No header is needed.
41 | 
42 | For example, a sentiment classification dataset will look like:
43 | ```csv
44 | how dare you,negative
45 | ```
46 | 
47 | !!! hint
48 |     For the details and example formats of the different tasks, you can check [here](tasks/)
49 | 
50 | !!! hint
51 |     nlprep is a tool for data splitting/preprocessing/augmentation; it can help you create ready-to-train data for tfkit, check [here](https://github.com/voidful/NLPrep)
52 | 
53 | ### Second step - model training
54 | 
55 | Use `tfkit-train` for model training.
56 | 
57 | Before training a model, there are a few things you need to clarify:
58 | 
59 | - `--task` which model should handle this task? check [here](models/) for the details of the available models.
60 | - `--config` which pretrained model do you want to use? you can go to [https://huggingface.co/models](https://huggingface.co/models) to search for available pretrained models.
61 | - `--train` and `--test` training and testing dataset paths, in csv format.
62 | - `--savedir` model saving directory; defaults to the '/checkpoints' folder
63 | 
64 | you can leave the rest at the default config, or use `tfkit-train -h` for more configuration.
65 | 
66 | An example of training a sentiment classifier:
67 | ```bash
68 | tfkit-train \
69 | --task clas \
70 | --config xlm-roberta-base \
71 | --train training_data.csv \
72 | --test testing_data.csv \
73 | --lr 4e-5 \
74 | --maxlen 384 \
75 | --epoch 10 \
76 | --savedir roberta_sentiment_classifier
77 | ```
78 | 
79 | ### Third step - model evaluation
80 | 
81 | Use `tfkit-eval` for model evaluation.
82 | - `--model` saved model's path.
83 | - `--metric` the evaluation metric, eg: emf1, nlg(BLEU/ROUGE), clas(confusion matrix).
84 | - `--valid` validation data, also in csv format.
85 | - `--panel` an input panel for model-specific parameters.
86 | 
87 | for more configuration detail, you may use `tfkit-eval -h`.
88 | 
89 | After evaluation, it will print the evaluation result to your console and also generate three reports for debugging.
90 | - `*_score.csv` overall score; a copy of the console result.
91 | - `*each_data_score.csv` score for each example, 3 columns `predicted,targets,score`, ranked from lowest to highest.
92 | - `*predicted.csv` csv file with 3 columns `input,predicted,targets`.
93 | 
94 | !!! hint
95 |     nlp2go is a tool for demonstration, with CLI and RESTful interfaces, check [here](https://github.com/voidful/nlp2go)
96 | 
97 | ### Example
98 | #### Use distilbert to train an NER Model
99 | ```bash
100 | nlprep --dataset tag_clner --outdir ./clner_row --util s2t
101 | tfkit-train --batch 10 --epoch 3 --lr 5e-6 --train ./clner_row/train --test ./clner_row/test --maxlen 512 --task tag --config distilbert-base-multilingual-cased
102 | nlp2go --task ./checkpoints/3.pt --cli
103 | ```
104 | 
105 | #### Use Albert to train a DRCD Model
106 | ```bash
107 | nlprep --dataset qa_zh --outdir ./zhqa/
108 | tfkit-train --maxlen 512 --savedir ./drcd_qa_model/ --train ./zhqa/drcd-train --test ./zhqa/drcd-test --task qa --config voidful/albert_chinese_small --cache
109 | nlp2go --task ./drcd_qa_model/3.pt --cli
110 | ```
111 | 
112 | #### Use Albert to train both a DRCD Model and an NER Model
113 | ```bash
114 | nlprep --dataset tag_clner --outdir ./clner_row --util s2t
115 | nlprep --dataset qa_zh --outdir ./zhqa/
116 | tfkit-train --maxlen 300 --savedir ./mt-qaner --train ./clner_row/train ./zhqa/drcd-train --test ./clner_row/test ./zhqa/drcd-test --task tag qa --config voidful/albert_chinese_small
117 | nlp2go --task ./mt-qaner/3.pt --cli
118 | ```
119 | 
120 | **You can also try tfkit in Google Colab: [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg "tfkit")](https://colab.research.google.com/drive/1hqaTKxd3VtX2XkvjiO0FMtY-rTZX30MJ?usp=sharing)**
121 | 
122 | ## Contributing
123 | Thanks for your interest. There are many ways to contribute to this project. Get started [here](https://github.com/voidful/tfkit/blob/master/CONTRIBUTING.md).
124 | 
125 | ## License
126 | ![PyPI - License](https://img.shields.io/github/license/voidful/tfkit)
127 | 
128 | * [License](https://github.com/voidful/tfkit/blob/master/LICENSE)
129 | 
130 | ## Icons reference
131 | Icons modified from Freepik from www.flaticon.com
132 | Icons modified from Nikita Golubev from www.flaticon.com
133 | 
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 | tfkit is tested on Python 3.6+ and PyTorch 1.1.0+.
3 | 
4 | ### Installing via pip
5 | ```bash
6 | pip install tfkit
7 | ```
8 | ### Installing via source
9 | ```bash
10 | git clone https://github.com/voidful/tfkit.git
11 | python setup.py install
12 | # or
13 | pip install . 
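# optional smoke test (illustrative addition, not in the original docs):
# confirm the package imports after a source install
python -c "import tfkit"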
14 | ```
15 | 
16 | ## Running tfkit
17 | Once you've installed tfkit, you can run it with:
18 | 
19 | ### pip installed version:
20 | `tfkit-train`
21 | `tfkit-eval`
22 | `tfkit-dump`
23 | 
24 | ### local version:
25 | `python -m tfkit.train`
26 | `python -m tfkit.eval`
27 | `python -m tfkit.dump`
--------------------------------------------------------------------------------
/docs/models.md:
--------------------------------------------------------------------------------
1 | ## Models Overview
2 | 
3 | | task | available models |
4 | | ----------- | ------------------------------------ |
5 | | text generation | `seq2seq` `clm` `onebyone` `once` `oncectc` |
6 | | extractive question answering | `qa` |
7 | | multiple choice question answering | `mcq` |
8 | | sequence tagging | `tag` `tagcrf` |
9 | | sentence classification | `clas` |
10 | | mask language model | `mask` |
11 | 
12 | ## Text Generation
13 | ### `seq2seq`
14 | [comment]: <> (::: tfkit.model.seq2seq.model.Model.forward)
15 | [comment]: <> (::: tfkit.model.seq2seq.dataloader)
16 | encoder-decoder models for text generation, eg: T5/BART
17 | 
18 | ### `clm`
19 | causal language model, decoder-only models for text generation, eg: GPT
20 | 
21 | ### `onebyone`
22 | onebyone text generation, for mask lm generation.
23 | 
24 | ### `once`
25 | once text generation
26 | 
27 | ### `oncectc`
28 | once text generation with ctc loss
29 | 
30 | ## Extractive Question Answering
31 | ### `qa`
32 | SQuAD-like question answering
33 | 
34 | ## Multiple Choice Question Answering
35 | ### `mcq`
36 | softmax over the mask tokens in the input
37 | 
38 | ## Sequence Tagging
39 | ### `tag`
40 | token classification
41 | 
42 | ### `tagcrf`
43 | token classification with a crf layer
44 | 
45 | ## Sentence Classification
46 | ### `clas`
47 | sentence classification using the pooling head from transformer models.
48 | 
49 | ## Mask Language Model
50 | ### `mask`
51 | mask token prediction, for self-supervised learning
--------------------------------------------------------------------------------
/docs/structure.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | Flow
3 | ![Flow](https://raw.githubusercontent.com/voidful/TFkit/master/docs/img/flow.png)
4 | 
5 | Project directory:
6 | ```
7 | . 
8 | ├─ demo_data/ # Example data for training and evaluation 9 | ├─ docs/ # Documents 10 | ├─ tfkit/ 11 | │ ├─ model/ # all of the models, subdir name will be model name 12 | │ │ ├─ model_name # - name will be dynamic import to tfkit-train 13 | │ │ │ ├─ __init__.py 14 | │ │ │ ├─ dataloader.py # - for data loading and preprocessing 15 | │ │ │ └─ model.py # - model forward and prediction 16 | │ │ └─ __init__.py 17 | │ ├─ test/ # project unit test 18 | │ │ ├─ __init__.py 19 | │ │ ├─ test_atrain.py # - test tfkit-train 20 | │ │ ├─ test_dataloader.py # - test all model/*/dataloader.py 21 | │ │ ├─ test_model.py # - test all model/*/model.py 22 | │ │ ├─ test_package.py # - test package import 23 | │ │ ├─ test_utility_dataset.py # - test utility/dataset.py 24 | │ │ ├─ test_utility_eval_metric.py # - test utility/eval_metric.py 25 | │ │ ├─ test_utility_logger.py # - test utility/logger.py 26 | │ │ ├─ test_utility_loss.py # - test utility/loss.py 27 | │ │ ├─ test_utility_model_loader.py # - test utility/model_loader.py 28 | │ │ ├─ test_utility_tok.py # - test utility/predictor.py 29 | │ │ ├─ test_zeval.py # - test tfkit-eval 30 | │ │ └─ test_zzdump.py # - test tfkit-dump 31 | │ ├─ utility/ # project utility 32 | │ │ ├─ __init__.py 33 | │ │ ├─ dataset.py # - handle dataset loading 34 | │ │ ├─ eval_metric.py # - handle evaluation metric calculation 35 | │ │ ├─ logger.py # - handle logging and printing 36 | │ │ ├─ loss.py # - custom loss function 37 | │ │ ├─ model_loader.py # - handle model loading 38 | │ │ ├─ predictor.py # - handle model prediction 39 | │ │ └─ tok.py # - handle tokenization 40 | │ ├─ __init__.py # package init 41 | │ ├─ dump.py # tfkit-dump handler 42 | │ ├─ eval.py # tfkit-eval handler 43 | │ └─ train.py # tfkit-train handler 44 | ├─ Dockerfile # recommend docker file 45 | ├─ mkdocs.yml # document config 46 | ├─ README.md # project readme 47 | ├─ requirements.txt # package requirement 48 | └─ setup.py # package setup 49 | ``` -------------------------------------------------------------------------------- /docs/tasks.md: -------------------------------------------------------------------------------- 1 | ## Task format 2 | 3 | ### Classification 4 | 5 | !!! info 6 | #### multi-class classification: 7 | Format: 8 | `input sentence,label` 9 | 10 | Example: 11 | ``` 12 | Calotropis procera (ushaar) keratitis.,Not-Related 13 | ``` 14 | 15 | #### multi-label classification 16 | use `///` to separate each label. 17 | 18 | Format: 19 | `input sentence,label1///label2` 20 | 21 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/classification.csv): 22 | ``` 23 | We report two cases of pseudoporphyria caused by naproxen and oxaprozin.,Related///METHODS 24 | ``` 25 | 26 | ### Text Generation 27 | 28 | !!! info 29 | Format: 30 | `input sentence, target sentence` 31 | 32 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/generation.csv): 33 | ``` 34 | Peter was a truck driver . He was running a little behind on schedule . Peter decided to run past the weigh station . He was stopped by a cop .,"Peter ended up running late and getting a fine ." 35 | ``` 36 | 37 | ### Extractive Question Answering 38 | 39 | !!! info 40 | Format: 41 | `input sentence with question, answer start position, answer end position` 42 | 43 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/qa.csv): 44 | ``` 45 | Beyoncé announced a hiatus from her music ... 
Who suggested the hiatus for Beyoncé?, 74,84
46 | ```
47 | 
48 | ### Multiple-Choice Question Answering
49 | 
50 | !!! info
51 |     The input passage should include all available choices, and each choice must start with a mask token.
52 |     choice ids start from 0
53 | 
54 |     Format:
55 |     `input passage [MASK]choiceA [MASK]choiceB, 1`
56 | 
57 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/mcq.csv):
58 | ```
59 | "I 'm sure many of you have seen Star Wars ... What is the best title of the passage ? [MASK] What Is Human Cloning [MASK] How Does Human Cloning Happen [MASK] Human Cloning Is Wrong [MASK] Discussion On Human Cloning",2
60 | ```
61 | 
62 | ### Mask Language Modeling
63 | 
64 | !!! info
65 |     input sentence with masks (there can be more than one)
66 |     the targets for the masks should be separated by spaces
67 |     Format:
68 |     `input sentence with [MASK] [MASK],target_token target_token`
69 | 
70 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/mask.csv):
71 | ```
72 | "how did i [MASK] [MASK]","get here"
73 | ```
74 | 
75 | ### Sequence Tagging
76 | 
77 | !!! info
78 |     input sentence with a space between each word
79 |     target labels separated by spaces, one-to-one with the input words
80 |     Format:
81 |     `input sentence,tag tag`
82 | 
83 | [Example](https://github.com/voidful/TFkit/blob/master/tfkit/demo_data/tag.csv):
84 | ```
85 | "welcome to New York,O O B_place B_place"
86 | ```
87 | 
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | # Project information
2 | site_name: tfkit
3 | site_description: 🤖📇 Transformers kit - End2End toolkit for NLP task
4 | site_author: Voidful
5 | site_url: https://github.com/voidful/tfkit
6 | repo_name: tfkit
7 | repo_url: https://github.com/voidful/tfkit
8 | copyright: Copyright © Voidful
9 | 
10 | nav:
11 |   - Home: index.md
12 |   - Installation: installation.md
13 |   - Tasks: tasks.md
14 |   - Models: models.md
15 |   - Structure: structure.md
16 |   - Benchmark: benchmark.md
17 | 
18 | plugins:
19 |   - search
20 |   - mkdocstrings:
21 |       default_handler: python
22 |       handlers:
23 |         python:
24 |           setup_commands:
25 |             - import sys
26 |             - sys.path.append("docs")
27 |           rendering:
28 |             show_root_heading: True
29 |             heading_level: 3
30 |             show_source: false
31 |       watch:
32 |         - tfkit
33 | 
34 | theme:
35 |   name: material
36 |   language: en
37 |   palette:
38 |     primary: blue grey
39 |     accent: blue grey
40 |   font:
41 |     text: Roboto
42 |     code: Roboto Mono
43 |   logo: img/tfkit-icon.png
44 |   favicon: img/tfkit-icon.png
45 | 
46 | # Extras
47 | extra:
48 |   social:
49 |     - icon: fontawesome/brands/github-alt
50 |       link: https://github.com/voidful/tfkit
51 |     - icon: fontawesome/brands/twitter
52 |       link: https://twitter.com/voidful_stack
53 |     - icon: fontawesome/brands/linkedin
54 |       link: https://www.linkedin.com/in/voidful/
55 |   version:
56 |     provider: mike
57 | 
58 | # Google Analytics
59 | google_analytics:
60 |   - UA-127062540-5
61 |   - auto
62 | 
63 | # Extensions
64 | markdown_extensions:
65 |   - markdown.extensions.admonition
66 |   - markdown.extensions.attr_list
67 |   - markdown.extensions.codehilite:
68 |       guess_lang: false
69 |   - markdown.extensions.def_list
70 |   - markdown.extensions.footnotes
71 |   - markdown.extensions.meta
72 |   - markdown.extensions.toc:
73 |       permalink: true
74 |   - pymdownx.arithmatex
75 |   - pymdownx.betterem:
76 |       smart_enable: all
77 |   - pymdownx.caret
78 |   - pymdownx.critic
79 |   - pymdownx.details
80 |   - pymdownx.emoji:
81 |       emoji_index: 
!!python/name:materialx.emoji.twemoji
82 |       emoji_generator: !!python/name:materialx.emoji.to_svg
83 | #  - pymdownx.highlight:
84 | #      linenums_style: pymdownx-inline
85 |   - pymdownx.inlinehilite
86 |   - pymdownx.keys
87 |   - pymdownx.magiclink:
88 |       repo_url_shorthand: true
89 |       user: squidfunk
90 |       repo: mkdocs-material
91 |   - pymdownx.mark
92 |   - pymdownx.smartsymbols
93 |   - pymdownx.snippets:
94 |       check_paths: true
95 |   - pymdownx.superfences
96 |   - pymdownx.tabbed
97 |   - pymdownx.tasklist:
98 |       custom_checkbox: true
99 |   - pymdownx.tilde
100 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers>=3.3.0
2 | tensorboard
3 | tensorboardX
4 | torch
5 | matplotlib
6 | nlp2>=1.8.44
7 | tqdm>=4.45.0
8 | inquirer
9 | numpy
10 | scipy>=1.10.1
11 | pytorch-crf
12 | sentencepiece
13 | pandas
14 | accelerate>=0.5.1
15 | joblib
16 | scikit-learn
17 | editdistance
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | with open('requirements.txt') as f:
4 |     required = f.read().splitlines()
5 | 
6 | setup(
7 |     name='tfkit',
8 |     version='0.8.20',
9 |     description='Transformers kit - Multi-task QA/Tagging/Multi-label Multi-Class Classification/Generation with BERT/ALBERT/T5/BART',
10 |     url='https://github.com/voidful/TFkit',
11 |     author='Voidful',
12 |     author_email='voidful.stack@gmail.com',
13 |     long_description=open("README.md", encoding="utf8").read(),
14 |     long_description_content_type="text/markdown",
15 |     setup_requires=['setuptools-git'],
16 |     classifiers=[
17 |         'Development Status :: 4 - Beta',
18 |         "Intended Audience :: Science/Research",
19 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
20 |         "License :: OSI Approved :: Apache Software License",
21 |         'Programming Language :: Python :: 3.6'
22 |     ],
23 |     license="Apache",
24 |     keywords='transformer huggingface nlp multi-task multi-class multi-label classification generation tagging deep learning machine reading',
25 |     packages=find_packages(),
26 |     install_requires=required,
27 |     entry_points={
28 |         'console_scripts': ['tfkit-train=tfkit.train:main', 'tfkit-eval=tfkit.eval:main', 'tfkit-dump=tfkit.dump:main']
29 |     },
30 |     py_modules=['tfkit'],
31 |     python_requires=">=3.5.0",
32 |     zip_safe=False,
33 | )
34 | 
--------------------------------------------------------------------------------
/tfkit/__init__.py:
--------------------------------------------------------------------------------
1 | import tfkit.utility
2 | import tfkit.dump
3 | import tfkit.train
4 | import tfkit.eval
5 | from tfkit.task import *
--------------------------------------------------------------------------------
/tfkit/dump.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | 
4 | from transformers import AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, \
5 |     AutoModelForCausalLM
6 | 
7 | from tfkit.utility.model import load_trained_model, add_tokens_to_pretrain
8 | 
9 | 
10 | def parse_dump_args(args):
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument("--model", required=True, type=str)
13 |     parser.add_argument("--dumpdir", required=True, type=str)
14 |     return vars(parser.parse_args(args))
15 | 
16 | 
17 | def main(arg=None):
18 |     arg = 
parse_dump_args(sys.argv[1:]) if arg is None else parse_dump_args(arg) 19 | model, model_type, model_class, model_info, model_preprocessor = load_trained_model(arg.get('model')) 20 | tokenizer = model.tokenizer 21 | pretrained_config = model_info.get("model_config") 22 | if model_type == 'clm': 23 | hf_model = AutoModelForCausalLM.from_pretrained(model_info.get("model_config")) 24 | hf_model.eval() 25 | hf_model.transformer = model.pretrained 26 | if hasattr(hf_model, 'lm_head'): 27 | hf_model.lm_head.weight = model.model.weight 28 | else: 29 | hf_model.cls.weight = model.model.weight 30 | hf_model.config.tie_word_embeddings = False 31 | hf_model, tokenizer = add_tokens_to_pretrain(hf_model, tokenizer, model_info.get('add_tokens', [])) 32 | hf_model.save_pretrained(arg.get('dumpdir')) 33 | elif model_type == 'seq2seq': 34 | hf_model = AutoModelForSeq2SeqLM.from_pretrained(model_info.get("model_config")) 35 | hf_model.eval() 36 | hf_model.model = model.pretrained 37 | hf_model.lm_head = model.model 38 | hf_model.config.tie_word_embeddings = False 39 | hf_model.config.tie_encoder_decoder = False 40 | hf_model, tokenizer = add_tokens_to_pretrain(hf_model, tokenizer, model_info.get('add_tokens', [])) 41 | hf_model.save_pretrained(arg.get('dumpdir')) 42 | elif model_type == 'clas': 43 | hf_model = AutoModelForSequenceClassification.from_pretrained(model_info.get("model_config")) 44 | hf_model.classifier.weight = model.classifier_list[0].weight 45 | hf_model.save_pretrained(arg.get('dumpdir')) 46 | else: 47 | model.pretrained.save_pretrained(arg.get('dumpdir')) 48 | 49 | tokenizer.save_pretrained(arg.get('dumpdir')) 50 | print('==================') 51 | print("Finish model dump.") 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /tfkit/eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import logging 4 | import sys 5 | import time 6 | from datetime import timedelta 7 | 8 | import nlp2 9 | import torch 10 | from tqdm.auto import tqdm 11 | 12 | from tfkit.utility.eval_metric import EvalMetric 13 | from tfkit.utility.model import load_trained_model, load_predict_parameter 14 | 15 | transformers_logger = logging.getLogger('transformers') 16 | transformers_logger.setLevel(logging.CRITICAL) 17 | 18 | 19 | def parse_eval_args(args): 20 | parser = argparse.ArgumentParser() 21 | group = parser.add_mutually_exclusive_group(required=True) 22 | group.add_argument("--model", nargs='+', type=str, help="evaluation model") 23 | parser.add_argument("--config", type=str, help='pre-trained task path after add token') 24 | parser.add_argument("--metric", required=True, type=str, choices=['emf1', 'nlg', 'clas', 'er'], 25 | help="evaluate metric") 26 | parser.add_argument("--valid", required=True, type=str, nargs='+', help="evaluate data path") 27 | parser.add_argument("--tag", type=str, help="evaluate task tag for select multi-task task") 28 | parser.add_argument("--print", action='store_true', help="print each pair of evaluate data") 29 | parser.add_argument("--panel", action='store_true', help="enable panel to input argument") 30 | 31 | input_arg, model_arg = parser.parse_known_args(args) 32 | input_arg = {k: v for k, v in vars(input_arg).items() if v is not None} 33 | model_arg = {k.replace("--", ""): v for k, v in zip(model_arg[:-1:2], model_arg[1::2])} 34 | return input_arg, model_arg 35 | 36 | 37 | def main(arg=None): 38 | with torch.no_grad(): 
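        # Everything below is pure inference: wrapping the evaluation loop in
        # torch.no_grad() turns off gradient tracking, which lowers memory use
        # and speeds up every model.predict() call in this function.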
39 | eval_arg, model_arg = parse_eval_args(sys.argv[1:]) if arg is None else parse_eval_args(arg) 40 | models_path = eval_arg.get('model', []) 41 | 42 | if nlp2.is_dir_exist(models_path[0]): 43 | models = [f for f in nlp2.get_files_from_dir(models_path[0]) if f.endswith('.pt')] 44 | else: 45 | models = models_path 46 | 47 | for model_path in models: 48 | start_time = time.time() 49 | valid = eval_arg.get('valid')[0] 50 | model, model_type, model_class, model_info, preprocessor = load_trained_model(model_path, 51 | pretrained_config=eval_arg.get( 52 | 'config'), 53 | tag=eval_arg.get('tag')) 54 | predict_parameter = load_predict_parameter(model, model_arg, eval_arg.get('panel')) 55 | 56 | eval_metrics = [EvalMetric(model.tokenizer) 57 | for _ in range(int(predict_parameter.get('decodenum', 1)))] 58 | 59 | print("PREDICT PARAMETER") 60 | print("=======================") 61 | print(predict_parameter) 62 | print("=======================") 63 | 64 | get_data_item = preprocessor.read_file_to_data(valid) 65 | for chunk in tqdm(get_data_item): 66 | for i in chunk: 67 | input = i['input'] 68 | target = i['target'] 69 | predict_parameter.update({'input': input}) 70 | result, result_dict = model.predict(**predict_parameter) 71 | for eval_pos, eval_metric in enumerate(eval_metrics): 72 | # predicted can be list of string or string 73 | # target should be list of string 74 | predicted = result 75 | processed_target = target 76 | if 'qa' in model_type: 77 | processed_target = " ".join(input.split(" ")[int(target[0]): int(target[1])]) 78 | if len(result) > 0: 79 | predicted = result[0][0] if isinstance(result[0], list) else result[0] 80 | else: 81 | predicted = '' 82 | elif 'onebyone' in model_type or 'seq2seq' in model_type or 'clm' in model_type: 83 | processed_target = target 84 | if len(result) < eval_pos: 85 | print("Decode size smaller than decode num:", result_dict['label_map']) 86 | predicted = result[eval_pos] 87 | elif 'once' in model_type: 88 | processed_target = target 89 | predicted = result[eval_pos] 90 | elif 'mask' in model_type: 91 | processed_target = target.split(" ") 92 | predicted = result 93 | elif 'tag' in model_type: 94 | predicted = " ".join([list(d.values())[0] for d in result_dict[0]['label_map']]) 95 | processed_target = target[0].split(" ") 96 | predicted = predicted.split(" ") 97 | 98 | if eval_arg.get('print'): 99 | print('===eval===') 100 | print("input: ", input) 101 | print("target: ", processed_target) 102 | print("predicted: ", predicted) 103 | print('==========') 104 | 105 | eval_metric.add_record(input, predicted, processed_target, eval_arg.get('metric')) 106 | 107 | for eval_pos, eval_metric in enumerate(eval_metrics): 108 | argtype = f"_dataset{valid.replace('/', '_').replace('.', '_')}" 109 | if 'decodenum' in predict_parameter and int(predict_parameter['decodenum']) > 1: 110 | argtype += f"_num_{eval_pos}" 111 | if 'mode' in predict_parameter: 112 | para_mode = predict_parameter['mode'][0] if isinstance(predict_parameter['mode'], list) else \ 113 | predict_parameter['mode'].lower() 114 | argtype += f"_mode_{para_mode}" 115 | if 'filtersim' in predict_parameter: 116 | argtype += f"_filtersim_{predict_parameter['filtersim']}" 117 | outfile_name = f"{model_path}{argtype}" 118 | 119 | with open(f"{outfile_name}_predicted.csv", "w", encoding='utf8') as f: 120 | writer = csv.writer(f) 121 | records = eval_metric.get_record(eval_arg.get('metric')) 122 | writer.writerow(['input', 'predicted', 'targets']) 123 | for i, p, t in zip(records['ori_input'], 
records['ori_predicted'], records['ori_target']): 124 | writer.writerow([i, p, t]) 125 | print("write result at:", outfile_name) 126 | 127 | with open(f"{outfile_name}_each_data_score.csv", "w", encoding='utf8') as edsf: 128 | eds = csv.writer(edsf) 129 | with open(f"{outfile_name}_score.csv", "w", encoding='utf8') as f: 130 | for i in eval_metric.cal_score(eval_arg.get('metric')): 131 | f.write(f"TASK: {i[0]} , {eval_pos}\n") 132 | f.write(f"{i[1]}\n") 133 | eds.writerows(i[2]) 134 | 135 | print("write score at:", outfile_name) 136 | 137 | for i in eval_metric.cal_score(eval_arg.get('metric')): 138 | print("TASK: ", i[0], eval_pos) 139 | print(i[1]) 140 | 141 | print(f"=== Execution time: {timedelta(seconds=(time.time() - start_time))} ===") 142 | 143 | 144 | if __name__ == '__main__': 145 | main() 146 | -------------------------------------------------------------------------------- /tfkit/task/__init__.py: -------------------------------------------------------------------------------- 1 | import os, pkgutil 2 | 3 | __all__ = list(module for _, module, _ in pkgutil.iter_modules([os.path.dirname(__file__)])) 4 | -------------------------------------------------------------------------------- /tfkit/task/clas/__init__.py: -------------------------------------------------------------------------------- 1 | from .preprocessor import Preprocessor 2 | from .model import Model 3 | -------------------------------------------------------------------------------- /tfkit/task/clas/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from torch import nn 6 | 7 | from tfkit.utility.predictor import ClassificationPredictor 8 | 9 | dir_path = os.path.dirname(os.path.realpath(__file__)) 10 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir))) 11 | 12 | from torch import softmax, sigmoid 13 | from tfkit.task.clas import Preprocessor 14 | from tfkit.utility.loss import FocalLoss, BCEFocalLoss 15 | 16 | 17 | class Model(nn.Module): 18 | 19 | def __init__(self, tokenizer, pretrained, tasks_detail, maxlen=512, dropout=0.1, **kwargs): 20 | super().__init__() 21 | self.tokenizer = tokenizer 22 | self.pretrained = pretrained 23 | 24 | self.dropout = nn.Dropout(dropout) 25 | self.loss_fct = FocalLoss() 26 | self.loss_fct_mt = BCEFocalLoss() 27 | 28 | self.tasks = dict() 29 | self.tasks_detail = tasks_detail 30 | self.classifier_list = nn.ModuleList() 31 | for task, labels in tasks_detail.items(): 32 | self.classifier_list.append(nn.Linear(self.pretrained.config.hidden_size, len(labels))) 33 | self.tasks[task] = len(self.classifier_list) - 1 34 | self.maxlen = maxlen 35 | 36 | self.pretrained = self.pretrained 37 | self.classifier_list = self.classifier_list 38 | self.loss_fct = self.loss_fct 39 | self.loss_fct_mt = self.loss_fct_mt 40 | 41 | predictor = ClassificationPredictor(self, Preprocessor) 42 | self.predictor = predictor 43 | self.predict = predictor.predict 44 | 45 | def get_all_task(self): 46 | """ 47 | list all classification task 48 | :return: tasks list 49 | """ 50 | return list(self.tasks.keys()) 51 | 52 | def mean_pooling(self, model_output, attention_mask): 53 | """ 54 | Mean Pooling - Take attention mask into account for correct averaging 55 | from https://github.com/UKPLab/sentence-transformers 56 | modify - mask from -1 to 0 57 | :param model_output: 58 | :param attention_mask: 59 | :return: 60 | """ 61 | input_mask_expanded = 
attention_mask.unsqueeze(-1).expand(model_output.size()).float()
62 |         input_mask_expanded[input_mask_expanded < 0] = 0
63 |         sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
64 |         sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
65 |         return sum_embeddings / sum_mask
66 | 
67 |     def forward(self, batch_data, eval=False, **kwargs):
68 |         # convert input to correct data type
69 |         tasks = batch_data['task']
70 |         tasks = [bytes(t).decode(encoding="utf-8", errors="ignore") for t in tasks]
71 |         inputs = torch.as_tensor(batch_data['input'])
72 |         targets = torch.as_tensor(batch_data['target'])
73 |         masks = torch.as_tensor(batch_data['mask'])
74 |         # define model output
75 |         result_dict = {
76 |             'max_item': [],
77 |             'prob_list': [],
78 |             'label_prob': []
79 |         }
80 | 
81 |         result_logits = []
82 |         result_labels = []
83 |         for p, zin in enumerate(zip(tasks, inputs, masks)):
84 |             task, input, mask = zin
85 |             task_id = self.tasks[task]
86 |             task_labels = self.tasks_detail[task]
87 |             output = self.pretrained(input.unsqueeze(0), mask.unsqueeze(0))[0]
88 |             pooled_output = self.dropout(self.mean_pooling(output, mask.unsqueeze(0)))
89 |             classifier_output = self.classifier_list[task_id](pooled_output)
90 |             reshaped_logit = classifier_output.view(-1, len(task_labels))  # 0 for cls position
91 |             result_logits.append(reshaped_logit)
92 |             if not eval:
93 |                 target = targets[p]
94 |                 result_labels.append(target)
95 |             else:
96 |                 if 'multi_label' in task:
97 |                     reshaped_logit = sigmoid(reshaped_logit)
98 |                 else:
99 |                     reshaped_logit = softmax(reshaped_logit, dim=1)
100 |                 logit_prob = reshaped_logit[0].data.tolist()
101 |                 logit_label = dict(zip(task_labels, logit_prob))
102 |                 result_dict['label_prob'].append({task: logit_label})
103 |                 if 'multi_label' in task:
104 |                     result_dict['max_item'].append({task: [k for k, v in logit_label.items() if v > 0.5]})
105 |                 else:
106 |                     result_dict['max_item'].append({task: [task_labels[logit_prob.index(max(logit_prob))]]})
107 | 
108 |         if eval:
109 |             outputs = result_dict
110 |         else:
111 |             loss = 0
112 |             for logit, labels, task in zip(result_logits, result_labels, tasks):
113 |                 if 'multi_label' in task:
114 |                     loss += self.loss_fct_mt(logit, labels.type_as(logit))
115 |                 else:
116 |                     loss += self.loss_fct(logit, labels)
117 |             outputs = loss
118 | 
119 |         return outputs
120 | 
--------------------------------------------------------------------------------
/tfkit/task/clas/preprocessor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from sklearn.preprocessing import MultiLabelBinarizer
3 | 
4 | from tfkit.utility import tok
5 | from tfkit.utility.data_filereader import get_multiclas_data_from_file
6 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
7 | 
8 | 
9 | class Preprocessor(GeneralNLPPreprocessor):
10 | 
11 |     def read_file_to_data(self, path):
12 |         return get_multiclas_data_from_file(path)
13 | 
14 |     def preprocess_component_convert_to_id(self, item, **param_dict):
15 |         item['input'] = self.tokenizer.convert_tokens_to_ids(item['input'])
16 |         yield item
17 | 
18 |     def postprocess(self, item, tokenizer, maxlen, **kwargs):
19 |         tinput, task = item['input'], item['task']
20 |         row_dict = {'task': list(task.encode("utf-8"))}
21 |         tokenized_input_id = [tok.tok_begin_id(tokenizer)] + tinput + [tok.tok_sep_id(tokenizer)]
22 |         mask_id = [1] * len(tokenized_input_id)
23 |         row_dict['input'] = tokenized_input_id
24 |         row_dict['mask'] = mask_id
25 |         row_dict['target'] = [-1]
26 |         if 'target' in item:
27 |             target = 
item['target'] 28 | if 'multi_label' in task: 29 | mlb = MultiLabelBinarizer(classes=item['task_dict'][task]) 30 | tar = mlb.fit_transform([target]) 31 | tokenize_label = tar 32 | else: 33 | tokenize_label = [item['task_dict'][task].index(target[0])] 34 | row_dict['target'] = tokenize_label 35 | return {key: torch.tensor(value) for key, value in row_dict.items()} 36 | -------------------------------------------------------------------------------- /tfkit/task/clm/__init__.py: -------------------------------------------------------------------------------- 1 | from .preprocessor import Preprocessor 2 | from .model import Model 3 | -------------------------------------------------------------------------------- /tfkit/task/clm/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from tfkit.task.clm import Preprocessor 5 | from tfkit.utility.predictor import AutoRegressivePredictor 6 | 7 | dir_path = os.path.dirname(os.path.realpath(__file__)) 8 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir))) 9 | 10 | import torch 11 | from torch import nn 12 | from torch.nn.functional import softmax 13 | 14 | 15 | class Model(nn.Module): 16 | def __init__(self, tokenizer, pretrained, maxlen=512, **kwargs): 17 | super().__init__() 18 | self.tokenizer = tokenizer 19 | self.pretrained = pretrained 20 | self.vocab_size = max(self.pretrained.config.vocab_size, self.tokenizer.__len__()) 21 | self.model = nn.Linear(self.pretrained.config.hidden_size, self.vocab_size) 22 | self.maxlen = maxlen 23 | predictor = AutoRegressivePredictor(self, Preprocessor) 24 | self.predictor = predictor 25 | self.predict = predictor.predict 26 | 27 | def clean_cache(self): 28 | self.encoder_outputs = None 29 | self.past_key_values = None 30 | 31 | def forward(self, batch_data, eval=False, beamsearch=False, max_return=1, **kwargs): 32 | inputs = batch_data['input'] 33 | masks = batch_data['mask'] 34 | tokens_tensor = torch.as_tensor(inputs) 35 | mask_tensors = torch.as_tensor(masks) 36 | 37 | outputs = self.pretrained(tokens_tensor, attention_mask=mask_tensors) 38 | prediction_scores = self.model(outputs[0]) 39 | 40 | if eval: 41 | result_dict = {} 42 | start = batch_data['start'][0] 43 | softmax_score = softmax(prediction_scores[0][start], dim=-1).flatten() 44 | max_item_id = torch.argmax(softmax_score, -1).item() 45 | max_item_prob = softmax_score[max_item_id].item() 46 | result_dict['max_item'] = (self.tokenizer.convert_ids_to_tokens(max_item_id), max_item_prob) 47 | if max_return > 1: 48 | topK = torch.topk(softmax_score, max_return) 49 | prob_result = [(self.tokenizer.convert_ids_to_tokens(tid), prob) for prob, tid in 50 | zip(topK.values.data.tolist(), topK.indices.data.tolist())] 51 | result_dict['prob_list'] = softmax_score.data.tolist()[:max_return] 52 | result_dict['label_prob'] = prob_result 53 | outputs = result_dict 54 | else: 55 | targets = batch_data['target'] 56 | loss_tensors = torch.as_tensor(targets) 57 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # -1 index = padding token 58 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size), 59 | loss_tensors.view(-1)) 60 | 61 | outputs = masked_lm_loss 62 | return outputs 63 | -------------------------------------------------------------------------------- /tfkit/task/clm/preprocessor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from tfkit.utility.data_filereader import get_gen_data_from_file 4 
| from tfkit.utility.data_processor import GeneralNLPPreprocessor 5 | 6 | 7 | class Preprocessor(GeneralNLPPreprocessor): 8 | def read_file_to_data(self, path): 9 | return get_gen_data_from_file(path) 10 | 11 | def preprocess_component_convert_to_id(self, item, **param_dict): 12 | tokenized_input, target = item['input'], item.get('target', None) 13 | tokenized_target = self.tokenizer.tokenize(target) if target else None 14 | previous = item.get("previous", []) 15 | if tokenized_target is None: 16 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input), 17 | 'previous': self.tokenizer.convert_tokens_to_ids(previous)} 18 | else: 19 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input), 20 | 'previous': self.tokenizer.convert_tokens_to_ids(previous), 21 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target)} 22 | 23 | def postprocess(self, item, tokenizer, maxlen, **kwargs): 24 | t_input_id, previous = item['input'], item['previous'] 25 | row_dict = {} 26 | if 'target' in item: 27 | target = item['target'] 28 | t_target_id = [-1] * len(t_input_id) 29 | mask_id = [0] * (len(t_target_id)) 30 | t_target_id += target + [self.tok_sep_id] 31 | mask_id += [1] * (len(target + [self.tok_sep_id])) 32 | 33 | row_dict['start'] = [len(t_input_id)] 34 | t_input_id += [self.tok_bos_id] + target 35 | mask_id = [1] * (len(t_input_id)) 36 | row_dict['target'] = t_target_id 37 | else: 38 | t_prev_id = [self.tok_sep_id] + previous 39 | t_input_id.extend(t_prev_id) 40 | mask_id = [1] * (len(t_input_id)) 41 | row_dict['start'] = [len(t_input_id) - 1] 42 | row_dict['input'] = t_input_id 43 | row_dict['mask'] = mask_id 44 | row_dict['target_pad'] = [-1] 45 | return {key: torch.tensor(value) for key, value in row_dict.items()} 46 | -------------------------------------------------------------------------------- /tfkit/task/once/__init__.py: -------------------------------------------------------------------------------- 1 | from .preprocessor import Preprocessor 2 | from .model import Model 3 | -------------------------------------------------------------------------------- /tfkit/task/once/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from collections import defaultdict 4 | 5 | from tfkit.task.once import Preprocessor 6 | from tfkit.utility.predictor import NonAutoRegressivePredictor 7 | 8 | dir_path = os.path.dirname(os.path.realpath(__file__)) 9 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir))) 10 | 11 | from torch.nn.functional import softmax 12 | from tfkit.utility.loss import * 13 | from tfkit.utility.tok import * 14 | 15 | 16 | class Model(nn.Module): 17 | def __init__(self, tokenizer, pretrained, maxlen=512, tasks_detail=None): 18 | super().__init__() 19 | self.tokenizer = tokenizer 20 | self.pretrained = pretrained 21 | self.vocab_size = max(self.pretrained.config.vocab_size, self.tokenizer.__len__()) 22 | self.model = nn.Linear(self.pretrained.config.hidden_size, self.vocab_size) 23 | self.maxlen = maxlen 24 | 25 | predictor = NonAutoRegressivePredictor(self, Preprocessor) 26 | self.predictor = predictor 27 | self.predict = predictor.predict 28 | 29 | def clean_cache(self): 30 | self.encoder_outputs = None 31 | self.past_key_values = None 32 | 33 | def forward(self, batch_data, eval=False, max_return=1, **kwargs): 34 | inputs = batch_data['input'] 35 | masks = batch_data['mask'] 36 | starts = batch_data['start'] 37 | ends = batch_data['end'] 38 | tokens_tensor = 
torch.as_tensor(inputs) 39 | mask_tensors = torch.as_tensor(masks) 40 | 41 | output = self.pretrained(tokens_tensor, attention_mask=mask_tensors) 42 | sequence_output = output[0] 43 | prediction_scores = self.model(sequence_output) 44 | 45 | if eval: 46 | result_dict = { 47 | 'max_item': [], 48 | 'label_prob': defaultdict(list), 49 | 'prob_list': [] 50 | } 51 | start = batch_data['start'][0] 52 | stop = False 53 | topK_ids = [[]] * max_return 54 | topK_probs = [1] * max_return 55 | while start < self.maxlen and not stop: 56 | softmax_score = softmax(prediction_scores[0][start], dim=0) 57 | max_item_id = torch.argmax(softmax_score, -1).item() 58 | max_item_prob = softmax_score[max_item_id].item() 59 | if max_return > 1: 60 | topK = torch.topk(softmax_score, max_return) 61 | for k, (prob, tid) in enumerate(zip(topK.values.data.tolist(), topK.indices.data.tolist())): 62 | topK_ids[k].append(tid) 63 | topK_probs[k] *= prob 64 | else: 65 | topK_ids[0].append(max_item_id) 66 | topK_probs[0] *= max_item_prob 67 | 68 | if tok_sep_id(self.tokenizer) == max_item_id: 69 | stop = True 70 | start += 1 71 | result_dict['prob_list'] = topK_probs 72 | result_dict['label_prob'] = [[self.tokenizer.decode(ids), prob] for ids, prob in 73 | zip(topK_ids, topK_probs)] 74 | result_dict['max_item'] = [i[0] for i in result_dict['label_prob']] 75 | outputs = result_dict 76 | else: 77 | targets = batch_data['target'] 78 | negative_targets = batch_data['ntarget'] 79 | loss_tensors = torch.as_tensor(targets) 80 | negativeloss_tensors = torch.as_tensor(negative_targets) 81 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # -1 index = padding token 82 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size), 83 | loss_tensors.view(-1)) 84 | if not torch.all(negativeloss_tensors.eq(-1)).item(): 85 | negative_loss_fct = NegativeCElLoss() 86 | negative_loss = negative_loss_fct(prediction_scores.view(-1, self.vocab_size), 87 | negativeloss_tensors.view(-1)) 88 | masked_lm_loss += negative_loss 89 | outputs = masked_lm_loss 90 | 91 | return outputs 92 | -------------------------------------------------------------------------------- /tfkit/task/once/preprocessor.py: -------------------------------------------------------------------------------- 1 | import tfkit.utility.tok as tok 2 | from tfkit.utility.data_filereader import get_gen_data_from_file 3 | from tfkit.utility.data_processor import GeneralNLPPreprocessor 4 | 5 | 6 | class Preprocessor(GeneralNLPPreprocessor): 7 | def read_file_to_data(self, path): 8 | return get_gen_data_from_file(path) 9 | 10 | def set_global_parameters(self): 11 | self.tokenize_target = True 12 | 13 | def preprocess_component_convert_to_id(self, item, likelihood=['none', 'pos', 'neg', 'both'], **param_dict): 14 | likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood 15 | tokenized_input, tokenized_target, n_target = item['input'], item.get('target', None), item.get('ntarget', None) 16 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input), 17 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target)} 18 | if "neg" in likelihood: 19 | # formatting neg data in csv 20 | if n_target is None: 21 | ntext_arr = [ 22 | tok.tok_sep(self.tokenizer) + self.tokenizer.convert_tokens_to_string(tokenized_target)] 23 | elif tok.tok_sep(self.tokenizer) in n_target: 24 | ntext_arr = [ntext.strip() for ntext in n_target.split(tok.tok_sep(self.tokenizer))] 25 | else: 26 | ntext_arr = [n_target.strip()] 27 | for neg_text in ntext_arr: 28 | yield 
{'input': self.tokenizer.convert_tokens_to_ids(tokenized_input), 29 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target), 30 | 'ntarget': self.tokenizer.convert_tokens_to_ids(neg_text)} 31 | 32 | def postprocess(self, item, tokenizer, maxlen, **kwargs): 33 | tok_pad = tok.tok_pad_id(tokenizer) 34 | tok_bos = tok.tok_begin_id(tokenizer) 35 | tok_sep = tok.tok_sep_id(tokenizer) 36 | tok_mask = tok.tok_mask_id(tokenizer) 37 | 38 | row_dict = {} 39 | t_input_id = item['input'] 40 | encoder_mask_id = [1] * (len(t_input_id)) 41 | encoder_mask_id.extend([0] * (maxlen - len(encoder_mask_id))) 42 | target_start = len(t_input_id) 43 | target_end = maxlen 44 | target_length = target_end - target_start 45 | t_input_id.extend([tok_pad] * (maxlen - len(t_input_id))) 46 | if 'target' in item and item['target'] is not None: 47 | target = item['target'] + [tok_sep] 48 | target.extend([-1] * (maxlen - len(target))) 49 | row_dict['target'] = target 50 | row_dict['ntarget'] = [-1] * maxlen 51 | if 'ntarget' in item and len(item['ntarget'].strip()) > 0: 52 | tokenized_ntarget_id = item['ntarget'] 53 | tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id))) 54 | if len(tokenized_ntarget_id) <= maxlen: 55 | row_dict['ntarget'] = tokenized_ntarget_id 56 | 57 | input_length = min(maxlen, target_start * 3) 58 | row_dict['input'] = t_input_id 59 | row_dict['mask'] = encoder_mask_id 60 | row_dict['start'] = target_start 61 | row_dict['end'] = maxlen 62 | row_dict['input_length'] = input_length 63 | row_dict['target_length'] = target_length 64 | return row_dict 65 | -------------------------------------------------------------------------------- /tfkit/task/oncectc/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import Model 2 | from tfkit.task.once.preprocessor import Preprocessor 3 | -------------------------------------------------------------------------------- /tfkit/task/oncectc/model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from collections import defaultdict 4 | 5 | from tfkit.task.once import Preprocessor 6 | from tfkit.utility.predictor import NonAutoRegressivePredictor 7 | 8 | dir_path = os.path.dirname(os.path.realpath(__file__)) 9 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir))) 10 | 11 | from torch.nn.functional import softmax 12 | from tfkit.utility.loss import * 13 | from tfkit.utility.tok import * 14 | from tfkit.utility.loss import SeqCTCLoss 15 | 16 | 17 | class Model(nn.Module): 18 | def __init__(self, tokenizer, pretrained, maxlen=512, tasks_detail=None): 19 | super().__init__() 20 | self.tokenizer = tokenizer 21 | self.pretrained = pretrained 22 | self.maxlen = maxlen 23 | self.blank_token = "" 24 | self.tokenizer.add_tokens(self.blank_token) 25 | self.pretrained.resize_token_embeddings(len(tokenizer)) 26 | self.blank_index = self.tokenizer.convert_tokens_to_ids([self.blank_token])[0] 27 | self.loss = SeqCTCLoss(blank_index=self.blank_index) 28 | self.vocab_size = max(self.pretrained.config.vocab_size, self.tokenizer.__len__()) 29 | self.model = nn.Linear(self.pretrained.config.hidden_size, self.vocab_size) 30 | predictor = NonAutoRegressivePredictor(self, Preprocessor) 31 | self.predictor = predictor 32 | self.predict = predictor.predict 33 | 34 | def clean_cache(self): 35 | self.encoder_outputs = None 36 | self.past_key_values = None 37 | 38 | def forward(self, batch_data, eval=False, max_return=1, 
**kwargs): 39 | inputs = batch_data['input'] 40 | masks = batch_data['mask'] 41 | starts = batch_data['start'] 42 | ends = batch_data['end'] 43 | tokens_tensor = torch.as_tensor(inputs) 44 | mask_tensors = torch.as_tensor(masks) 45 | 46 | output = self.pretrained(tokens_tensor, attention_mask=mask_tensors) 47 | sequence_output = output[0] 48 | prediction_scores = self.model(sequence_output) 49 | batch_size = list(tokens_tensor.shape)[0] 50 | prediction_scores = prediction_scores.view(batch_size, -1, self.vocab_size) 51 | 52 | if eval: 53 | result_dict = { 54 | 'max_item': [], 55 | 'label_prob': defaultdict(list), 56 | 'prob_list': [] 57 | } 58 | start = batch_data['start'][0] 59 | topK_ids = [[]] * max_return 60 | topK_probs = [1] * max_return 61 | 62 | pscore = prediction_scores.detach().cpu() 63 | predicted_indexs = pscore.argmax(2).tolist()[0] 64 | predicted_tokens = self.tokenizer.convert_ids_to_tokens(predicted_indexs) 65 | output = [] 66 | for pos, (predicted_index, predicted_token) in enumerate(zip(predicted_indexs, predicted_tokens)): 67 | if len(output) > 0 and predicted_index == output[-1]: 68 | continue 69 | if predicted_token == self.blank_token: 70 | continue 71 | if predicted_token == tok_pad(self.tokenizer): 72 | continue 73 | if predicted_token == tok_sep(self.tokenizer): 74 | break 75 | 76 | softmax_score = softmax(prediction_scores[0][pos], dim=0) 77 | max_item_id = torch.argmax(softmax_score, -1).item() 78 | max_item_prob = softmax_score[max_item_id].item() 79 | if max_return > 1: 80 | topK = torch.topk(softmax_score, max_return) 81 | for k, (prob, tid) in enumerate(zip(topK.values.data.tolist(), topK.indices.data.tolist())): 82 | topK_ids[k].append(tid) 83 | topK_probs[k] *= prob 84 | else: 85 | topK_ids[0].append(max_item_id) 86 | topK_probs[0] *= max_item_prob 87 | start += 1 88 | 89 | result_dict['prob_list'] = topK_probs 90 | result_dict['label_prob'] = [[self.tokenizer.decode(ids), prob] for ids, prob in 91 | zip(topK_ids, topK_probs)] 92 | result_dict['max_item'] = [i[0] for i in result_dict['label_prob']] 93 | outputs = result_dict 94 | else: 95 | targets = batch_data['target'] 96 | negative_targets = batch_data['ntarget'] 97 | input_lengths = batch_data['input_length'] 98 | target_lengths = batch_data['target_length'] 99 | 100 | target_tensors = torch.as_tensor(targets) 101 | input_length_tensors = torch.as_tensor(input_lengths) 102 | target_length_tensors = torch.as_tensor(target_lengths) 103 | 104 | loss_tensors = torch.as_tensor(targets) 105 | negativeloss_tensors = torch.as_tensor(negative_targets) 106 | ctc_lm_loss = self.loss(prediction_scores, 107 | input_length_tensors, 108 | target_tensors.view(batch_size, -1), 109 | target_length_tensors) 110 | 111 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # -1 index = padding token 112 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size), 113 | loss_tensors.view(-1)) 114 | if not torch.all(negativeloss_tensors.eq(-1)).item(): 115 | negative_loss_fct = NegativeCElLoss() 116 | negative_loss = negative_loss_fct(prediction_scores.view(-1, self.vocab_size), 117 | negativeloss_tensors.view(-1)) 118 | masked_lm_loss += negative_loss 119 | outputs = ctc_lm_loss + masked_lm_loss 120 | 121 | return outputs 122 | -------------------------------------------------------------------------------- /tfkit/task/qa/__init__.py: -------------------------------------------------------------------------------- 1 | from .preprocessor import Preprocessor 2 | from .model import Model 3 | 
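A note on the `oncectc` decoding listed above: at eval time the model scores every input position in one pass, then collapses the raw per-position argmax sequence in CTC fashion, merging runs of identical predictions, dropping blank and padding frames, and stopping at the separator token. Below is a minimal, self-contained sketch of that standard collapse rule; the token strings and the `ctc_collapse` helper are illustrative placeholders, not part of tfkit.

```python
def ctc_collapse(frames, blank="<blank>", pad="[PAD]", sep="[SEP]"):
    """Collapse raw per-position CTC predictions into an output sequence."""
    collapsed = []
    previous = None
    for token in frames:
        if token == sep:
            break  # the separator token ends decoding
        if token != previous and token not in (blank, pad):
            # keep the first frame of each run, skipping blank/pad frames
            collapsed.append(token)
        previous = token
    return collapsed


# A blank frame between two identical predictions keeps them distinct:
# ["he", "he", "<blank>", "he", "[SEP]"] -> ["he", "he"]
print(ctc_collapse(["he", "he", "<blank>", "he", "[SEP]"]))
```

Merging repeats before removing blanks is the point of the blank symbol: without it, a genuinely repeated token could never survive the de-duplication step.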
--------------------------------------------------------------------------------
/tfkit/task/qa/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | from tfkit.utility.predictor import QuestionAnsweringPredictor
5 | 
6 | dir_path = os.path.dirname(os.path.realpath(__file__))
7 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
8 | 
9 | import torch
10 | import torch.nn as nn
11 | from torch.nn.functional import softmax
12 | from tfkit.task.qa.preprocessor import Preprocessor
13 | 
14 | 
15 | class Model(nn.Module):
16 | 
17 |     def __init__(self, tokenizer, pretrained, maxlen=128, dropout=0.1, **kwargs):
18 |         super().__init__()
19 |         self.tokenizer = tokenizer
20 |         self.pretrained = pretrained
21 |         self.maxlen = maxlen
22 | 
23 |         self.dropout = nn.Dropout(dropout)
24 |         self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
25 |         # self.loss_fct = FocalLoss(ignore_index=-1)
26 |         # self.loss_fct = GWLoss()
27 | 
28 |         self.pretrained = self.pretrained
29 |         self.qa_classifier = nn.Linear(self.pretrained.config.hidden_size, 2)
30 |         self.loss_fct = self.loss_fct
31 | 
32 |         predictor = QuestionAnsweringPredictor(self, Preprocessor)
33 |         self.predictor = predictor
34 |         self.predict = predictor.predict
35 | 
36 |     def forward(self, batch_data, eval=False, **kwargs):
37 |         # batch_data holds the padded input ids, attention masks and span targets
38 |         inputs = torch.as_tensor(batch_data['input'])
39 |         masks = torch.as_tensor(batch_data['mask'])
40 |         targets = torch.as_tensor(batch_data['target'])
41 |         start_positions, end_positions = targets.split(1, dim=1)
42 |         start_positions = start_positions.squeeze(1)
43 |         end_positions = end_positions.squeeze(1)
44 | 
45 |         output = self.pretrained(inputs, attention_mask=masks)[0]
46 |         logits = self.qa_classifier(output)
47 |         start_logits, end_logits = logits.split(1, dim=-1)
48 |         start_logits = start_logits.squeeze(-1)
49 |         end_logits = end_logits.squeeze(-1)
50 | 
51 |         if eval:
52 |             result_dict = {
53 |                 'label_prob_all': [],
54 |                 'label_map': []
55 |             }
56 |             reshaped_start_logits = softmax(start_logits, dim=1)
57 |             reshaped_end_logits = softmax(end_logits, dim=1)
58 |             start_prob = reshaped_start_logits.data.tolist()[0]
59 |             end_prob = reshaped_end_logits.data.tolist()[0]
60 |             result_dict['label_prob_all'].append({'start': dict(zip(range(len(start_prob)), start_prob)),
61 |                                                   'end': dict(zip(range(len(end_prob)), end_prob))})
62 |             result_dict['label_map'].append({'start': start_prob.index(max(start_prob)),
63 |                                              'end': end_prob.index(max(end_prob))})
64 |             outputs = result_dict
65 |         else:
66 |             start_loss = self.loss_fct(start_logits, start_positions)
67 |             end_loss = self.loss_fct(end_logits, end_positions)
68 |             total_loss = (start_loss + end_loss) / 2
69 |             outputs = total_loss
70 | 
71 |         return outputs
72 | 
--------------------------------------------------------------------------------
/tfkit/task/qa/preprocessor.py:
--------------------------------------------------------------------------------
1 | import nlp2
2 | import tfkit.utility.tok as tok
3 | import torch
4 | from tfkit.utility.data_filereader import get_qa_data_from_file
5 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
6 | 
7 | 
8 | class Preprocessor(GeneralNLPPreprocessor):
9 |     def read_file_to_data(self, path):
10 |         return get_qa_data_from_file(path)
11 | 
12 |     def preprocess_component_prepare_input(self, item):
13 |         mapping_index = []
14 |         pos = 1  # cls as start 0
15 |         input_text_list = nlp2.split_sentence_to_array(item['input'])
16 |         for i in 
input_text_list: 17 | for _ in range(len(self.tokenizer.tokenize(i))): 18 | if _ < 1: 19 | mapping_index.append({'char': i, 'pos': pos}) 20 | pos += 1 21 | item['mapping_index'] = mapping_index 22 | return item 23 | 24 | def preprocess_component_convert_to_id(self, item, **param_dict): 25 | input_text, target = item['input'], item.get('target', None) 26 | tokenized_input = [tok.tok_begin(self.tokenizer)] + input_text + [tok.tok_sep(self.tokenizer)] 27 | input_id = self.tokenizer.convert_tokens_to_ids(tokenized_input) 28 | start_index = item['input_index'][0] 29 | end_index = item['input_index'][1] 30 | if target: 31 | item['target'] = [0, 0] 32 | target_start, target_end = target 33 | ori_start = target_start = int(target_start) 34 | ori_end = target_end = int(target_end) 35 | ori_ans = tokenized_input[ori_start:ori_end] 36 | target_start -= start_index 37 | target_end -= start_index 38 | # print("target_start", self.parameters['maxlen'],item['mapping_index'][target_start]['pos'],ori_end) 39 | # if item['mapping_index'][target_start]['pos'] > ori_end or target_start < 0 \ 40 | # or target_start > self.parameters['maxlen'] \ 41 | # or target_end >= self.parameters['maxlen'] - 2: 42 | # target_start = 0 43 | # target_end = 0 44 | # else: 45 | for map_pos, map_tok in enumerate(item['mapping_index'][start_index:]): 46 | if start_index < map_tok['pos'] <= end_index: 47 | length = len(self.tokenizer.tokenize(map_tok['char'])) 48 | if map_pos < ori_start: 49 | target_start += length - 1 50 | if map_pos < ori_end: 51 | target_end += length - 1 52 | item['target'] = [target_start + 1, target_end + 1] # cls +1 53 | 54 | item['input'] = input_id 55 | item['mask'] = [1] * len(input_id) 56 | item['raw_input'] = tokenized_input 57 | yield item 58 | 59 | def postprocess(self, item, tokenizer, maxlen, **kwargs): 60 | row_dict = { 61 | 'input': item['input'], 62 | 'mask': item['mask'] 63 | } 64 | if 'target' in item: 65 | row_dict['target'] = item['target'] 66 | return {key: torch.tensor(value) for key, value in row_dict.items()} 67 | -------------------------------------------------------------------------------- /tfkit/task/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | from .preprocessor import Preprocessor 2 | from .model import Model 3 | -------------------------------------------------------------------------------- /tfkit/task/seq2seq/model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn.functional import softmax 6 | from transformers import AutoModel 7 | import torch.nn.functional as F 8 | from tfkit.task.seq2seq import Preprocessor 9 | from tfkit.utility.loss import NegativeCElLoss, SelfKDLoss 10 | from tfkit.utility.model import tie_encoder_decoder_weights 11 | from tfkit.utility.predictor import AutoRegressivePredictor 12 | 13 | 14 | class Model(nn.Module): 15 | def __init__(self, tokenizer, pretrained, maxlen=512, selfkd=False, **kwargs): 16 | super().__init__() 17 | self.maxlen = maxlen 18 | self.tokenizer = tokenizer 19 | self.pretrained = pretrained 20 | self.selfkd = selfkd 21 | self.decoder_model, init_weight = self.initialize_decoder() 22 | self.vocab_size = max(self.pretrained.config.vocab_size, self.tokenizer.__len__()) 23 | self.model = nn.Linear(self.decoder_hidden_size, self.vocab_size, bias=False) 24 | if init_weight is not None: 25 | self.model.weight = init_weight 26 | self.predictor = 
AutoRegressivePredictor(self, Preprocessor) 27 | self.predict = self.predictor.predict 28 | 29 | def initialize_decoder(self): 30 | init_weight = None 31 | 32 | if hasattr(self.pretrained, 'decoder'): 33 | decoder_model = None 34 | self.decoder_hidden_size = self.pretrained.config.hidden_size 35 | if hasattr(self.pretrained, 'shared'): 36 | init_weight = copy.deepcopy(self.pretrained.shared.weight) 37 | else: 38 | decoder_config = copy.deepcopy(self.pretrained.config) 39 | decoder_config.is_decoder = True 40 | decoder_config.add_cross_attention = True 41 | decoder_model = AutoModel.from_config(decoder_config) 42 | tie_encoder_decoder_weights(self.pretrained, decoder_model, decoder_model.base_model_prefix) 43 | self.decoder_hidden_size = decoder_config.hidden_size 44 | 45 | return decoder_model, init_weight 46 | 47 | def forward(self, batch_data, eval=False, beamsearch=False, max_return=1, **kwargs): 48 | if self.decoder_model: 49 | prediction_output, prediction_all_hidden = self.decoder_forward(batch_data, eval) 50 | else: 51 | prediction_output, prediction_all_hidden = self.encoder_forward(batch_data, eval, beamsearch) 52 | 53 | prediction_scores = self.model(prediction_output) 54 | 55 | if eval: 56 | outputs = self.process_eval_output(prediction_scores, max_return) 57 | else: 58 | outputs = self.calculate_loss(batch_data, prediction_scores, prediction_all_hidden) 59 | return outputs 60 | 61 | def decoder_forward(self, batch_data, eval): 62 | input_tensors = torch.as_tensor(batch_data['input']) 63 | prev_tensors = torch.as_tensor(batch_data['prev']) 64 | encoder_mask_tensors = torch.as_tensor(batch_data['encoder_mask']) 65 | decoder_mask_tensors = torch.as_tensor(batch_data['decoder_mask']) 66 | 67 | if not eval: 68 | outputs = self.pretrained(input_tensors, attention_mask=encoder_mask_tensors) 69 | prediction = self.decoder_model( 70 | input_ids=prev_tensors, 71 | attention_mask=decoder_mask_tensors, 72 | output_hidden_states=self.selfkd, 73 | use_cache=False, 74 | return_dict=True, 75 | ) 76 | prediction_output = prediction['last_hidden_state'] 77 | prediction_all_hidden = prediction.get('hidden_states') 78 | return prediction_output, prediction_all_hidden 79 | 80 | def encoder_forward(self, batch_data, eval, beamsearch): 81 | input_tensors = torch.as_tensor(batch_data['input']) 82 | prev_tensors = torch.as_tensor(batch_data['prev']) 83 | encoder_mask_tensors = torch.as_tensor(batch_data['encoder_mask']) 84 | decoder_mask_tensors = torch.as_tensor(batch_data['decoder_mask']) 85 | 86 | prediction = self.pretrained( 87 | input_ids=input_tensors, 88 | attention_mask=encoder_mask_tensors, 89 | decoder_input_ids=prev_tensors, 90 | decoder_attention_mask=decoder_mask_tensors, 91 | output_hidden_states=self.selfkd, 92 | use_cache=False, 93 | return_dict=True 94 | ) 95 | prediction_output = prediction['last_hidden_state'] 96 | prediction_all_hidden = prediction.get('decoder_hidden_states') 97 | return prediction_output, prediction_all_hidden 98 | 99 | def process_eval_output(self, prediction_scores, max_return): 100 | result_dict = {} 101 | softmax_score = softmax(prediction_scores[0][0], dim=0) 102 | max_item_id = torch.argmax(softmax_score, -1).item() 103 | max_item_prob = softmax_score[max_item_id].item() 104 | result_dict['max_item'] = (self.tokenizer.convert_ids_to_tokens(max_item_id), max_item_prob) 105 | 106 | if max_return > 1: 107 | topK = torch.topk(softmax_score, max_return) 108 | prob_result = [(self.tokenizer.convert_ids_to_tokens(tid), prob) for prob, tid in 109 | 
zip(topK.values.data.tolist(), topK.indices.data.tolist())] 110 | result_dict['prob_list'] = softmax_score.data.tolist()[:max_return] 111 | result_dict['label_prob'] = prob_result 112 | 113 | return result_dict 114 | 115 | def calculate_loss(self, batch_data, prediction_scores, prediction_all_hidden): 116 | targets = batch_data['target'] 117 | negative_targets = batch_data['ntarget'] 118 | loss_tensors = torch.as_tensor(targets) 119 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # -1 index = padding token 120 | lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size), 121 | loss_tensors.view(-1)) 122 | 123 | if self.selfkd: 124 | selfkdloss_fct = SelfKDLoss(ignore_index=-1) 125 | for decoder_hidden in prediction_all_hidden[:-1]: 126 | student = self.model(decoder_hidden) 127 | lm_loss += selfkdloss_fct(student.view(-1, self.vocab_size), 128 | prediction_scores.view(-1, self.vocab_size), loss_tensors.view(-1)) 129 | 130 | if 'btarget' in batch_data: 131 | backtran_tensors = torch.as_tensor(batch_data['btarget']) 132 | if not torch.all(backtran_tensors.eq(-1)).item(): 133 | backtran_predation = self.pretrained( 134 | input_ids=backtran_tensors, 135 | output_hidden_states=True, 136 | return_dict=True 137 | ) 138 | backtran_hidden = backtran_predation['encoder_last_hidden_state'] 139 | backtran_loss = F.cosine_similarity(self.encoder_hidden, backtran_hidden).mean() 140 | lm_loss += backtran_loss 141 | 142 | negativeloss_tensors = torch.as_tensor(negative_targets) 143 | if not torch.all(negativeloss_tensors.eq(-1)).item(): 144 | negative_loss_fct = NegativeCElLoss(ignore_index=-1) 145 | negative_loss = negative_loss_fct(prediction_scores.view(-1, self.vocab_size), 146 | negativeloss_tensors.view(-1)) 147 | lm_loss += negative_loss 148 | 149 | return lm_loss 150 | -------------------------------------------------------------------------------- /tfkit/task/seq2seq/preprocessor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import tfkit.utility.tok as tok 4 | from tfkit.utility.data_filereader import get_gen_data_from_file 5 | from tfkit.utility.data_processor import GeneralNLPPreprocessor 6 | 7 | 8 | class Preprocessor(GeneralNLPPreprocessor): 9 | def read_file_to_data(self, path): 10 | return get_gen_data_from_file(path) 11 | 12 | def set_global_parameters(self): 13 | self.tokenize_target = True 14 | 15 | def preprocess_component_convert_to_id(self, item, likelihood=['none', 'pos', 'neg', 'both'], **param_dict): 16 | likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood 17 | tokenized_input, tokenized_target, n_target, b_target = item['input'], \ 18 | item.get('target', None), \ 19 | item.get('ntarget', None), \ 20 | item.get('btarget', None) 21 | previous = item.get("previous", []) 22 | if tokenized_target is None: 23 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input), 24 | 'previous': self.tokenizer.convert_tokens_to_ids(previous)} 25 | elif b_target and len(b_target) > 0: 26 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input), 27 | 'previous': self.tokenizer.convert_tokens_to_ids(previous), 28 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target), 29 | 'btarget': self.tokenizer.encode(b_target)} 30 | else: 31 | if "neg" in likelihood or 'both' in likelihood: 32 | # formatting neg data in csv 33 | if n_target is None: 34 | ntext_arr = [ 35 | tok.tok_sep(self.tokenizer) + self.tokenizer.convert_tokens_to_string(tokenized_target)] 36 | elif 
tok.tok_sep(self.tokenizer) in n_target: 37 | ntext_arr = [ntext.strip() for ntext in n_target.split(tok.tok_sep(self.tokenizer))] 38 | else: 39 | ntext_arr = [n_target.strip()] 40 | for neg_text in ntext_arr: 41 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input), 42 | 'previous': self.tokenizer.convert_tokens_to_ids(previous), 43 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target), 44 | 'ntarget': self.tokenizer.encode(neg_text)} 45 | else: 46 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input), 47 | 'previous': self.tokenizer.convert_tokens_to_ids(previous), 48 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target)} 49 | 50 | # whole sentence masking 51 | if 'pos' in likelihood: 52 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input), 53 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target), 54 | 'previous': self.tokenizer.convert_tokens_to_ids( 55 | [tok.tok_mask(self.tokenizer)] * len(tokenized_target))} 56 | elif 'both' in likelihood: 57 | for neg_text in ntext_arr: 58 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input), 59 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target), 60 | 'previous': self.tokenizer.convert_tokens_to_ids( 61 | [tok.tok_mask(self.tokenizer)] * len(tokenized_target)), 62 | 'ntarget': self.tokenizer.encode(neg_text)} 63 | 64 | def postprocess(self, item, tokenizer, maxlen, **kwargs): 65 | t_input_id, previous = item['input'], item['previous'] 66 | row_dict = {} 67 | if 'target' in item: 68 | target = item['target'] 69 | tokenized_target_id = [] 70 | if len(previous) == len(target): 71 | tokenized_prev_id = [self.tok_mask_id] * maxlen 72 | else: 73 | tokenized_prev_id = [self.tok_sep_id] + target 74 | tokenized_target_id.extend(target + [self.tok_sep_id]) 75 | row_dict['target'] = tokenized_target_id 76 | row_dict['target_pad'] = [-1] 77 | row_dict['prev'] = tokenized_prev_id 78 | row_dict['ntarget'] = [-1] * maxlen 79 | if 'ntarget' in item and len(item['ntarget']) > 0: 80 | tokenized_ntarget_id = item['ntarget'] 81 | if len(tokenized_ntarget_id) <= maxlen: 82 | row_dict['ntarget'] = tokenized_ntarget_id 83 | if 'btarget' in item and len(item['btarget']) > 0: 84 | row_dict['btarget'] = tokenizer.encode(item['btarget']) 85 | else: 86 | tokenized_prev_id = [self.tok_sep_id] 87 | tokenized_prev_id.extend(previous) 88 | row_dict['prev'] = tokenized_prev_id 89 | 90 | row_dict['input'] = t_input_id 91 | row_dict['encoder_mask'] = [1] * len(t_input_id) 92 | row_dict['decoder_mask'] = [1] * len(tokenized_prev_id) 93 | return {key: torch.tensor(value) for key, value in row_dict.items()} 94 | -------------------------------------------------------------------------------- /tfkit/task/tag/__init__.py: -------------------------------------------------------------------------------- 1 | from .preprocessor import Preprocessor 2 | from .model import Model 3 | -------------------------------------------------------------------------------- /tfkit/task/tag/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from collections import Counter 4 | 5 | import torch 6 | from torch import nn 7 | from torch.nn.functional import softmax 8 | 9 | from tfkit.task.tag import Preprocessor 10 | from tfkit.utility.loss import FocalLoss 11 | from tfkit.utility.predictor import TaggingPredictor 12 | 13 | dir_path = os.path.dirname(os.path.realpath(__file__)) 14 | 
sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir))) 15 | 16 | 17 | class Model(nn.Module): 18 | def __init__(self, tokenizer, pretrained, tasks_detail, maxlen=512, dropout=0.2, **kwargs): 19 | super().__init__() 20 | self.initialize_components(tokenizer, pretrained, tasks_detail, maxlen, dropout) 21 | 22 | def initialize_components(self, tokenizer, pretrained, tasks_detail, maxlen, dropout): 23 | labels = list(tasks_detail.values())[0] 24 | self.tokenizer = tokenizer 25 | self.pretrained = pretrained 26 | self.dropout = nn.Dropout(dropout) 27 | self.tagger = nn.Linear(self.pretrained.config.hidden_size, len(labels)) 28 | self.labels = labels 29 | self.maxlen = maxlen 30 | self.loss_fct = FocalLoss() 31 | 32 | self.pretrained = self.pretrained 33 | self.loss_fct = self.loss_fct 34 | 35 | predictor = TaggingPredictor(self, Preprocessor) 36 | self.predictor = predictor 37 | self.predict = predictor.predict 38 | 39 | def forward(self, batch_data, eval=False, separator=" ", **kwargs): 40 | inputs = batch_data["input"] 41 | masks = batch_data["mask"] 42 | 43 | bert_output = self.compute_bert_output(inputs, masks) 44 | 45 | if eval: 46 | outputs = self.compute_eval_output(batch_data, bert_output) 47 | else: 48 | outputs = self.compute_loss_output(batch_data, bert_output) 49 | 50 | return outputs 51 | 52 | def compute_bert_output(self, inputs, masks): 53 | token_tensor = torch.as_tensor(inputs, dtype=torch.long) 54 | mask_tensors = torch.as_tensor(masks) 55 | bert_output = self.pretrained(token_tensor, attention_mask=mask_tensors) 56 | res = bert_output[0] 57 | pooled_output = self.dropout(res) 58 | reshaped_logits = self.tagger(pooled_output) 59 | 60 | return reshaped_logits 61 | 62 | def compute_eval_output(self, batch_data, reshaped_logits): 63 | result_dict = { 64 | 'label_prob_all': [], 65 | 'label_map': [] 66 | } 67 | 68 | ilogit = softmax(reshaped_logits[0], dim=1) 69 | result_labels = ilogit.data.tolist() 70 | start, end = batch_data['pos'][0] 71 | token_word_mapping = batch_data['token_word_mapping'] 72 | 73 | for pos, logit_prob in enumerate(result_labels[1:]): # skip cls and sep 74 | if start + pos >= len(token_word_mapping): 75 | break 76 | 77 | word, pos = self.compute_word_pos(token_word_mapping, start, pos) 78 | self.update_result_dict(result_dict, logit_prob, word, pos) 79 | 80 | result_dict['token_word_mapping'] = token_word_mapping[start:end] 81 | 82 | return result_dict 83 | 84 | @staticmethod 85 | def compute_word_pos(token_word_mapping, start, pos): 86 | word = token_word_mapping[start + pos]['word'] 87 | pos = token_word_mapping[start + pos]['pos'] 88 | 89 | return word, pos 90 | 91 | def update_result_dict(self, result_dict, logit_prob, word, pos): 92 | if len(result_dict['label_map']) > pos: 93 | self.update_existing_result(result_dict, logit_prob, word, pos) 94 | else: 95 | self.append_new_result(result_dict, logit_prob, word) 96 | 97 | def update_existing_result(self, result_dict, logit_prob, word, pos): 98 | O = Counter(result_dict['label_prob_all'][-1][word]) 99 | N = Counter(dict(zip(self.labels, logit_prob))) 100 | mean_prob = {k: v / 2 for k, v in (O + N).items()} 101 | result_dict['label_prob_all'][-1] = {word: mean_prob} 102 | result_dict['label_map'][-1] = { 103 | word: max(mean_prob, key=mean_prob.get)} 104 | 105 | def append_new_result(self, result_dict, logit_prob, word): 106 | max_index = logit_prob.index(max(logit_prob)) 107 | result_dict['label_map'].append({word: self.labels[max_index]}) 108 | result_dict['label_prob_all'].append({word: 
dict(zip(self.labels, logit_prob))}) 109 | 110 | def compute_loss_output(self, batch_data, reshaped_logits): 111 | targets = batch_data["target"] 112 | target_tensor = torch.as_tensor(targets, dtype=torch.long) 113 | loss = self.loss_fct(reshaped_logits.view(-1, len(self.labels)), target_tensor.view(-1)) 114 | 115 | return loss 116 | -------------------------------------------------------------------------------- /tfkit/task/tag/preprocessor.py: -------------------------------------------------------------------------------- 1 | import tfkit.utility.tok as tok 2 | from tfkit.utility.data_filereader import get_tag_data_from_file 3 | from tfkit.utility.data_processor import GeneralNLPPreprocessor 4 | 5 | get_data_from_file = get_tag_data_from_file 6 | 7 | 8 | class Preprocessor(GeneralNLPPreprocessor): 9 | 10 | def read_file_to_data(self, path): 11 | return get_tag_data_from_file(path) 12 | 13 | def preprocess(self, item, **param_dict): 14 | input_text, target = item['input'], item.get('target', None) 15 | separator = param_dict.get('separator', ' ') 16 | word_token_mapping = [] 17 | token_word_mapping = [] 18 | pos = 0 19 | 20 | for word_i, word in enumerate(input_text.split(separator)): 21 | tokenize_word = self.tokenizer.tokenize(word) 22 | for _ in range(len(tokenize_word)): 23 | if _ < 1: # only record first token (one word one record) 24 | word_token_mapping.append({'char': word, 'pos': pos, 'len': len(tokenize_word)}) 25 | token_word_mapping.append({'tok': tokenize_word[_], 'word': word, 'pos': len(word_token_mapping) - 1}) 26 | pos += 1 27 | 28 | t_input_list, t_pos_list = tok.handle_exceed(self.tokenizer, input_text, self.parameters['maxlen'] - 2, 29 | mode=self.parameters.get('handle_exceed'), 30 | keep_after_sep=False) 31 | preprocessed_data = [] 32 | for t_input, t_pos in zip(t_input_list, t_pos_list): # -1 for cls 33 | # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. 
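            # Worked example (assuming a WordPiece-style tokenizer): for the demo
            # row "welcome to New York,O O B_place B_place", a word that splits
            # into several sub-tokens has its single label repeated once per
            # sub-token below, keeping input ids and target ids aligned one-to-one.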
34 |             row_dict = dict()
35 |             tokenized_input = [tok.tok_begin(self.tokenizer)] + t_input
36 |             input_id = self.tokenizer.convert_tokens_to_ids(tokenized_input)
37 | 
38 |             if target is not None:
39 |                 target_token = []
40 |                 for input_word, target_label in zip(word_token_mapping, target.split(separator)):
41 |                     if t_pos[0] <= input_word['pos'] < t_pos[1]:
42 |                         for _ in range(input_word['len']):
43 |                             target_token += [target_label]
44 | 
45 |                 target_id = [target_token[0]] + target_token
46 | 
47 |                 if len(input_id) != len(target_id):
48 |                     print(list(zip(input_text.split(separator), target.split(separator))))
49 |                     print(self.tokenizer.decode(input_id))
50 |                     print(input_id)
51 |                     print(target_id)
52 |                     print("input target len not equal ", len(input_id), len(target_id))
53 |                     continue
54 |                 row_dict['target'] = target_id
55 | 
56 |             row_dict['input'] = input_id
57 |             row_dict['word_token_mapping'] = word_token_mapping
58 |             row_dict['token_word_mapping'] = token_word_mapping
59 |             row_dict['end'] = len(input_id)
60 |             row_dict['pos'] = t_pos
61 |             preprocessed_data.append(row_dict)
62 |         return preprocessed_data
63 | 
64 |     def postprocess(self, item, tokenizer, maxlen, **kwargs):
65 |         labels = item['task_dict']
66 |         # pad the attention mask with zeros and the input ids up to maxlen
67 |         mask_id = [1] * len(item['input'])
68 |         mask_id.extend([0] * (maxlen - len(mask_id)))
69 |         item['input'].extend([0] * (self.parameters['maxlen'] - len(item['input'])))
70 |         row_dict = {
71 |             'input': item['input'],
72 |             'mask': mask_id,
73 |             'pos': item['pos'],
74 |         }
75 |         # 'token_word_mapping': item['token_word_mapping']
76 |         if 'target' in item:
77 |             # prepend a label for the leading [CLS] token so targets align with input ids
78 |             target_id = [labels['tag'].index(i) for i in item['target']]
79 |             if "O" in labels['tag']:
80 |                 target_id = [labels['tag'].index("O")] + target_id
81 |             else:
82 |                 target_id = [target_id[0]] + target_id
83 |             target_id.extend([0] * (self.parameters['maxlen'] - len(target_id)))
84 |             row_dict['target'] = target_id
85 | 
86 |         return row_dict
87 | 
--------------------------------------------------------------------------------
/tfkit/test/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../"))
4 | 
5 | DATASET_DIR = os.path.join(ROOT_DIR, 'demo_data')
6 | TAG_DATASET = os.path.join(DATASET_DIR, 'tag.csv')
7 | CLAS_DATASET = os.path.join(DATASET_DIR, 'classification.csv')
8 | GEN_DATASET = os.path.join(DATASET_DIR, 'generation.csv')
9 | MASK_DATASET = os.path.join(DATASET_DIR, 'mask.csv')
10 | MCQ_DATASET = os.path.join(DATASET_DIR, 'mcq.csv')
11 | QA_DATASET = os.path.join(DATASET_DIR, 'qa.csv')
12 | ADDTOK_DATASET = os.path.join(DATASET_DIR, 'unk_tok.csv')
13 | NEWTOKEN_FILE = os.path.join(DATASET_DIR, 'tok_list.txt')
14 | 
15 | MODEL_SAVE_DIR = os.path.join(ROOT_DIR, 'tfkit/test/cache/')
16 | ADDTOKFREQ_SAVE_DIR = os.path.join(MODEL_SAVE_DIR, 'addtokfreq/')
17 | ADDTOKFILE_SAVE_DIR = os.path.join(MODEL_SAVE_DIR, 'addtokfile/')
18 | CLAS_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'clas/')
19 | TAG_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'tag/')
20 | TAGCRF_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'tagcrf/')
21 | ONEBYONE_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'onebyone/')
22 | CLM_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'clm/')
23 | SEQ2SEQ_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'seq2seq/')
24 | ONCE_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'once/')
25 | ONCECTC_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'oncectc/')
26 | MASK_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 
-------------------------------------------------------------------------------- /tfkit/test/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../")) 4 | 5 | DATASET_DIR = os.path.join(ROOT_DIR, 'demo_data') 6 | TAG_DATASET = os.path.join(DATASET_DIR, 'tag.csv') 7 | CLAS_DATASET = os.path.join(DATASET_DIR, 'classification.csv') 8 | GEN_DATASET = os.path.join(DATASET_DIR, 'generation.csv') 9 | MASK_DATASET = os.path.join(DATASET_DIR, 'mask.csv') 10 | MCQ_DATASET = os.path.join(DATASET_DIR, 'mcq.csv') 11 | QA_DATASET = os.path.join(DATASET_DIR, 'qa.csv') 12 | ADDTOK_DATASET = os.path.join(DATASET_DIR, 'unk_tok.csv') 13 | NEWTOKEN_FILE = os.path.join(DATASET_DIR, 'tok_list.txt') 14 | 15 | MODEL_SAVE_DIR = os.path.join(ROOT_DIR, 'tfkit/test/cache/') 16 | ADDTOKFREQ_SAVE_DIR = os.path.join(MODEL_SAVE_DIR, 'addtokfreq/') 17 | ADDTOKFILE_SAVE_DIR = os.path.join(MODEL_SAVE_DIR, 'addtokfile/') 18 | CLAS_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'clas/') 19 | TAG_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'tag/') 20 | TAGCRF_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'tagcrf/') 21 | ONEBYONE_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'onebyone/') 22 | CLM_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'clm/') 23 | SEQ2SEQ_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'seq2seq/') 24 | ONCE_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'once/') 25 | ONCECTC_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'oncectc/') 26 | MASK_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'mask/') 27 | MCQ_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'mcq/') 28 | QA_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'qa/') 29 | MTTASK_MODEL_DIR = os.path.join(MODEL_SAVE_DIR, 'mttask/') 30 | 31 | ONEBYONE_MODEL_PATH = os.path.join(ONEBYONE_MODEL_DIR, '2.pt') 32 | ONCE_MODEL_PATH = os.path.join(ONCE_MODEL_DIR, '2.pt') 33 | ONCECTC_MODEL_PATH = os.path.join(ONCECTC_MODEL_DIR, '1.pt') 34 | SEQ2SEQ_MODEL_PATH = os.path.join(SEQ2SEQ_MODEL_DIR, '2.pt') 35 | CLM_MODEL_PATH = os.path.join(CLM_MODEL_DIR, '2.pt') 36 | CLAS_MODEL_PATH = os.path.join(CLAS_MODEL_DIR, '2.pt') 37 | MASK_MODEL_PATH = os.path.join(MASK_MODEL_DIR, '2.pt') 38 | MCQ_MODEL_PATH = os.path.join(MCQ_MODEL_DIR, '2.pt') 39 | TAG_MODEL_PATH = os.path.join(TAG_MODEL_DIR, '2.pt') 40 | QA_MODEL_PATH = os.path.join(QA_MODEL_DIR, '2.pt') 41 | ADDTOKFREQ_MODEL_PATH = os.path.join(ADDTOKFREQ_SAVE_DIR, '2.pt') 42 | ADDTOKFILE_MODEL_PATH = os.path.join(ADDTOKFILE_SAVE_DIR, '2.pt') 43 | -------------------------------------------------------------------------------- /tfkit/test/task/test_task_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tfkit 4 | from tfkit.utility.data_loader import pad_batch 5 | from torch import Tensor 6 | from transformers import BertTokenizer, AutoModel, AutoTokenizer 7 | 8 | 9 | class TestModel(unittest.TestCase): 10 | 11 | 12 | def testGenerationModel(self): 13 | input = "See you next time" 14 | maxlen = 32 15 | tokenizer = AutoTokenizer.from_pretrained('sshleifer/bart-tiny-random') 16 | pretrained = AutoModel.from_pretrained('sshleifer/bart-tiny-random') 17 | # tfkit.task.seq2seq, tfkit.task.once, tfkit.task.oncectc, tfkit.task.clm 18 | for gmodel in [tfkit.task.seq2seq, tfkit.task.once, tfkit.task.oncectc, tfkit.task.clm]: 19 | print(str(gmodel)) 20 | model = gmodel.Model(tokenizer, pretrained, maxlen=maxlen) 21 | preprocessor = gmodel.Preprocessor(tokenizer, maxlen=maxlen, handle_exceed='start_slice', reserved_len=0) 22 | for preprocessed_item in preprocessor.preprocess( 23 | {'task': 'taskA', 'input': input}): 24 | print("preprocessed_item", preprocessed_item) 25 | feature = preprocessor.postprocess(preprocessed_item, tokenizer, maxlen=maxlen) 26 | feature = preprocessor.postprocess_batch(feature) 27 | print(model(feature, eval=True)) 28 | self.assertTrue(isinstance(model(feature, eval=True), dict)) 29 | model_dict = model(feature, eval=True) 30 | self.assertTrue('max_item' in model_dict) 31 | 32 | # greedy 33 | print("greedy") 34 | result, detail = model.predict(input=input) 35 | print(result, model_dict) 36 | self.assertTrue(len(result) == 1) 37 | self.assertTrue(isinstance(result, list)) 38 | self.assertTrue(isinstance(detail, dict)) 39 | 40 | # TopK 41 | result, detail = model.predict(input=input, decodenum=3, mode='topK', topK=3, filtersim=False) 42 | print("topK", result) 43 | self.assertTrue(len(result) == 3) 44 | self.assertTrue(isinstance(result, list)) 45 | self.assertTrue(isinstance(detail, dict)) 46 | 47 | # beamsearch 48 | result, detail = model.predict(input=input, decodenum=3) 49 | print("beamsearch", len(result), result, model_dict) 50 | self.assertTrue(len(result) == 3) 51 | self.assertTrue(isinstance(result, list)) 52 | self.assertTrue(isinstance(detail, dict)) 53 | 54 | # TopP 55 | result, detail = model.predict(input=input, decodenum=3, mode='topP', topP=0.8) 56 | print("TopP", len(result), result, model_dict) 57 | self.assertTrue(len(result) == 3) 58 | self.assertTrue(isinstance(result, list)) 59 | self.assertTrue(isinstance(detail, dict))
60 | 61 | # test exceed 512 62 | result, detail = model.predict(input="T " * 540) 63 | print("test exceed 512", len(result), result, detail) 64 | self.assertTrue(isinstance(result, list)) 65 | self.assertTrue(isinstance(detail, dict)) 66 | print("exceed max len", result) 67 | 68 | result, detail = model.predict(input="T " * 550, reserved_len=10) 69 | print(result) 70 | self.assertTrue(isinstance(result, list)) 71 | self.assertTrue(isinstance(detail, dict)) 72 | print("exceed max len with reserved len:", result) 73 | 74 | # def testClas(self): 75 | # input = "One hundred thirty-four patients suspected of having pancreas cancer successfully underwent gray scale ultrasound examination of the pancreas ." 76 | # target = "a" 77 | # tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny') 78 | # pretrained = AutoModel.from_pretrained('voidful/albert_chinese_tiny') 79 | # maxlen = 512 80 | # model = tfkit.task.clas.Model(tokenizer, pretrained, tasks_detail={"taskA": ["a", "b"]}) 81 | # preprocessor = tfkit.task.clas.Preprocessor(tokenizer, maxlen=maxlen, handle_exceed='start_slice', 82 | # reserved_len=0) 83 | # for preprocessed_item in preprocessor.preprocess( 84 | # {'task': 'taskA', 'input': input, 'target': target, 'task_dict': {"taskA": ["a", "b"]}}): 85 | # feature = preprocessor.postprocess(preprocessed_item, tokenizer, maxlen=maxlen) 86 | # for k, v in feature.items(): 87 | # feature[k] = [v, v] 88 | # print(feature) 89 | # # test train 90 | # print(model(feature)) 91 | # self.assertTrue(isinstance(model(feature), Tensor)) 92 | # # test eval 93 | # print(model(feature, eval=True)) 94 | # model_dict = model(feature, eval=True) 95 | # print(model_dict) 96 | # 97 | # # test predict 98 | # tok_label = model.predict(task="taskA", input=input) 99 | # self.assertTrue(len(tok_label) == 2) 100 | # # test predict with top k 2 101 | # top_k_label, top_k_dict = model.predict(task="taskA", input=input, topK=2) 102 | # print("test predict with top k 2, ", top_k_label, top_k_dict) 103 | # self.assertTrue(len(top_k_label) == 2) 104 | # 105 | # # test exceed 512 106 | # for merge_strategy in ['entropy', 'count', 'prob']: 107 | # result, model_dict = model.predict(task="taskA", input=" ".join([str(i) for i in range(2000)]), 108 | # merge_strategy=merge_strategy, topK=2) 109 | # print(result, len(model_dict), model_dict) 110 | # self.assertTrue(isinstance(result, list)) 111 | # self.assertTrue(len(result) == 2) 112 | 113 | # def testQA(self): 114 | # input = "梵 語 在 社 交 中 口 頭 使 用 , 並 且 在 早 期 古 典 梵 語 文 獻 的 發 展 中 維 持 口 頭 傳 統 。 在 印 度 , 書 寫 形 式 是 當 梵 語 發 展 成 俗 語 之 後 才 出 現 的 ; 在 書 寫 梵 語 的 時 候 , 書 寫 系 統 的 選 擇 受 抄 寫 者 所 處 地 域 的 影 響 。 同 樣 的 , 所 有 南 亞 的 主 要 書 寫 系 統 事 實 上 都 用 於 梵 語 文 稿 的 抄 寫 。 自 1 9 世 紀 晚 期 , 天 城 文 被 定 為 梵 語 的 標 準 書 寫 系 統 , 十 分 可 能 的 原 因 是 歐 洲 人 有 用 這 種 文 字 印 刷 梵 語 文 本 的 習 慣 。 最 早 的 已 知 梵 語 碑 刻 可 確 定 為 公 元 前 一 世 紀 。 它 們 採 用 了 最 初 用 於 俗 語 而 非 梵 語 的 婆 羅 米 文 。 第 一 個 書 寫 梵 語 的 證 據 , 出 現 在 晚 於 它 的 俗 語 的 書 寫 證 據 之 後 的 幾 個 世 紀 , 這 被 描 述 為 一 種 悖 論 。 在 梵 語 被 書 寫 下 來 的 時 候 , 它 首 先 用 於 行 政 、 文 學 或 科 學 類 的 文 本 。 宗 教 文 本 口 頭 傳 承 , 在 相 當 晚 的 時 候 才 「 不 情 願 」 地 被 書 寫 下 來 。 [Question] 最 初 梵 語 以 什 麼 書 寫 系 統 被 記 錄 下 來 ?"
115 | # target = [201, 205] 116 | # tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny') 117 | # pretrained = AutoModel.from_pretrained('voidful/albert_chinese_tiny') 118 | # model = tfkit.task.qa.Model(tokenizer, pretrained, maxlen=512) 119 | # 120 | # proc = tfkit.task.qa.Preprocessor(tokenizer, maxlen=512, handle_exceed='start_slice', 121 | # reserved_len=0) 122 | # for items in proc.preprocess({"input": input}): 123 | # raw_input = items['raw_input'] 124 | # feature = proc.postprocess(items, tokenizer, 512) 125 | # for k, v in feature.items(): 126 | # feature[k] = [v] 127 | # 128 | # # test train 129 | # print(model(feature)) 130 | # self.assertTrue(isinstance(model(feature), Tensor)) 131 | # # test eval 132 | # print(model(feature, eval=True)) 133 | # model_dict = model(feature, eval=True) 134 | # self.assertTrue('label_prob_all' in model_dict) 135 | # self.assertTrue('label_map' in model_dict) 136 | # 137 | # # test predict 138 | # result, model_dict = model.predict(input=input) 139 | # print("model_dict", model_dict, input, result) 140 | # self.assertTrue('label_prob_all' in model_dict[0]) 141 | # self.assertTrue('label_map' in model_dict[0]) 142 | # self.assertTrue(len(result) == 1) 143 | # 144 | # # # test eval top k = 2 145 | # # top_k_label, top_k_dict = task.predict(input=input, topK=2) 146 | # # print("top_k_label", top_k_label) 147 | # # self.assertTrue(len(top_k_label) == 2) 148 | # 149 | # # test exceed 512 150 | # for merge_strategy in ['entropy', 'count', 'prob']: 151 | # result, model_dict = model.predict(input=" ".join([str(i) for i in range(550)]), 152 | # handle_exceed='start_slice', 153 | # merge_strategy=merge_strategy) 154 | # print(result, len(model_dict)) 155 | # self.assertTrue(isinstance(result, list)) 156 | # 157 | # def testTag(self): 158 | # tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_small') 159 | # pretrained = AutoModel.from_pretrained('voidful/albert_chinese_small') 160 | # 161 | # input = "在 歐 洲 , 梵 語 的 學 術 研 究 , 由 德 國 學 者 陸 特 和 漢 斯 雷 頓 開 創 。 後 來 威 廉 · 瓊 斯 發 現 印 歐 語 系 , 也 要 歸 功 於 對 梵 語 的 研 究 。 此 外 , 梵 語 研 究 , 也 對 西 方 文 字 學 及 歷 史 語 言 學 的 發 展 , 貢 獻 不 少 。 1 7 8 6 年 2 月 2 日 , 亞 洲 協 會 在 加 爾 各 答 舉 行 。 [SEP] 陸 特 和 漢 斯 雷 頓 開 創 了 哪 一 地 區 對 梵 語 的 學 術 研 究 FOB ?" 
162 | # target = "O A A O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O" 163 | # label = ["O", "A"] 164 | # 165 | # model = tfkit.task.tag.Model(tokenizer=tokenizer, pretrained=pretrained, tasks_detail={"default": label}) 166 | # 167 | # # test exceed 512 168 | # for merge_strategy in ['count']: 169 | # result, model_dict = model.predict( 170 | # input=""" 171 | # Rundfadsfdsfsfning 明朝(1368年1月23日-1644年4月25日[註 1])是中國歷史上最後一個由漢族建立的大一統王朝,歷經十二世、十六位皇帝,國祚二百七十六年[參 4]。\n\n元朝末年政治腐敗,種族紛爭,天災不斷,民不聊生,民變暴動屢禁不止,平民朱元璋加入紅巾軍並在其中乘勢崛起,跟隨佔據濠州的郭子興。郭子興死後,朱元璋被當時反抗軍擁立的小明王韓林兒封為左副元帥,並率部眾先後攻占滁州、和州等地,並最終攻佔集慶(今江蘇南京),採取朱升所建議的「高築牆,廣積糧,緩稱王」的政策,以鞏固根據地,讓士兵屯田積糧減少百姓負擔,以示自己為仁義之師而避免受敵。1364年,朱元璋稱吳王,建立西吳政權。1368年,在掃滅陳友諒、張士誠和方國珍等群雄勢力後,朱元璋於當年農曆正月初四日登基稱帝,立國號為大明[參 5],定都應天府(今南京市),其轄區稱為京師,由因皇室姓朱,故又稱朱明,之後以「驅逐胡虜,恢復中華」[參 6]為號召北伐中原[參 7][參 8],並收回了燕雲十六州[參 9],結束蒙元在中國漢地的統治,統一天下。\n\n明初天下大定,經過朱元璋的休養生息,社會經濟得以恢復和發展,國力迅速恢復,史稱洪武之治。朱元璋去世後,其孫朱允炆即位,但其在靖難之役中敗於駐守燕京的朱元璋第四子朱棣,也自此失蹤。朱棣登基後遷都至順天府(今北京市),將北平布政司升為京師,原京師改稱南京[參 3]。成祖朱棣時期,開疆拓土,又派遣鄭和七下西洋,此後許多漢人遠赴海外,國勢達到頂峰,史稱永樂盛世。其後的仁宗和宣宗時期國家仍處於興盛時期,史稱仁宣之治[參 10]。英宗和代宗時期,遭遇土木之變,國力中衰,經于謙等人抗敵,最終解除國家危機。憲宗和孝宗相繼與民休息,孝宗則力行節儉,減免稅賦,百姓安居樂業,史稱弘治中興[參 11]。武宗時期爆發了南巡之爭和寧王之亂。世宗即位初,引發大禮議之爭,他清除宦官和權臣勢力後總攬朝綱,實現嘉靖中興,並於屯門海戰與西草灣之戰中擊退葡萄牙殖民侵略,任用胡宗憲和俞大猷等將領平定東南沿海的倭患。世宗駕崩後經過隆慶新政國力得到恢復,神宗前期任用張居正,推行萬曆新政,國家收入大增,商品經濟空前繁榮、科學巨匠迭出、社會風尚呈現出活潑開放的新鮮氣息,史稱萬曆中興[參 12]。後經過萬曆三大征平定內憂外患,粉碎豐臣秀吉攻占朝鮮進而入明的計劃,然而因為國本之爭,皇帝逐漸疏於朝政,史稱萬曆怠政,同時東林黨爭也帶來了明中期的政治混亂。\n\n萬曆一朝成為明朝由盛轉衰的轉折期[參 13]。光宗繼位不久因紅丸案暴斃,熹宗繼承大統改元天啟,天啟年間魏忠賢閹黨禍亂朝綱,至明思宗即位後剷除閹黨,但閹黨倒臺後,黨爭又起,政治腐敗以及連年天災[註 2][註 3],導致國力衰退,最終爆發大規模民變。1644年4月25日(舊曆三月十九),李自成所建立的大順軍攻破北京,思宗自縊於煤山,是為甲申之變。隨後吳三桂倒戈相向,滿族建立的滿清入主中原。明朝宗室於江南地區相繼成立南明諸政權,而原本反明的流寇在李自成等領袖死後亦加入南明陣營,這些政權被清朝統治者先後以「為君父報仇」為名各個殲滅,1662年,明朝宗室最後政權被剷除,永曆帝被俘後被殺,滿清又陸續擊敗各地反抗軍,以及攻取台灣、澎湖,1683年,奉大明為正朔的明鄭向清朝投降,漢族抗爭勢力方為清朝所消滅。[參 16]。\n\n明代的核心領土囊括漢地[註 4],東北到外興安嶺及黑龍江流域[參 19],後縮為遼河流域;初年北達戈壁沙漠一帶,後改為今長城;西北至新疆哈密,後改為嘉峪關;西南臨孟加拉灣[註 5],後折回約今雲南境;曾經在今中國東北、新疆東部及西藏等地設有羈縻機構[參 21]。不過,明朝是否實際統治了西藏國際上尚存在有一定的爭議[註 6]。明成祖時期曾短暫征服及統治安南[參 22],永樂二十二年(1424年),明朝國土面積達到極盛,在東南亞設置舊港宣慰司[註 7]等行政機構,加強對東南洋一帶的管理[參 23][參 24]。\n\n明代商品經濟繁榮,出現商業集鎮,而手工業及文化藝術呈現世俗化趨勢[參 25]。根據《明實錄》所載的人口峰值於成化十五年(1479年)達七千餘萬人[參 26],不過許多學者考慮到當時存在大量隱匿戶口,故認為明朝人口峰值實際上逾億[參 27],還有學者認為晚明人口峰值接近2億[註 8]。這一時期,其GDP總量所占的世界比例在中國古代史上也是最高的,1600年明朝GDP總量為960億美元,占世界經濟總量的29.2%,晚明中國人均GDP在600美元[註 9]。\n\n明朝政治則是權力趨於集中,明太祖在誅殺胡惟庸後廢除傳統的丞相制,六部直接對皇帝負責,後來設置內閣;地方上由承宣布政使司、提刑按察使司、都指揮使司分掌權力,加強地方管理。仁宗、宣宗之後,文官治國的思想逐漸濃厚,行政權向內閣和六部轉移。同時還設有都察院等監察機構,為加強對全國臣民的監視,明太祖設立特務機構錦衣衛,明成祖設立東廠,明憲宗時再設西廠(後取消),明武宗又設內行廠(後取消),合稱「廠衛」。但明朝皇帝並非完全獨斷獨行,有許多事還必須經過經廷推、廷議、廷鞫程序,同時,能將原旨退還的給事中亦可對皇權形成制衡。[參 33]到了後期皇帝出現了怠政,宦官行使大權的陋習[參 3],儘管決策權始終集中在皇帝手中,然而政務大部分已經由內閣處理,此外,到了明代中晚期文官集團的集體意見足以與皇帝抗衡,在遇到事情決斷兩相僵持不下時,也容易產生一種類似於「憲政危機(英語:Constitutional crisis)」的情況,因此「名義上他是天子,實際上他受制於廷臣。」[參 34]但明朝皇權受制於廷臣主要是基於道德上而非法理上,因為明朝當時風氣普遍注重名節,受儒家教育的皇帝通常不願被冠以「昏君」之名。但雖然皇權受制衡,皇帝仍可任意動用皇權,例如明世宗「大禮議」事件最後以廷杖朝臣多人的方式結束[參 35],明神宗在國本之爭失利後也以長期拒絕參與政事向朝臣們示威[1][2][3]。\n\n有學者認為明代是繼漢唐之後的黃金時期,也被稱為最後一個可以和漢唐媲美的盛世[參 36]。清代張廷玉等修的官修《明史》評價明朝為「治隆唐宋」[註 10]、「遠邁漢唐」[參 37]。 172 | # """, 173 | # merge_strategy=merge_strategy, start_contain="B_", 174 | # end_contain="I_") 175 | # print(result) 176 | # self.assertTrue(isinstance(result, list)) 177 | 178 | # proc = tfkit.task.tag.Preprocessor(tokenizer, maxlen=512, handle_exceed='start_slice', 179 | # reserved_len=0) 180 | # for items in proc.prepare_data({"input": input}): 181 | # raw_input = items['raw_input'] 182 | # feature = 
proc.postprocess(items, tokenizer, 512) 183 | # for k, v in feature.items(): 184 | # feature[k] = [v] 185 | # self.assertTrue(isinstance(model(feature), Tensor)) 186 | # print(model(feature)) 187 | # # test eval 188 | # model_dict = model(feature, eval=True) 189 | # self.assertTrue('label_prob_all' in model_dict) 190 | # self.assertTrue('label_map' in model_dict) 191 | # self.assertEqual(len(model_dict['label_map']), len(input.split(" "))) 192 | # 193 | # # test predict 194 | # result, model_dict = model.predict(input=input, start_contain="A", end_contain="A") 195 | # self.assertTrue('label_prob_all' in model_dict[0]) 196 | # self.assertTrue('label_map' in model_dict[0]) 197 | # print("result", result, len(result)) 198 | # self.assertTrue(isinstance(result, list)) 199 | # 200 | # # test exceed 512 201 | # for merge_strategy in ['minentropy', 'maxcount', 'maxprob']: 202 | # result, model_dict = model.predict(input=" ".join([str(i) for i in range(1000)]), 203 | # merge_strategy=merge_strategy, start_contain="A", end_contain="A") 204 | # print(result) 205 | # self.assertTrue(isinstance(result, list)) 206 | -------------------------------------------------------------------------------- /tfkit/test/test_atrain.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | from transformers import BertTokenizer, AutoModel 5 | 6 | import tfkit 7 | from tfkit.test import * 8 | from tfkit.utility.model import load_model_class 9 | 10 | 11 | class TestTrain(unittest.TestCase): 12 | 13 | def testHelp(self): 14 | result = os.system('tfkit-train -h') 15 | assert (result == 0) 16 | 17 | def test_parser(self): 18 | input_arg, model_arg = tfkit.train.parse_train_args( 19 | ['--task', 'once', '--train', 'train.csv', '--test', 'test.csv', '--config', 20 | 'voidful/albert_chinese_tiny']) 21 | print(input_arg, model_arg) 22 | self.assertTrue(input_arg.get('task') == ['once']) 23 | self.assertTrue(isinstance(input_arg.get('train'), list)) 24 | 25 | input_arg, model_arg = tfkit.train.parse_train_args( 26 | ['--task', 'once', '--train', 'train.csv', '--test', 'test.csv', '--config', 27 | 'voidful/albert_chinese_tiny', '--likelihood', 'pos']) 28 | print(input_arg, model_arg) 29 | self.assertTrue(model_arg.get('likelihood') == 'pos') 30 | self.assertTrue(isinstance(input_arg.get('train'), list)) 31 | 32 | def test_optimizer(self): 33 | model_class = load_model_class('clas') 34 | tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny') 35 | pretrained = AutoModel.from_pretrained('voidful/albert_chinese_tiny') 36 | model = model_class.Model(tokenizer=tokenizer, pretrained=pretrained, tasks_detail={"taskA": ["a", "b"]}, 37 | maxlen=128) 38 | optim, scheduler = tfkit.train.optimizer(model, lr=0.1, total_step=10) 39 | print(optim, scheduler) 40 | optim.zero_grad() 41 | scheduler.step() 42 | 43 | def testMultiTask(self): 44 | tfkit.train.main( 45 | ['--batch', '2', '--epoch', '1', '--savedir', MTTASK_MODEL_DIR, '--train', CLAS_DATASET, GEN_DATASET, 46 | '--lr', '5e-5', '--test', CLAS_DATASET, GEN_DATASET, '--task', 'once', 'clm', '--config', 47 | 'voidful/albert_chinese_tiny', '--maxlen', '50']) 48 | result = os.system( 49 | 'tfkit-train --batch 2 --epoch 2 --savedir ' + MTTASK_MODEL_DIR + ' --train ' + CLAS_DATASET + ' ' + GEN_DATASET + ' --lr 5e-5 --test ' + CLAS_DATASET + ' ' + GEN_DATASET + ' --task once clm --config voidful/albert_chinese_tiny --maxlen 50') 50 | self.assertTrue(result == 0) 51 | 52 | def testGenOnce(self): 53 | 
tfkit.train.main( 54 | ['--batch', '2', '--epoch', '1', '--savedir', ONCE_MODEL_DIR, '--train', 55 | GEN_DATASET, '--lr', '5e-5', '--test', GEN_DATASET, '--task', 'once', '--config', 56 | 'voidful/albert_chinese_tiny', '--maxlen', '50']) 57 | result = os.system( 58 | 'tfkit-train --batch 2 --epoch 2 --savedir ' + ONCE_MODEL_DIR + ' --train ' + GEN_DATASET + ' --test ' + GEN_DATASET + ' --task once --config voidful/albert_chinese_tiny --maxlen 50') 59 | self.assertTrue(result == 0) 60 | 61 | def testGenOnceCTC(self): 62 | tfkit.train.main( 63 | ['--batch', '2', '--epoch', '1', '--savedir', ONCECTC_MODEL_DIR, '--train', 64 | GEN_DATASET, '--lr', '3e-4', '--test', GEN_DATASET, '--task', 'oncectc', '--config', 65 | 'voidful/albert_chinese_tiny', '--maxlen', '50']) 66 | result = os.system( 67 | 'tfkit-train --batch 2 --epoch 2 --savedir ' + ONCECTC_MODEL_DIR + ' --train ' + GEN_DATASET + ' --test ' + GEN_DATASET + ' --task oncectc --config voidful/albert_chinese_tiny --maxlen 50') 68 | self.assertTrue(result == 0) 69 | 70 | def testGenSeq2Seq(self): 71 | # result = os.system( 72 | # 'tfkit-train --batch 2 --epoch 1 --savedir ' + SEQ2SEQ_MODEL_DIR + ' --train ' + GEN_DATASET + ' --test ' + GEN_DATASET + ' --task seq2seq --config prajjwal1/bert-small --maxlen 50 --selfkd True') 73 | # self.assertTrue(result == 0) 74 | tfkit.train.main( 75 | ['--batch', '1', '--epoch', '1', '--savedir', SEQ2SEQ_MODEL_DIR, '--train', 76 | GEN_DATASET, '--lr', '5e-4', '--test', GEN_DATASET, '--task', 'seq2seq', '--config', 77 | 'prajjwal1/bert-small', '--maxlen', '20']) 78 | tfkit.train.main( 79 | ['--batch', '2', '--epoch', '2', '--savedir', SEQ2SEQ_MODEL_DIR, '--train', 80 | GEN_DATASET, '--lr', '5e-4', '--test', GEN_DATASET, '--task', 'seq2seq', '--config', 81 | 'prajjwal1/bert-small', '--maxlen', '20', '--likelihood', 'pos']) 82 | 83 | def testGenCLM(self): 84 | result = os.system( 85 | 'tfkit-train --batch 2 --epoch 1 --savedir ' + CLM_MODEL_DIR + ' --train ' + GEN_DATASET + ' --test ' + GEN_DATASET + ' --task clm --config prajjwal1/bert-small --maxlen 50') 86 | self.assertTrue(result == 0) 87 | tfkit.train.main( 88 | ['--batch', '2', '--epoch', '2', '--savedir', CLM_MODEL_DIR, '--train', 89 | GEN_DATASET, '--lr', '5e-4', '--test', GEN_DATASET, '--task', 'clm', '--config', 90 | 'prajjwal1/bert-small', '--maxlen', '20']) 91 | 92 | def testAddTokenFile(self): 93 | tfkit.train.main( 94 | ['--batch', '2', '--epoch', '1', '--savedir', ADDTOKFILE_SAVE_DIR, '--train', 95 | GEN_DATASET, '--lr', '5e-5', '--test', ADDTOK_DATASET, '--task', 'clm', '--config', 96 | 'voidful/albert_chinese_tiny', '--maxlen', '100', '--add_tokens_file', NEWTOKEN_FILE]) 97 | result = os.system( 98 | f'tfkit-train --batch 2 --add_tokens_file {NEWTOKEN_FILE} --savedir {ADDTOKFILE_SAVE_DIR} --epoch 2 --train {ADDTOK_DATASET} --test {ADDTOK_DATASET} --task clm --config voidful/albert_chinese_tiny --maxlen 50') 99 | self.assertTrue(result == 0) 100 | 101 | def testResume(self): 102 | tfkit.train.main( 103 | ['--batch', '2', '--epoch', '1', '--savedir', ONCE_MODEL_DIR, '--train', 104 | GEN_DATASET, '--lr', '5e-5', '--test', GEN_DATASET, '--task', 'once', '--config', 105 | 'voidful/albert_chinese_tiny', '--maxlen', '50', '--tag', 'testresume']) 106 | 107 | tfkit.train.main( 108 | ['--batch', '2', '--epoch', '1', '--savedir', ONCE_MODEL_DIR, '--train', 109 | GEN_DATASET, '--lr', '5e-5', '--test', GEN_DATASET, '--task', 'once', '--config', 110 | 'voidful/albert_chinese_tiny', '--maxlen', '50', '--resume', os.path.join(ONCE_MODEL_DIR, "1.pt")]) 111 | 112
| def testResumeMultiModel(self): 113 | tfkit.train.main( 114 | ['--batch', '2', '--epoch', '1', '--savedir', MTTASK_MODEL_DIR, '--train', CLAS_DATASET, GEN_DATASET, 115 | '--lr', '5e-5', '--test', CLAS_DATASET, GEN_DATASET, '--task', 'once', 'clm', '--config', 116 | 'voidful/albert_chinese_tiny', '--maxlen', '50', '--tag', 'once', 'clm']) 117 | # resume to train all task 118 | tfkit.train.main( 119 | ['--batch', '2', '--epoch', '1', '--savedir', MTTASK_MODEL_DIR, '--train', CLAS_DATASET, GEN_DATASET, 120 | '--lr', '5e-5', '--test', CLAS_DATASET, GEN_DATASET, '--task', 'once', 'clm', '--config', 121 | 'voidful/albert_chinese_tiny', '--maxlen', '50', '--tag', 'once', 'clm', '--resume', 122 | os.path.join(MTTASK_MODEL_DIR, "1.pt")]) 123 | # resume to train only one task 124 | tfkit.train.main( 125 | ['--batch', '2', '--epoch', '1', '--savedir', MTTASK_MODEL_DIR, '--train', 126 | GEN_DATASET, '--lr', '5e-5', '--test', GEN_DATASET, '--task', 'clm', '--config', 127 | 'voidful/albert_chinese_tiny', '--maxlen', '50', '--resume', os.path.join(MTTASK_MODEL_DIR, "1.pt"), 128 | '--tag', 'clm']) 129 | 130 | @pytest.mark.skip() 131 | def testLoggerwandb(self): 132 | tfkit.train.main( 133 | ['--batch', '2', '--epoch', '1', '--savedir', ONCE_MODEL_DIR, '--train', 134 | GEN_DATASET, '--lr', '5e-5', '--test', GEN_DATASET, '--task', 'once', '--config', 135 | 'voidful/albert_chinese_tiny', '--maxlen', '50', '--wandb']) 136 | 137 | def testClas(self): 138 | tfkit.train.main( 139 | ['--batch', '2', '--epoch', '1', '--savedir', CLAS_MODEL_DIR, '--train', 140 | CLAS_DATASET, '--lr', '5e-5', '--test', CLAS_DATASET, '--task', 'clas', '--config', 141 | 'voidful/albert_chinese_tiny', '--maxlen', '50']) 142 | result = os.system( 143 | 'tfkit-train --batch 2 --epoch 2 --savedir ' + CLAS_MODEL_DIR + ' --train ' + CLAS_DATASET + ' --test ' + CLAS_DATASET + ' --task clas --config voidful/albert_chinese_tiny --maxlen 50') 144 | self.assertTrue(result == 0) 145 | 146 | # def testQA(self): 147 | # tfkit.train.main( 148 | # ['--batch', '2', '--epoch', '1', '--savedir', QA_MODEL_DIR, '--train', 149 | # QA_DATASET, '--lr', '5e-5', '--test', QA_DATASET, '--task', 'qa', '--config', 150 | # 'voidful/albert_chinese_tiny', '--maxlen', '512', '--handle_exceed', 'start_slice']) 151 | # result = os.system( 152 | # 'tfkit-train --batch 2 --epoch 2 --savedir ' + QA_MODEL_DIR + ' --train ' + QA_DATASET + ' --test ' + QA_DATASET + ' --task qa --config voidful/albert_chinese_tiny --maxlen 512 --handle_exceed start_slice') 153 | # self.assertTrue(result == 0) 154 | # 155 | # def testTag(self): 156 | # tfkit.train.main( 157 | # ['--batch', '2', '--epoch', '1', '--savedir', TAG_MODEL_DIR, '--train', 158 | # TAG_DATASET, '--lr', '5e-5', '--test', TAG_DATASET, '--task', 'tag', '--config', 159 | # 'voidful/albert_chinese_tiny', '--maxlen', '512', '--handle_exceed', 'slide']) 160 | # result = os.system( 161 | # 'tfkit-train --batch 2 --epoch 2 --savedir ' + TAG_MODEL_DIR + ' --train ' + TAG_DATASET + ' --test ' + TAG_DATASET + ' --task tag --config voidful/albert_chinese_tiny --maxlen 50 --handle_exceed slide') 162 | # self.assertTrue(result == 0) 163 | -------------------------------------------------------------------------------- /tfkit/test/test_package.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from transformers import AutoTokenizer 4 | 5 | import tfkit 6 | import os 7 | 8 | class TestPackage(unittest.TestCase): 9 | 10 | def testImport(self): 11 | path = 
os.path.dirname(tfkit.__file__) 12 | print(path) 13 | tfkit.task 14 | tfkit.utility 15 | -------------------------------------------------------------------------------- /tfkit/test/test_zeval.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tfkit 4 | from tfkit.test import * 5 | 6 | 7 | class TestEval(unittest.TestCase): 8 | 9 | def testHelp(self): 10 | result = os.system('tfkit-eval -h') 11 | self.assertTrue(result == 0) 12 | 13 | def test_parser(self): 14 | parser, _ = tfkit.eval.parse_eval_args( 15 | ['--model', 'once', '--metric', 'emf1', '--valid', 'test.csv', '--print']) 16 | print(parser) 17 | self.assertTrue(parser.get('model') == ['once']) 18 | 19 | eval_parser, model_parser = tfkit.eval.parse_eval_args( 20 | ['--model', 'once', '--metric', 'emf1', '--valid', 'test.csv', '--print', '--decodenum', '2']) 21 | self.assertTrue(eval_parser.get('model') == ['once']) 22 | self.assertTrue(model_parser.get('decodenum') == '2') 23 | 24 | def testEvalGen(self): 25 | tfkit.eval.main( 26 | ['--model', ONCE_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print']) 27 | result = os.system( 28 | 'tfkit-eval --model ' + ONCE_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print') 29 | self.assertTrue(result == 0) 30 | 31 | def testEvalGenOnce(self): 32 | tfkit.eval.main( 33 | ['--model', ONCE_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print']) 34 | result = os.system( 35 | 'tfkit-eval --model ' + ONCE_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print') 36 | self.assertTrue(result == 0) 37 | 38 | def testEvalGenOnceCTC(self): 39 | tfkit.eval.main( 40 | ['--model', ONCECTC_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print']) 41 | result = os.system( 42 | 'tfkit-eval --model ' + ONCECTC_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print') 43 | self.assertTrue(result == 0) 44 | 45 | def testEvalSeq2Seq(self): 46 | tfkit.eval.main( 47 | ['--model', SEQ2SEQ_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print', 48 | '--decodenum', '2']) 49 | tfkit.eval.main( 50 | ['--model', SEQ2SEQ_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print']) 51 | result = os.system( 52 | 'tfkit-eval --model ' + SEQ2SEQ_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print') 53 | self.assertTrue(result == 0) 54 | 55 | def testEvalCLM(self): 56 | tfkit.eval.main( 57 | ['--model', CLM_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print']) 58 | result = os.system( 59 | 'tfkit-eval --model ' + CLM_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print') 60 | self.assertTrue(result == 0) 61 | 62 | def testEvalAddedTokenModel(self): 63 | result = os.system( 64 | 'tfkit-eval --model ' + ADDTOKFILE_MODEL_PATH + ' --valid ' + ADDTOK_DATASET + ' --metric emf1 --print') 65 | self.assertTrue(result == 0) 66 | 67 | def testEvalClassify(self): 68 | tfkit.eval.main( 69 | ['--model', CLAS_MODEL_PATH, '--valid', CLAS_DATASET, '--metric', 'clas', '--print']) 70 | result = os.system( 71 | 'tfkit-eval --model ' + CLAS_MODEL_PATH + ' --valid ' + CLAS_DATASET + ' --metric clas --print') 72 | self.assertTrue(result == 0) 73 | 74 | # def testEvalQA(self): 75 | # tfkit.eval.main( 76 | # ['--model', QA_MODEL_PATH, '--valid', QA_DATASET, '--metric', 'emf1', '--print']) 77 | # result = os.system( 78 | # 'tfkit-eval --model ' + QA_MODEL_PATH + ' --valid ' + QA_DATASET + ' --metric emf1 --print') 79 | # self.assertTrue(result == 0) 80 | # 81 | # 
def testEvalTag(self): 82 | # tfkit.eval.main( 83 | # ['--model', TAG_MODEL_PATH, '--valid', TAG_DATASET, '--metric', 'clas', '--print']) 84 | # result = os.system( 85 | # 'tfkit-eval --model ' + TAG_MODEL_PATH + ' --valid ' + TAG_DATASET + ' --metric clas --print') 86 | # self.assertTrue(result == 0) -------------------------------------------------------------------------------- /tfkit/test/test_zzdump.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from tfkit.test import * 3 | import os 4 | 5 | import tfkit 6 | 7 | 8 | class TestDump(unittest.TestCase): 9 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../")) 10 | MODEL_SAVE_PATH = os.path.join(ROOT_DIR, 'tfkit/test/cache/') 11 | 12 | def testHelp(self): 13 | result = os.system('tfkit-dump -h') 14 | assert (result == 0) 15 | 16 | def test_parser(self): 17 | parser = tfkit.dump.parse_dump_args(['--model', 'a', '--dumpdir', 'b']) 18 | self.assertTrue(parser.get('model') == 'a') 19 | self.assertTrue(parser.get('dumpdir') == 'b') 20 | 21 | def testDump(self): 22 | dump_dir = './cache/dump' 23 | tfkit.dump.main(["--model", CLM_MODEL_PATH, '--dumpdir', dump_dir]) 24 | result = os.system( 25 | 'tfkit-dump --model ' + CLM_MODEL_PATH + ' --dumpdir ' + dump_dir) 26 | self.assertTrue(result == 0) 27 | -------------------------------------------------------------------------------- /tfkit/test/utility/test_utility_data_filereader.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tfkit.test import * 4 | from tfkit.utility.data_filereader import * 5 | 6 | 7 | class TestDataFile(unittest.TestCase): 8 | 9 | def test_get_x_data_from_file(self): 10 | for get_x_iter in [get_gen_data_from_file(GEN_DATASET), 11 | get_qa_data_from_file(QA_DATASET), 12 | get_tag_data_from_file(TAG_DATASET), 13 | get_clas_data_from_file(CLAS_DATASET), 14 | get_multiclas_data_from_file(CLAS_DATASET)]: 15 | while True: 16 | try: 17 | print(next(get_x_iter)) 18 | except StopIteration as e: 19 | task_label_dict = e.value 20 | break 21 | print(task_label_dict) 22 | for k, v in task_label_dict.items(): 23 | print(k, v) 24 | self.assertTrue(isinstance(v, list)) 25 | -------------------------------------------------------------------------------- /tfkit/test/utility/test_utility_data_loader.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import torch 4 | 5 | from tfkit.utility.data_loader import pad_batch 6 | 7 | 8 | class TestUtilityDataLoader(unittest.TestCase): 9 | 10 | def test_batch_reduce_pad(self): 11 | k = [{'input': torch.tensor([1, 2, 3])}, 12 | {'input': torch.tensor([3, 4])}, 13 | {'input': torch.tensor([5])}] 14 | reduced_batch = pad_batch(k) 15 | self.assertEqual(len(reduced_batch[0]['input']), len(reduced_batch[1]['input'])) 16 | print(reduced_batch) 17 | self.assertCountEqual(reduced_batch[0]['input'], [1, 2, 3]) 18 | self.assertCountEqual(reduced_batch[1]['input'], [3, 4, 0]) 19 | -------------------------------------------------------------------------------- /tfkit/test/utility/test_utility_data_processor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from tfkit.test import * 4 | from tfkit.utility.data_filereader import * 5 | 6 | 7 | class TestDataPreprocess(unittest.TestCase): 8 | 9 | def test_get_x_data_from_file(self): 10 | for get_x_iter in [get_gen_data_from_file(GEN_DATASET), 11 |
get_qa_data_from_file(QA_DATASET), 12 | get_tag_data_from_file(TAG_DATASET), 13 | get_clas_data_from_file(CLAS_DATASET), 14 | get_multiclas_data_from_file(CLAS_DATASET)]: 15 | while True: 16 | try: 17 | print(next(get_x_iter)) 18 | except StopIteration as e: 19 | task_label_dict = e.value 20 | break 21 | print(task_label_dict) 22 | for k, v in task_label_dict.items(): 23 | print(k, v) 24 | self.assertTrue(isinstance(v, list)) 25 | -------------------------------------------------------------------------------- /tfkit/test/utility/test_utility_logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import os 4 | 5 | from tfkit.utility.logger import Logger 6 | 7 | dir_path = os.path.dirname(os.path.realpath(__file__)) 8 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir))) 9 | 10 | import unittest 11 | import tfkit 12 | 13 | 14 | class TestLogger(unittest.TestCase): 15 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../")) 16 | MODEL_SAVE_PATH = os.path.join(ROOT_DIR, './test/cache/') 17 | 18 | def test_write_log(self): 19 | logger = Logger(savedir=self.MODEL_SAVE_PATH) 20 | logger.write_log("test") 21 | with open(logger.logfilepath, 'r') as f: 22 | lines = f.read().splitlines() 23 | last_line = lines[-1] 24 | print(last_line) 25 | self.assertEqual(last_line, "test") 26 | 27 | def test_write_metric(self): 28 | logger = Logger(savedir=self.MODEL_SAVE_PATH) 29 | logger.write_metric("test", 1, 0) 30 | with open(logger.metricfilepath, 'r') as f: 31 | last_row = list(csv.reader(f))[-1] 32 | self.assertEqual(last_row, ["test", '1', '0']) 33 | -------------------------------------------------------------------------------- /tfkit/test/utility/test_utility_loss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from torch import nn 6 | from torch.autograd import Variable 7 | 8 | dir_path = os.path.dirname(os.path.realpath(__file__)) 9 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir))) 10 | 11 | import unittest 12 | import tfkit 13 | 14 | 15 | class TestLoss(unittest.TestCase): 16 | outputs = Variable(torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]]), requires_grad=False) 17 | targets = Variable(torch.Tensor([1, 1]).long(), requires_grad=False) 18 | alln_targets = Variable(torch.Tensor([-1, -1]).long(), requires_grad=False) 19 | onen_targets = Variable(torch.Tensor([1, -1]).long(), requires_grad=False) 20 | 21 | def testLabelSmoothingCrossEntropy(self): 22 | outputs = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]]) 23 | targets = torch.Tensor([1, 1]).long() 24 | alln_targets = torch.Tensor([0, -1]).long() 25 | onen_targets = torch.Tensor([1, -1]).long() 26 | 27 | criterion = nn.CrossEntropyLoss(ignore_index=-1) 28 | custom_criterion = tfkit.utility.loss.LabelSmoothingLoss(3, ignore_index=-1) 29 | 30 | self.assertTrue(criterion(outputs, targets).item() < 31 | custom_criterion(outputs, targets).item()) 32 | self.assertTrue(criterion(outputs, onen_targets).item() < 33 | custom_criterion(outputs, onen_targets).item()) 34 | 35 | criterion = nn.CrossEntropyLoss() 36 | custom_criterion = tfkit.utility.loss.LabelSmoothingLoss(3) 37 | self.assertTrue(criterion(outputs, targets).item() < 38 | custom_criterion(outputs, targets).item()) 39 | 40 | custom_criterion = tfkit.utility.loss.LabelSmoothingLoss(3, reduction='none') 41 | 
print(custom_criterion(self.outputs, self.targets)) 42 | self.assertTrue(list(custom_criterion(self.outputs, self.targets).shape) == [2]) 43 | 44 | def testDiceLoss(self): 45 | custom_criterion = tfkit.utility.loss.DiceLoss(ignore_index=-1) 46 | self.assertTrue(0.8 < custom_criterion(self.outputs, self.targets).item() < 1) 47 | self.assertTrue(0.99 < custom_criterion(self.outputs, self.alln_targets).item() <= 1) 48 | self.assertTrue(0.8 < custom_criterion(self.outputs, self.onen_targets).item() < 1) 49 | 50 | custom_criterion = tfkit.utility.loss.DiceLoss(reduction='none') 51 | print(custom_criterion(self.outputs, self.targets)) 52 | self.assertTrue(list(custom_criterion(self.outputs, self.targets).shape) == [2]) 53 | 54 | def testLossDrop(self): 55 | outputs = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]]) 56 | targets = torch.Tensor([1, 1]).long() 57 | norm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) 58 | loss_fct = nn.CrossEntropyLoss(reduction='none', ignore_index=-1) # -1 index = padding token 59 | masked_lm_loss = loss_fct(outputs, targets) 60 | masked_lm_loss = masked_lm_loss.view(-1, len(targets)) # view by batch size 61 | masked_lm_loss = masked_lm_loss.sum(dim=0) 62 | masked_lm_loss = masked_lm_loss.mean() 63 | print(masked_lm_loss.mean(), norm_loss_fct(outputs, targets).mean()) 64 | 65 | def testBCEFocalLoss(self): 66 | outputs = torch.Tensor([[0, 1, 0], [0.2, 0, 0]]) 67 | targets = torch.Tensor([[0, 1, 0], [1, 0, 0]]) 68 | criterion = nn.BCELoss() 69 | custom_criterion = tfkit.utility.loss.BCEFocalLoss() 70 | self.assertTrue(criterion(outputs, targets).item() > 71 | custom_criterion(outputs, targets).item()) 72 | 73 | def testNegativeCElLoss(self): 74 | outputs = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]]) 75 | targets = torch.Tensor([1, 1]).long() 76 | alln_targets = torch.Tensor([-1, -1]).long() 77 | onen_targets = torch.Tensor([1, -1]).long() 78 | 79 | criterion = nn.CrossEntropyLoss(ignore_index=-1) 80 | custom_criterion = tfkit.utility.loss.NegativeCElLoss() 81 | self.assertTrue( 82 | criterion(outputs, targets).item() < custom_criterion(outputs, self.targets).item()) 83 | self.assertTrue(criterion(outputs, onen_targets).item() < custom_criterion(outputs, onen_targets).item()) 84 | 85 | def testFocalLoss(self): 86 | criterion = nn.CrossEntropyLoss(ignore_index=-1) 87 | custom_criterion = tfkit.utility.loss.FocalLoss(gamma=0) 88 | self.assertAlmostEqual(criterion(self.outputs, self.targets).item(), 89 | custom_criterion(self.outputs, self.targets).item()) 90 | self.assertAlmostEqual(criterion(self.outputs, self.alln_targets).item(), 91 | custom_criterion(self.outputs, self.alln_targets).item()) 92 | self.assertAlmostEqual(criterion(self.outputs, self.onen_targets).item(), 93 | custom_criterion(self.outputs, self.onen_targets).item()) 94 | 95 | custom_criterion = tfkit.utility.loss.FocalLoss(gamma=1) 96 | self.assertTrue(criterion(self.outputs, self.targets) > custom_criterion(self.outputs, self.targets)) 97 | self.assertTrue(criterion(self.outputs, self.alln_targets).item() - custom_criterion(self.outputs, 98 | self.alln_targets).item() < 1) 99 | self.assertTrue(criterion(self.outputs, self.onen_targets) > custom_criterion(self.outputs, self.onen_targets)) 100 | -------------------------------------------------------------------------------- /tfkit/test/utility/test_utility_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from 
tfkit.utility.model import list_all_model, load_model_class, load_predict_parameter, load_trained_model 5 | 6 | dir_path = os.path.dirname(os.path.realpath(__file__)) 7 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir))) 8 | 9 | import unittest 10 | from transformers import BertTokenizer, AutoModel 11 | 12 | 13 | class TestModelLoader(unittest.TestCase): 14 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../../")) 15 | MODEL_SAVE_PATH = os.path.join(ROOT_DIR, 'tfkit/test/cache/') 16 | 17 | def test_list_all_model(self): 18 | models = list_all_model() 19 | self.assertTrue(isinstance(models, list)) 20 | 21 | def test_load_model_class(self): 22 | load_model_class('clas') 23 | load_model_class('once') 24 | 25 | def test_load_predict_parameter(self): 26 | model_class = load_model_class('clas') 27 | # load pre-train task 28 | tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny') 29 | pretrained = AutoModel.from_pretrained('voidful/albert_chinese_tiny') 30 | model = model_class.Model(tokenizer=tokenizer, pretrained=pretrained, tasks_detail={"taskA": ["a", "b"]}, 31 | maxlen=128) 32 | clas_param = load_predict_parameter(model) 33 | print("clas_param", clas_param) 34 | self.assertTrue('input' in clas_param) 35 | self.assertTrue('topK' in clas_param) 36 | self.assertTrue('task' in clas_param) 37 | self.assertTrue('handle_exceed' in clas_param) 38 | self.assertTrue(isinstance(clas_param['handle_exceed'], str)) 39 | 40 | # def test_load_trained_model(self): 41 | # model_path = os.path.join(self.MODEL_SAVE_PATH, '1.pt') 42 | # model, model_type, model_class, model_info, preprocessor = load_trained_model(model_path) 43 | # print(model) 44 | # print(model_type) 45 | # print(model_class) 46 | # print(model_info) 47 | # print(model.predict) 48 | # print(model.predict(input="a")) 49 | -------------------------------------------------------------------------------- /tfkit/test/utility/test_utility_tok.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | dir_path = os.path.dirname(os.path.realpath(__file__)) 5 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir))) 6 | 7 | import unittest 8 | import tfkit 9 | from transformers import AutoTokenizer, BertTokenizer 10 | 11 | 12 | class TestTok(unittest.TestCase): 13 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__ + "/../../../")) 14 | DATASET_DIR = os.path.join(ROOT_DIR, 'demo_data') 15 | 16 | def testTok(self): 17 | tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny') 18 | begin = tfkit.utility.tok.tok_begin(tokenizer) 19 | self.assertEqual(begin, "[CLS]") 20 | sep = tfkit.utility.tok.tok_sep(tokenizer) 21 | self.assertEqual(sep, "[SEP]") 22 | mask = tfkit.utility.tok.tok_mask(tokenizer) 23 | self.assertEqual(mask, "[MASK]") 24 | pad = tfkit.utility.tok.tok_pad(tokenizer) 25 | self.assertEqual(pad, "[PAD]") 26 | 27 | def testTokRoberta(self): 28 | tokenizer = AutoTokenizer.from_pretrained('distilroberta-base') 29 | begin = tfkit.utility.tok.tok_begin(tokenizer) 30 | self.assertEqual(begin, "<s>") 31 | sep = tfkit.utility.tok.tok_sep(tokenizer) 32 | self.assertEqual(sep, "</s>") 33 | mask = tfkit.utility.tok.tok_mask(tokenizer) 34 | self.assertEqual(mask, "<mask>") 35 | pad = tfkit.utility.tok.tok_pad(tokenizer) 36 | self.assertEqual(pad, "<pad>") 37 | 38 | def testGetXUnkToken(self): 39 | tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
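# The two helpers exercised below mine tokens that the tokenizer maps to [UNK]
# from the given data files: get_freqK_unk_token keeps those appearing at least
# K times, get_topP_unk_token keeps the top-P probability mass; with no files,
# both are expected to return an empty result.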
40 | result = tfkit.utility.tok.get_topP_unk_token(tokenizer, file_paths=[], topP=0.5) 41 | self.assertFalse(result) 42 | result = tfkit.utility.tok.get_freqK_unk_token(tokenizer, file_paths=[], freqK=10) 43 | self.assertFalse(result) 44 | result = tfkit.utility.tok.get_freqK_unk_token(tokenizer, file_paths=[self.DATASET_DIR + '/unk_tok.csv'], 45 | freqK=1) 46 | self.assertTrue(len(result) > 0) 47 | result = tfkit.utility.tok.get_topP_unk_token(tokenizer, file_paths=[self.DATASET_DIR + '/unk_tok.csv'], 48 | topP=0.9) 49 | self.assertTrue(len(result) > 0) 50 | 51 | def testHandleExceed(self): 52 | tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny') 53 | seq = " ".join([str(_) for _ in range(100)]) 54 | maxlen = 50 55 | for mode in ['noop', 'remove', 'slide', 'start_slice', 'end_slice']: 56 | rlt, _ = tfkit.utility.tok.handle_exceed(tokenizer, seq, maxlen, mode=mode) 57 | if mode == 'remove': 58 | self.assertTrue(len(rlt) == 0) 59 | if mode == 'slide': 60 | self.assertTrue(len(rlt) > 1) 61 | for i in rlt: 62 | print(i) 63 | if mode != 'noop': 64 | self.assertTrue(len(i) == 50) 65 | -------------------------------------------------------------------------------- /tfkit/utility/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/voidful/TFkit/5942b86e9132703ae4f328ba3d199c322b8cd1e4/tfkit/utility/__init__.py -------------------------------------------------------------------------------- /tfkit/utility/data_filereader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from collections import defaultdict 3 | 4 | import nlp2 5 | 6 | 7 | # ignore sklearn warning 8 | def warn(*args, **kwargs): 9 | pass 10 | 11 | 12 | import warnings 13 | 14 | warnings.warn = warn 15 | 16 | from tqdm.auto import tqdm 17 | 18 | from tfkit.utility import tok 19 | 20 | 21 | def get_multiclas_data_from_file(fpath): 22 | task_label_dict = defaultdict(list) 23 | with open(fpath, 'r') as infile: 24 | reader = csv.DictReader(infile) 25 | fieldnames = reader.fieldnames 26 | headers = ['input'] + ['target_' + str(i) for i in range(len(fieldnames) - 1)] 27 | 28 | is_multi_label = "" 29 | for rows in nlp2.read_csv_chunk(fpath, ','): 30 | for row in rows: 31 | if tok.UNIVERSAL_SEP in row[1]: 32 | is_multi_label = "_multi_label" 33 | break 34 | 35 | for rows in nlp2.read_csv_chunk(fpath, ','): 36 | for row in rows: 37 | start_pos = 1 38 | for pos, item in enumerate(row[start_pos:]): 39 | pos += start_pos 40 | task = headers[0] + "_" + headers[pos] + is_multi_label 41 | item = item.strip() 42 | if tok.UNIVERSAL_SEP in item: 43 | for i in item.split(tok.UNIVERSAL_SEP): 44 | if i not in task_label_dict[task]: 45 | task_label_dict[task].append(i) 46 | elif item not in task_label_dict[task]: 47 | task_label_dict[task].append(item) 48 | task_label_dict[task].sort() 49 | 50 | for rows in nlp2.read_csv_chunk(fpath, ','): 51 | chunk = [] 52 | for row in rows: 53 | start_pos = 1 54 | for pos, item in enumerate(row[start_pos:]): 55 | pos += start_pos 56 | task = headers[0] + "_" + headers[pos] + is_multi_label 57 | item = item.strip() 58 | targets = item.split(tok.UNIVERSAL_SEP) if tok.UNIVERSAL_SEP in item else [item] 59 | targets = [task_label_dict[task][task_label_dict[task].index(target)] for target in targets] 60 | input = row[0] 61 | chunk.append({"task": task, "input": input, "target": targets}) 62 | yield chunk 63 | return task_label_dict 64 | 65 | 66 | def get_clas_data_from_file(fpath):
67 | task_label_dict = defaultdict(list) 68 | task = 'clas' 69 | task_label_dict[task] = [] 70 | for rows in nlp2.read_csv_chunk(fpath, ','): 71 | chunk = [] 72 | for row in rows: 73 | source_text = row[0] 74 | target_text = row[1] 75 | if target_text not in task_label_dict[task]: 76 | task_label_dict[task].append(target_text) 77 | chunk.append({"task": task, "input": source_text, "target": task_label_dict[task].index(target_text)}) 78 | yield chunk 79 | return task_label_dict 80 | 81 | 82 | def get_gen_data_from_file(fpath): 83 | task_label_dict = defaultdict(list) 84 | task = 'gen' 85 | task_label_dict[task] = [] 86 | print("Reading data from file...") 87 | for rows in nlp2.read_csv_chunk(fpath, ','): 88 | chunk = [] 89 | for row in rows: 90 | source_text = str(row[0]).strip() 91 | target_text = str(row[1]).strip() 92 | negative_text = str(row[2]).strip() if len(row) > 2 else None 93 | if len(source_text) == 0 or len(target_text) == 0: 94 | continue 95 | chunk.append({"task": task, "input": source_text, "target": target_text, "ntarget": negative_text}) 96 | yield chunk 97 | return task_label_dict 98 | 99 | 100 | def get_qa_data_from_file(fpath): 101 | task_label_dict = defaultdict(list) 102 | task = 'qa' 103 | task_label_dict[task] = [] 104 | for rows in nlp2.read_csv_chunk(fpath, ','): 105 | chunk = [] 106 | for row in rows: 107 | context, start, end = row 108 | chunk.append({"task": task, "input": context, "target": [start, end]}) 109 | yield chunk 110 | return task_label_dict 111 | 112 | 113 | def get_tag_data_from_file(fpath, text_index: int = 0, label_index: int = 1, separator=" "): 114 | task_label_dict = defaultdict(list) 115 | task = 'tag' 116 | labels = [] 117 | for rows in nlp2.read_csv_chunk(fpath, ','): 118 | for row in rows: 119 | for i in row[1].split(separator): 120 | if i not in labels and len(i.strip()) > 0: 121 | labels.append(i) 122 | labels.sort() 123 | task_label_dict[task] = labels 124 | 125 | for rows in nlp2.read_csv_chunk(fpath, ','): 126 | chunk = [] 127 | for row in rows: 128 | chunk.append({"task": task, "input": row[text_index].strip(), "target": row[label_index].strip(), 129 | 'separator': separator}) 130 | yield chunk 131 | return task_label_dict 132 | 133 | 134 | def get_tag_data_from_file_col(fpath, text_index: int = 0, label_index: int = 1, separator=" ", **kwargs): 135 | tasks = defaultdict(list) 136 | task = 'default' 137 | labels = [] 138 | with open(fpath, 'r', encoding='utf-8') as f: 139 | lines = f.read().splitlines() 140 | for line in tqdm(lines): 141 | rows = line.split(separator) 142 | if len(rows) > 1: 143 | if rows[label_index] not in labels and len(rows[label_index]) > 0: 144 | labels.append(rows[label_index]) 145 | labels.sort() 146 | tasks[task] = labels 147 | with open(fpath, 'r', encoding='utf-8') as f: 148 | lines = f.read().splitlines() 149 | x, y = "", "" 150 | for line in tqdm(lines): 151 | rows = line.split(separator) 152 | if len(rows) == 1: 153 | yield tasks, task, x.strip(), [y.strip()] 154 | x, y = "", "" 155 | else: 156 | if len(rows[text_index]) > 0: 157 | x += rows[text_index].replace(" ", "_") + separator 158 | y += rows[label_index].replace(" ", "_") + separator 159 | -------------------------------------------------------------------------------- /tfkit/utility/data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import torch 3 | from torch import nn 4 | from torch.utils import data 5 | 6 | 7 | def index_of(in_list, val): 8 | """ 9 | get token index in list, 
return -1 when it is not in the list 10 | :rtype: int 11 | :param in_list: query list 12 | :param val: query target 13 | :return: position index 14 | """ 15 | try: 16 | return in_list.index(val) 17 | except ValueError: 18 | return -1 19 | 20 | 21 | def pad_batch(batch): 22 | """ 23 | reduce batch padding to the common max length within the batch 24 | it needs to handle some exceptions, since some keys do not need to be padded 25 | :param batch: list of dict, with key input and target as model input and target 26 | :return: list of dict 27 | """ 28 | keys = list(batch[0].keys()) 29 | for k in keys: 30 | batch_key_length = [len(i[k]) if not isinstance(i[k], int) else 1 for i in batch] 31 | if len(set(batch_key_length)) > 1: # are all values the same length? if not, pad to the max 32 | pad_length = max(batch_key_length) 33 | for idx, _ in enumerate(batch): 34 | if f"{k}_pad" in batch[idx]: 35 | padded = nn.ConstantPad1d((0, pad_length - len(batch[idx][k])), batch[idx][f"{k}_pad"][0]) 36 | else: 37 | padded = nn.ConstantPad1d((0, pad_length - len(batch[idx][k])), 0) 38 | # batch[idx][k] = torch.unsqueeze(padded(batch[idx][k]), 0) 39 | batch[idx][k] = padded(batch[idx][k]) 40 | for ind, dat in enumerate(batch): 41 | for k, v in dat.items(): 42 | batch[ind][k] = numpy.asarray(batch[ind][k]) 43 | return batch 44 | 45 | 46 | def dataloader_collate(batch): 47 | """ 48 | collate function that applies batch padding before the default collation 49 | :param batch: list of dict 50 | :return: batch: list of dict 51 | """ 52 | # batch = copy.deepcopy(batch) 53 | return torch.utils.data._utils.collate.default_collate(pad_batch(batch))
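# A minimal sketch of pad_batch in action (illustrative; mirrors the unit test above):
# batch = [{'input': torch.tensor([1, 2, 3])}, {'input': torch.tensor([3, 4])}]
# padded = pad_batch(batch)
# padded[1]['input'] -> array([3, 4, 0]): shorter entries are right-padded with 0
# (or with the value given under an optional "<key>_pad" entry) to the batch max.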
-------------------------------------------------------------------------------- /tfkit/utility/data_processor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from numpy import uint16 4 | 5 | from tfkit.utility import tok 6 | 7 | 8 | class GeneralNLPPreprocessor: 9 | """ 10 | The design of NLPPreprocessor is to handle pure text input, 11 | perform preprocessing on it based on the model's constraints, 12 | and return ids as output. 13 | 14 | This class is applied before model training to split and prepare the data for model input; 15 | it extracts features from the data when converting it to model input. 16 | """ 17 | 18 | def __init__(self, tokenizer, maxlen=512, handle_exceed='slide', reserved_len=0, uint16_save=False, 19 | kwargs={}): 20 | self.tokenizer = tokenizer 21 | self.uint16_save = uint16_save 22 | self.parameters = {**{'tokenizer': tokenizer, 'maxlen': maxlen, 'handle_exceed': handle_exceed, 23 | 'reserved_len': reserved_len}, **kwargs} 24 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 25 | # item = {key: value.tolist() for key, value in item.items()} 26 | self.tok_pad_id = tok.tok_pad_id(tokenizer) 27 | self.tok_bos_id = tok.tok_begin_id(tokenizer) 28 | self.tok_sep_id = tok.tok_sep_id(tokenizer) 29 | self.tok_mask_id = tok.tok_mask_id(tokenizer) 30 | 31 | def read_file_to_data(self, filepath): 32 | raise NotImplementedError('please override this function') 33 | 34 | def set_global_parameters(self): 35 | self.tokenize_target = False 36 | 37 | def preprocess(self, item): 38 | self.set_global_parameters() 39 | preprocessed_data = [] 40 | item = self.preprocess_component_prepare_input(item) 41 | # target may be none in eval 42 | t_input_list, t_target_list, t_input_index, t_target_index = self.preprocess_component_split_into_list( 43 | item['input'], 44 | item.get('target')) 45 | for t_input, t_target, t_input_index, t_target_index in zip(t_input_list, 46 | t_target_list, 47 | t_input_index, 48 | t_target_index): 49 | slice_length = self.parameters['maxlen'] - self.parameters.get('reserved_len') - 3 50 | item['input'] = [tok.tok_begin(self.tokenizer)] + t_input[:slice_length] 51 | item['input_index'] = t_input_index 52 | item['target_index'] = t_target_index 53 | if len(t_target) > 0: 54 | item['target'] = t_target 55 | for convert_feature_input_dict in self.preprocess_component_convert_to_id(item): 56 | if self.uint16_save: 57 | data_item = {k: np.array(v, dtype=uint16) if isinstance(v, list) else v for k, v in 58 | convert_feature_input_dict.items()} 59 | else: 60 | data_item = convert_feature_input_dict 61 | preprocessed_data.append(data_item) 62 | return preprocessed_data 63 | 64 | def preprocess_component_prepare_input(self, item): 65 | if tok.UNIVERSAL_SEP in item['input']: 66 | part = item['input'].split(tok.UNIVERSAL_SEP) 67 | item['previous'] = self.tokenizer.tokenize(part[-1]) 68 | item['input'] = "".join(part[:-1]) 69 | return item 70 | 71 | def preprocess_component_split_into_list(self, input_text, target_text=None): 72 | t_input_list, t_input_index = tok.handle_exceed(self.tokenizer, input_text, 73 | maxlen=self.parameters['maxlen'] - 3, 74 | mode=self.parameters.get('handle_exceed')) 75 | if self.tokenize_target and target_text: 76 | t_target_list, t_target_index = tok.handle_exceed(self.tokenizer, target_text, 77 | maxlen=self.parameters['maxlen'] - 3, 78 | mode=self.parameters.get('handle_exceed')) 79 | elif target_text: 80 | t_target_list, t_target_index = [target_text] * len(t_input_list), [[0]] * len(t_input_list) # one target per input slice 81 | else: 82 | t_target_list, t_target_index = [''] * len(t_input_list), [[0]] * len(t_input_list) 83 | return t_input_list, t_target_list, t_input_index, t_target_index 84 | 85 | def preprocess_component_convert_to_id(self, item): 86 | yield {k: self.tokenizer.convert_tokens_to_ids(v) if isinstance(v, list) else v for k, v in item.items()} 87 | 88 | def postprocess(self, item, tokenizer, maxlen, **kwargs): 89 | return {key: torch.tensor(value) for key, value in item.items() if isinstance(value, list)} 90 | 91 | def postprocess_batch(self, feature_dict, **kwargs): 92 | return {key: torch.unsqueeze(torch.tensor(value), 0).to(self.device) for key, value in feature_dict.items()} 93 | 94 |
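# A minimal end-to-end sketch of the preprocessor flow above (illustrative; assumes a
# hypothetical Hugging Face tokenizer named `tokenizer`):
# proc = GeneralNLPPreprocessor(tokenizer, maxlen=128, handle_exceed='slide')
# rows = proc.preprocess({'input': 'some text', 'target': 'some target'})  # text -> id rows
# feature = proc.postprocess(rows[0], tokenizer, maxlen=128)               # lists -> tensors
# batch = proc.postprocess_batch(feature)                                  # add batch dim, move to device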
95 | class GeneralCVPreprocessor: 96 | def __init__(self, feature_extractor, kwargs={}): 97 | self.feature_extractor = feature_extractor 98 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 99 | self.parameters = {**{'feature_extractor': feature_extractor}, **kwargs} 100 | 101 | def read_file_to_data(self, filepath): 102 | raise NotImplementedError('please override this function') 103 | 104 | def preprocess(self, item): 105 | preprocessed_data = [] 106 | preprocessed_data.append(item) 107 | return preprocessed_data 108 | 109 | def postprocess(self, item, **kwargs): 110 | item['input'] = self.feature_extractor(item['input']) 111 | return {key: torch.tensor(value) for key, value in item.items()} 112 | 113 | 114 | class GeneralSpeechPreprocessor: 115 | def __init__(self, feature_extractor, kwargs={}): 116 | self.feature_extractor = feature_extractor 117 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 118 | self.parameters = {**{'feature_extractor': feature_extractor}, **kwargs} 119 | 120 | def read_file_to_data(self, filepath): 121 | raise NotImplementedError('please override this function') 122 | 123 | def preprocess(self, item): 124 | preprocessed_data = [] 125 | preprocessed_data.append(item) 126 | return preprocessed_data 127 | 128 | def postprocess(self, item, **kwargs): 129 | item['input'] = self.feature_extractor(item['input']) 130 | return {key: torch.tensor(value) for key, value in item.items()} -------------------------------------------------------------------------------- /tfkit/utility/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | from random import choice 4 | 5 | import joblib 6 | import nlp2 7 | from torch.utils import data 8 | from tqdm.contrib.concurrent import process_map 9 | 10 | 11 | def get_dataset(file_path, task_class, tokenizer, parameter): 12 | panel = nlp2.Panel() 13 | # all_arg = nlp2.function_get_all_arg_with_value(task_class.preprocessor.prepare_convert_to_id) 14 | # if parameter.get('panel'): 15 | # print("Operation panel for data preprocessing.") 16 | # for missarg in nlp2.function_check_missing_arg(task_class.preprocessor, 17 | # parameter): 18 | # panel.add_element(k=missarg, v=all_arg[missarg], msg=missarg, default=all_arg[missarg]) 19 | # filled_arg = panel.get_result_dict() 20 | # parameter.update(filled_arg) 21 | ds = TFKitDataset(fpath=file_path, tokenizer=tokenizer, 22 | preprocessor=task_class.Preprocessor, 23 | preprocessing_arg=parameter) 24 | return ds 25 | 26 | 27 | class TFKitDataset(data.Dataset): 28 | def __init__(self, fpath, tokenizer, preprocessor, preprocessing_arg={}): 29 | cache_path = fpath + "_" + tokenizer.name_or_path.replace("/", "_") + ".cache" 30 | self.task_dict = {} 31 | self.preprocessor = preprocessor(tokenizer, kwargs=preprocessing_arg) 32 | self.tokenizer = tokenizer 33 | if os.path.isfile(cache_path) and preprocessing_arg.get('cache', False): 34 | with open(cache_path, "rb") as fo: 35 | outdata = joblib.load(fo) 36 | sample = outdata['sample'] 37 | length = outdata['length'] 38 | self.task_dict = outdata['task'] 39 | else: 40 | print("Start preprocessing...") 41 | sample = defaultdict(list) 42 | length = 0 43 | get_data_item = self.preprocessor.read_file_to_data(fpath) 44 | while True: 45 | try: 46 | for items in process_map(self.preprocessor.preprocess, next(get_data_item), 47 | chunksize=1000): 48 | for i in items: 49 | length += 1 50 | for k, v in i.items(): 51 | sample[k].append(v) 52 | print(f"loaded {length} rows.") 53 | except StopIteration as e: 54 | tasks = e.value 55 | break 56 | self.task_dict = tasks 57 | print(f"There are {length} data samples after preprocessing.") 58 | if preprocessing_arg.get('cache', False): 59 | with open(cache_path, 'wb') as fo: 60 | outdata = {'sample': sample, 'task': self.task_dict, 'length': length} 61 | joblib.dump(outdata, fo) 62 | self.length = length 63 | self.sample = sample 64 | self.task = self.task_dict 65 | 66 | def increase_with_sampling(self, total): 67 | for _ in range(total - self.length): 68 | for key in self.sample.keys(): 69 | self.sample[key].append(choice(self.sample[key])) 70 | 71 | def __len__(self): 72 | return self.length 73 | 74 | def __getitem__(self, idx): 75 | return self.preprocessor.postprocess( 76 | {**{'task_dict': self.task_dict}, **{key: self.sample[key][idx] for key in self.sample.keys()}}, 77 | self.tokenizer, 78 | maxlen=self.preprocessor.parameters['maxlen']) 79 | -------------------------------------------------------------------------------- /tfkit/utility/eval_metric.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import re 3 | import string 4 | from collections import Counter 5 | 
--------------------------------------------------------------------------------
/tfkit/utility/eval_metric.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import re
3 | import string
4 | from collections import Counter
5 | from collections import defaultdict
6 | 
7 | import editdistance as ed
8 | from tqdm.auto import tqdm
9 | 
10 | from tfkit.utility import tok
11 | 
12 | 
13 | def _normalize_answer(s, task='emf1'):
14 |     """Lower text and remove punctuation, articles and extra whitespace."""
15 | 
16 |     def remove_articles(text):
17 |         if len(text) > 1:
18 |             return re.sub(r'\b(a|an|the)\b', ' ', text)
19 |         else:
20 |             return text
21 | 
22 |     def white_space_fix(text):
23 |         return ' '.join(text.split())
24 | 
25 |     def remove_punc(text):
26 |         exclude = set(string.punctuation)
27 |         return ''.join(ch for ch in text if ch not in exclude)
28 | 
29 |     def lower(text):
30 |         return text.lower()
31 | 
32 |     if task == 'emf1':
33 |         return white_space_fix(remove_articles(remove_punc(lower(s))))
34 |     else:
35 |         return white_space_fix(remove_punc(lower(s)))
36 | 
37 | 
38 | def _f1_score(prediction, ground_truth):
39 |     prediction_tokens = _normalize_answer(prediction).split()
40 |     ground_truth_tokens = _normalize_answer(ground_truth).split()
41 |     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
42 |     num_same = sum(common.values())
43 |     if num_same == 0:
44 |         return 0
45 |     precision = 1.0 * num_same / len(prediction_tokens)
46 |     recall = 1.0 * num_same / len(ground_truth_tokens)
47 |     f1 = (2 * precision * recall) / (precision + recall)
48 |     return f1
49 | 
50 | 
51 | def _cer(groundtruth, hypothesis):
52 |     err = 0
53 |     tot = 0
54 |     for p, t in zip(hypothesis, groundtruth):
55 |         err += float(ed.eval(p.lower(), t.lower()))
56 |         tot += len(t)
57 |     return err / tot
58 | 
59 | 
60 | def _wer(groundtruth, hypothesis):
61 |     err = 0
62 |     tot = 0
63 |     for p, t in zip(hypothesis, groundtruth):
64 |         p = p.lower().split(' ')
65 |         t = t.lower().split(' ')
66 |         err += float(ed.eval(p, t))
67 |         tot += len(t)
68 |     return err / tot
69 | 
70 | 
71 | class EvalMetric:
72 | 
73 |     def __init__(self, tokenizer, normalize_text=True):
74 |         self.tasks = defaultdict(lambda: defaultdict(list))
75 |         self.tokenizer = tokenizer
76 |         self.target_list = defaultdict(lambda: defaultdict(int))
77 |         self.normalize_text = normalize_text
78 | 
79 |     def tokenize_text(self, text):
80 |         text = self.tokenizer.decode(self.tokenizer.encode(text, add_special_tokens=False))
81 |         if self.normalize_text:
82 |             text = text.replace(tok.tok_sep(self.tokenizer), " ")
83 |             # return _normalize_answer(text, task='others')  # remove punctuation
84 |             # keep punctuation
85 |             text = "".join(
86 |                 (char if char.isalpha() or char == " " else " " + char + " ") for char in text)  # separate punctuation
87 |             text = ' '.join(text.split()).lower().strip()  # remove extra blanks
88 |         return text
89 | 
90 |     def add_record(self, ori_input, ori_predicted, ori_target, task='default'):
91 |         input = predicted = target = ""
92 |         input_list, predicted_list, ori_predicted_list, target_list = [], [], [], []
93 | 
94 |         if isinstance(ori_input, str):
95 |             input = self.tokenize_text(ori_input.strip())
96 |             input_list = [input]
97 |         if isinstance(ori_input, list):
98 |             input_list = copy.copy(ori_input)
99 |             for i, t in enumerate(ori_input):
100 |                 input_list[i] = self.tokenize_text(t.strip())
101 |             input = " ".join(input_list)
102 | 
103 |         if isinstance(ori_predicted, str):
104 |             predicted = self.tokenize_text(ori_predicted)
105 |             predicted_list = [predicted]
106 |             ori_predicted_list = [ori_predicted]
107 |         if isinstance(ori_predicted, list):
108 |             predicted_list = copy.copy(ori_predicted)
109 |             ori_predicted_list = copy.copy(ori_predicted)
110 |             for i, t in enumerate(ori_predicted):
111 |                 if not isinstance(t, list):
112 |                     predicted_list[i] = self.tokenize_text(t.strip())
113 |                     ori_predicted_list[i] = t
114 |                 else:
115 |                     predicted_list[i] = ''
116 |                     ori_predicted_list[i] = ''
117 |             predicted = " ".join(predicted_list)
118 |         if isinstance(ori_target, str):
119 |             target_list = []
120 |             if tok.UNIVERSAL_SEP in ori_target:
121 |                 target = ori_target
122 |                 target_list.extend([self.tokenize_text(st.strip()) for st in ori_target.split(tok.UNIVERSAL_SEP)])
123 |             else:
124 |                 target = self.tokenize_text(ori_target.strip())
125 |                 target_list.append(target)
126 |         elif isinstance(ori_target, list):
127 |             for i, t in enumerate(ori_target):
128 |                 if not isinstance(t, list):
129 |                     ori_target[i] = self.tokenize_text(t.strip())
130 | 
131 |             target_list = ori_target
132 | 
133 |         for t in target_list:
134 |             self.target_list[task][t] += 1
135 | 
136 |         self.tasks[task]['input'].append(input)
137 |         self.tasks[task]['input_list'].append(input_list)
138 |         self.tasks[task]['predicted'].append(predicted)
139 |         self.tasks[task]['predicted_list'].append(predicted_list)
140 |         self.tasks[task]['target'].append(target)
141 |         self.tasks[task]['target_list'].append(target_list)
142 |         self.tasks[task]['ori_input'].append(ori_input)
143 |         self.tasks[task]['ori_predicted'].append(ori_predicted)
144 |         self.tasks[task]['ori_predicted_list'].append(ori_predicted_list)
145 |         self.tasks[task]['ori_target'].append(ori_target)
146 | 
147 |     def get_record(self, task='default'):
148 |         return self.tasks[task]
149 | 
150 |     def cal_score(self, metric):
151 |         data_score = []
152 |         for task_name, task in self.tasks.items():
153 |             print("Task : " + task_name + " report ")
154 |             if "emf1" in metric:
155 |                 em = 0
156 |                 total = 0
157 |                 f1 = 0
158 |                 for pos, predict in enumerate(task['predicted']):
159 |                     em_list = []
160 |                     f1_list = []
161 |                     for target in task['target_list'][pos]:
162 |                         if (_normalize_answer(str(predict)) == _normalize_answer(str(target)) and len(
163 |                                 _normalize_answer(str(predict))) > 0) or len(str(predict)) == len(str(target)) == 0:
164 |                             em_score = 1
165 |                             f1_score = 1
166 |                         else:
167 |                             em_score = 0
168 |                             f1_score = _f1_score(str(predict), str(target))
169 |                         em_list.append(em_score)
170 |                         f1_list.append(f1_score)
171 |                     em += max(em_list)
172 |                     f1 += max(f1_list)
173 |                     data_score.append([predict, task['target_list'][pos][em_list.index(max(em_list))],
174 |                                        {'em': max(em_list), 'f1': max(f1_list)}])
175 |                     total += 1
176 |                 result = {"EM": em / max(total, 1), "F1": f1 / max(total, 1)}
177 |                 data_score = sorted(data_score, key=lambda i: i[2]['em'], reverse=True)
178 |             if "er" in metric:
179 |                 predicts = []
180 |                 targets = []
181 |                 for pos, predict in enumerate(task['predicted']):
182 |                     wer_list = []
183 |                     cer_list = []
184 |                     for target in task['target_list'][pos]:
185 |                         if len(target) > 0 and len(predict) > 0:
186 |                             wer_list.append(100 * _wer([target], [predict]))
187 |                             cer_list.append(100 * _cer([target], [predict]))
188 |                         else:
189 |                             wer_list.append(100)
190 |                             cer_list.append(100)
191 |                     wer = min(wer_list)
192 |                     cer = min(cer_list)
193 |                     target = task['target_list'][pos][wer_list.index(wer)]
194 |                     predicts.append(predict)
195 |                     targets.append(target)
196 |                     data_score.append([predict, target, {'wer': wer, 'cer': cer}])
197 | 
198 |                 wer = 100 * _wer(targets, predicts) if len(targets) > 0 else 100
199 |                 cer = 100 * _cer(targets, predicts) if len(targets) > 0 else 100
200 |                 result = {"WER": wer, "CER": cer}
201 |                 data_score = sorted(data_score, key=lambda i: i[2]['wer'], reverse=False)
202 |             if "nlg" in metric:
203 |                 try:
204 |                     from nlgeval import NLGEval
205 |                 except ImportError:
206 |                     print(
207 |                         "The nlg-eval package is not installed; please install it: pip install git+https://github.com/voidful/nlg-eval.git ; nlg-eval --setup ./nlg-eval-data/")
208 |                     raise
209 |                 nlgeval = NLGEval(no_skipthoughts=True, no_glove=True, metrics_to_omit=["METEOR"])
210 | 
211 |                 target_list = task['target_list']
212 |                 predicted = task['predicted']
213 |                 max_candidate = max(len(i) for i in target_list)
214 |                 for idx, tl in enumerate(target_list):
215 |                     if max_candidate - len(tl) > 0:
216 |                         target_list[idx].extend([""] * (max_candidate - len(tl)))
217 | 
218 |                 for t, p in tqdm(zip(target_list, predicted), total=len(target_list)):
219 |                     data_score.append([p, t, nlgeval.compute_metrics(ref_list=list(map(list, zip(t))), hyp_list=[p])])
220 |                 result = nlgeval.compute_metrics(ref_list=list(map(list, zip(*task['target_list']))),  # transpose
221 |                                                  hyp_list=predicted)
222 |                 data_score = sorted(data_score, key=lambda i: i[2]['ROUGE_L'])
223 |             if "clas" in metric:
224 |                 from sklearn.metrics import classification_report
225 |                 from sklearn.preprocessing import MultiLabelBinarizer
226 |                 from sklearn.metrics import precision_recall_fscore_support
227 |                 target_key = [t for t in self.target_list[task_name].keys() if len(t) > 0]
228 |                 mlb = MultiLabelBinarizer().fit([target_key])
229 |                 # remove all blank targets
230 |                 task['target_list'] = [[j for j in sub if len(j) > 0] for sub in task['target_list']]
231 |                 # modify for tagging results
232 |                 if isinstance(task['ori_predicted_list'][0][0], list):
233 |                     target_list = sum([[[j] for j in sub] for sub in task['target_list']], [])
234 |                     predicted = sum([[[j] for j in sub] for sub in task['ori_predicted_list']], [])
235 |                     if len(target_list) != len(predicted):
236 |                         diff = len(target_list) - len(predicted)
237 |                         predicted.extend([['']] * diff)
238 |                 else:
239 |                     target_list = task['target_list']
240 |                     predicted = task['ori_predicted_list']
241 | 
242 |                 for p, t in zip(predicted, target_list):
243 |                     score = dict(zip(["precision", "recall", "fbeta_score", "support"],
244 |                                      precision_recall_fscore_support(mlb.transform([t]), mlb.transform([p]),
245 |                                                                      average='weighted')))
246 |                     data_score.append([p, t, score])
247 |                 result = classification_report(
248 |                     mlb.transform(target_list),
249 |                     mlb.transform(predicted),
250 |                     target_names=list(mlb.classes_))
251 |                 data_score = sorted(data_score, key=lambda i: i[2]['fbeta_score'])
252 |             yield (task_name, result, data_score)
253 | 
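A minimal usage sketch (an annotation, not repository code): scoring one QA-style prediction with exact match and F1. The checkpoint name and strings are illustrative; `///` is the `UNIVERSAL_SEP` used above to separate alternative gold answers.

    from transformers import AutoTokenizer
    from tfkit.utility.eval_metric import EvalMetric

    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')  # illustrative checkpoint
    metric = EvalMetric(tokenizer)
    metric.add_record(ori_input='who wrote hamlet ?',
                      ori_predicted='william shakespeare',
                      ori_target='William Shakespeare///Shakespeare')  # two acceptable answers
    for task_name, result, data_score in metric.cal_score('emf1'):  # cal_score is a generator
        print(task_name, result)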
--------------------------------------------------------------------------------
/tfkit/utility/logger.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | import json
4 | 
5 | 
6 | class Logger:
7 | 
8 |     def __init__(self, savedir, logfilename="message.log", metricfilename="metric.log", tensorboard=False, wandb=False,
9 |                  print_fn=print):
10 |         self.savedir = savedir
11 |         self.logfilepath = os.path.join(savedir, logfilename)
12 |         self.metricfilepath = os.path.join(savedir, metricfilename)
13 |         self.tensorboard_writer = None
14 |         self.wandb_writer = None
15 |         self.print_fn = print_fn
16 |         if tensorboard:
17 |             from torch.utils.tensorboard import SummaryWriter
18 |             self.tensorboard_writer = SummaryWriter()
19 |         if wandb:
20 |             import wandb as wandb_module  # avoid shadowing the `wandb` flag argument
21 |             project_name = savedir.replace("/", "_")
22 |             self.wandb_writer = wandb_module.init(project=project_name)
23 | 
24 |     def write_config(self, config_dict):
25 |         if self.wandb_writer:
26 |             self.wandb_writer.config.update(config_dict)
27 |         if self.tensorboard_writer:
28 |             self.tensorboard_writer.add_hparams(config_dict, {})  # add_hparams requires a metric dict; pass an empty one
29 | 
30 |         with open(self.metricfilepath, "a", encoding='utf8') as log_file:
31 |             writer = csv.writer(log_file)
32 |             writer.writerow([json.dumps(config_dict)])
33 | 
34 |     def write_log(self, *args):
35 |         line = ' '.join([str(a) for a in args])
36 |         with open(self.logfilepath, "a", encoding='utf8') as log_file:
37 |             log_file.write(line + '\n')
38 |         self.print_fn(line)
39 | 
40 |     def write_metric(self, tag, scalar_value, global_step):
41 |         if self.wandb_writer:
42 |             self.wandb_writer.log({tag: scalar_value, "global_step": global_step})
43 |         if self.tensorboard_writer:
44 |             self.tensorboard_writer.add_scalar(tag, scalar_value, global_step)
45 |         with open(self.metricfilepath, "a", encoding='utf8') as log_file:
46 |             writer = csv.writer(log_file)
47 |             writer.writerow([tag, scalar_value, global_step])
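A minimal usage sketch (an annotation, not repository code): plain file logging with tensorboard and wandb disabled. The save directory is illustrative and must exist before the logger writes to it.

    import os
    from tfkit.utility.logger import Logger

    os.makedirs('checkpoints/demo', exist_ok=True)  # Logger appends to files inside savedir
    logger = Logger(savedir='checkpoints/demo')
    logger.write_config({'lr': 5e-5, 'batch': 32})       # one JSON row in metric.log
    logger.write_log('epoch', 1, 'finished')             # message.log plus stdout
    logger.write_metric('train/loss', 0.42, global_step=100)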
--------------------------------------------------------------------------------
/tfkit/utility/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 | 
5 | 
6 | class BCEFocalLoss(nn.Module):
7 |     def __init__(self, gamma=2):
8 |         super(BCEFocalLoss, self).__init__()
9 |         self.gamma = gamma
10 | 
11 |     def forward(self, input, target):
12 |         BCE_loss = F.binary_cross_entropy_with_logits(input, target, reduction='none')
13 |         pt = torch.exp(-BCE_loss)  # prevents nans when probability 0
14 |         focal_loss = (1 - pt) ** self.gamma * BCE_loss
15 |         return focal_loss.mean()
16 | 
17 | 
18 | class FocalLoss(nn.Module):
19 |     def __init__(self, gamma=2, ignore_index=-1):
20 |         super(FocalLoss, self).__init__()
21 |         self.gamma = gamma
22 |         self.softmax = nn.Softmax(dim=1)
23 |         self.nll = nn.NLLLoss(ignore_index=ignore_index)
24 | 
25 |     def forward(self, input, target):
26 |         softmax = self.softmax(input)
27 |         logpt = torch.log(softmax)
28 |         pt = logpt.detach().exp()  # torch.autograd.Variable is deprecated; detach() stops the gradient
29 |         return self.nll((1 - pt) ** self.gamma * logpt, target)
30 | 
31 | 
32 | class SeqCTCLoss(nn.Module):
33 |     def __init__(self, blank_index):
34 |         super(SeqCTCLoss, self).__init__()
35 |         self.blank_index = blank_index
36 | 
37 |     def forward(self, logits, input_lengths, targets, target_lengths):
38 |         # lengths : (batch_size, )
39 |         # log_logits : (T, batch_size, n_class), this kind of shape is required for ctc_loss
40 |         # log_logits = logits + (logit_mask.unsqueeze(-1) + 1e-45).log()
41 |         log_logits = logits.log_softmax(-1).transpose(0, 1)
42 |         loss = F.ctc_loss(log_logits,
43 |                           targets,
44 |                           input_lengths,
45 |                           target_lengths,
46 |                           blank=self.blank_index,
47 |                           reduction='mean',
48 |                           zero_infinity=True)
49 |         return loss
50 | 
51 | 
52 | class SelfKDLoss(nn.Module):
53 | 
54 |     def __init__(self, alpha=0.1, temperature=2, ignore_index=-1):
55 |         super(SelfKDLoss, self).__init__()
56 |         self.alpha = alpha
57 |         self.temperature = temperature
58 |         self.ignore_index = ignore_index
59 | 
60 |     def forward(self, outputs, teacher_outputs, labels):
61 |         loss = nn.KLDivLoss()(F.log_softmax(outputs / self.temperature, dim=-1),
62 |                               F.softmax(teacher_outputs / self.temperature, dim=-1)) * (
63 |                        self.alpha * self.temperature * self.temperature) + \
64 |                F.cross_entropy(outputs, labels, ignore_index=self.ignore_index) * (1. - self.alpha)
65 |         return loss
66 | 
67 | 
68 | class DiceLoss(nn.Module):
69 |     """From 'Dice Loss for Data-imbalanced NLP Tasks'"""
70 | 
71 |     def __init__(self, ignore_index=None, reduction='mean'):
72 |         super(DiceLoss, self).__init__()
73 |         self.ignore_index = ignore_index
74 |         self.reduction = reduction
75 | 
76 |     def forward(self, y_pred, y_true):
77 |         y_pred = torch.softmax(y_pred, dim=1)
78 |         if self.ignore_index is not None:
79 |             mask = y_true == self.ignore_index
80 |             filtered_target = y_true.clone()  # clone so the caller's labels are not mutated
81 |             filtered_target[mask] = 0
82 |             mask = mask.unsqueeze(1).expand(y_pred.data.size())
83 |             y_pred[mask] = 0
84 |             pred_prob = torch.gather(y_pred, dim=1, index=filtered_target.unsqueeze(1))
85 |         else:
86 |             pred_prob = torch.gather(y_pred, dim=1, index=y_true.unsqueeze(1))
87 |         dsc_i = 1 - ((1 - pred_prob) * pred_prob) / ((1 - pred_prob) * pred_prob + 1)
88 |         if self.reduction == 'mean':
89 |             return dsc_i.mean()
90 |         else:
91 |             return dsc_i.view(-1)
92 | 
93 | 
94 | class NegativeCElLoss(nn.Module):
95 |     def __init__(self, ignore_index=-1, reduction='mean'):
96 |         super(NegativeCElLoss, self).__init__()
97 |         self.softmax = nn.Softmax(dim=1)
98 |         self.alpha = 1
99 |         self.nll = nn.NLLLoss(ignore_index=ignore_index, reduction=reduction)
100 | 
101 |     def forward(self, input, target):
102 |         nsoftmax = self.softmax(input)
103 |         nsoftmax = torch.clamp((1.0 - nsoftmax), min=1e-32)
104 |         return self.nll(torch.log(nsoftmax) * self.alpha, target)
105 | 
106 | 
107 | class LabelSmoothingLoss(nn.Module):
108 |     def __init__(self, classes, smoothing=0.1, dim=-1, ignore_index=None, reduction='mean'):
109 |         super(LabelSmoothingLoss, self).__init__()
110 |         self.confidence = 1.0 - smoothing
111 |         self.smoothing = smoothing
112 |         self.cls = classes
113 |         self.dim = dim
114 |         self.reduction = reduction
115 |         self.ignore_index = ignore_index
116 | 
117 |     def forward(self, pred, target):
118 |         pred = pred.log_softmax(dim=self.dim)
119 |         with torch.no_grad():
120 |             true_dist = torch.zeros_like(pred)
121 |             true_dist.fill_(self.smoothing / (self.cls - 1))
122 |             if self.ignore_index is not None:
123 |                 mask = target == self.ignore_index
124 |                 filtered_target = target.clone()
125 |                 filtered_target[mask] = 0
126 |                 true_dist.scatter_(1, filtered_target.unsqueeze(1), self.confidence)
127 |                 mask = mask.unsqueeze(1).expand(pred.data.size())
128 |                 true_dist[mask] = 0
129 |             else:
130 |                 true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
131 |         if self.reduction == 'mean':
132 |             return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))
133 |         else:
134 |             return torch.sum(-true_dist * pred, dim=self.dim)
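A minimal usage sketch (an annotation, not repository code): LabelSmoothingLoss on random logits. With smoothing 0.1 over 5 classes, the true class gets probability 0.9 and each other class gets 0.1 / 4 = 0.025; rows labeled with the ignore index are zeroed out of the target distribution.

    import torch
    from tfkit.utility.loss import LabelSmoothingLoss

    criterion = LabelSmoothingLoss(classes=5, smoothing=0.1, ignore_index=-1)
    logits = torch.randn(4, 5)             # (batch, classes)
    labels = torch.tensor([0, 2, 4, -1])   # -1 marks an ignored position
    print(criterion(logits, labels))       # scalar loss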
--------------------------------------------------------------------------------
/tfkit/utility/model.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import importlib
3 | import os
4 | from typing import List
5 | 
6 | import inquirer
7 | import nlp2
8 | import torch
9 | from torch import nn
10 | from transformers import AutoTokenizer, AutoModel
11 | 
12 | 
13 | def list_all_model(ignore_list=[]):
14 |     dataset_dir = os.path.abspath(__file__ + "/../../") + '/task'
15 |     return list(filter(
16 |         lambda x: os.path.isdir(os.path.join(dataset_dir, x)) and '__pycache__' not in x and x not in ignore_list,
17 |         os.listdir(dataset_dir)))
18 | 
19 | 
20 | def load_predict_parameter(model, model_arg={}, enable_arg_panel=False):
21 |     """use an inquirer panel to let the user input task parameters, or just use the default values"""
22 |     return nlp2.function_argument_panel(model.predictor.wrap_input, model_arg,
23 |                                         disable_input_panel=(not enable_arg_panel),
24 |                                         func_parent=model,
25 |                                         ignore_empty=True)
26 | 
27 | 
28 | def load_model_class(model_name):
29 |     return importlib.import_module('.' + model_name, 'tfkit.task')
30 | 
31 | 
32 | def load_pretrained_model(pretrained_config, model_type):
33 |     pretrained = AutoModel.from_pretrained(pretrained_config)
34 |     if 'clm' in model_type:
35 |         pretrained.config.is_decoder = True
36 |     return pretrained
37 | 
38 | 
39 | def load_pretrained_tokenizer(pretrained_config):
40 |     tokenizer = AutoTokenizer.from_pretrained(pretrained_config)
41 |     return tokenizer
42 | 
43 | 
44 | def resize_pretrain_tok(pretrained, tokenizer):
45 |     if pretrained.config.vocab_size != len(tokenizer):
46 |         pretrained.resize_token_embeddings(len(tokenizer))
47 |     return pretrained, tokenizer
48 | 
49 | 
50 | def add_tokens_to_pretrain(pretrained, tokenizer, add_tokens, sample_init=False):
51 |     origin_vocab_size = tokenizer.vocab_size
52 |     print("===ADD TOKEN===")
53 |     num_added_toks = tokenizer.add_tokens(add_tokens)
54 |     print('We have added', num_added_toks, 'tokens')
55 |     pretrained.resize_token_embeddings(len(tokenizer))
56 |     if sample_init:
57 |         input_embedding = pretrained.get_input_embeddings()
58 |         state_dict_weight = input_embedding.state_dict()['weight']
59 |         state_dict_weight[origin_vocab_size:len(tokenizer)] = copy.copy(
60 |             state_dict_weight[100:100 + num_added_toks])
61 |         pretrained.set_input_embeddings(input_embedding)
62 |     print("===============")
63 |     return pretrained, tokenizer
64 | 
65 | 
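# --- annotation (not part of the repository) ----------------------------------
# A minimal sketch of growing the vocabulary before training, using the helpers
# above; the checkpoint name and token list are illustrative:
#
#     tokenizer = load_pretrained_tokenizer('bert-base-cased')
#     pretrained = load_pretrained_model('bert-base-cased', model_type=['clas'])
#     pretrained, tokenizer = add_tokens_to_pretrain(pretrained, tokenizer,
#                                                    ['<ent>'], sample_init=True)
# -------------------------------------------------------------------------------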
66 | def load_trained_model(model_path, pretrained_config=None, tag=None):
67 |     """loading a saved task"""
68 | 
69 |     device = 'cuda' if torch.cuda.is_available() else 'cpu'
70 |     torchpack = torch.load(model_path, map_location=device)
71 | 
72 |     model_info = {key: torchpack[key] for key in torchpack.keys() if 'state_dict' not in key and 'models' not in key}
73 |     print("===task info===")
74 |     for k, v in model_info.items():
75 |         if isinstance(v, list) and len(v) > 10:
76 |             print(k, v[:10], "...")
77 |         else:
78 |             print(k, v)
79 |     print('===============')
80 | 
81 |     if 'tags' in torchpack and len(torchpack['tags']) > 1:
82 |         if tag is None:
83 |             print("Pick which models to use in multi-task models")
84 |             inquirer_res = inquirer.prompt(
85 |                 [inquirer.List('tag', message="Select task", choices=torchpack['tags'])])
86 |             tag = inquirer_res['tag']
87 |         type_ind = torchpack['tags'].index(tag)
88 |     else:
89 |         type_ind = 0
90 |     print("loading saved task")
91 | 
92 |     # get all loading parameters
93 |     maxlen = torchpack['maxlen']
94 |     if pretrained_config is not None:
95 |         config = pretrained_config
96 |     else:
97 |         config = torchpack['model_config'] if 'model_config' in torchpack else torchpack['bert']
98 |     model_types = [torchpack['type']] if not isinstance(torchpack['type'], list) else torchpack['type']
99 |     models_state = torchpack['models'] if 'models' in torchpack else [torchpack['model_state_dict']]
100 |     task_type = model_types[type_ind]
101 |     add_tokens = torchpack['add_tokens'] if 'add_tokens' in torchpack else None
102 |     # load task
103 |     tokenizer = AutoTokenizer.from_pretrained(config)
104 |     pretrained = AutoModel.from_pretrained(config)
105 | 
106 |     if add_tokens:  # guard: tokenizer.add_tokens(None) would fail
107 |         pretrained, tokenizer = add_tokens_to_pretrain(pretrained, tokenizer, add_tokens)
108 | 
109 |     model_class = load_model_class(task_type)
110 |     task_detail = {}
111 |     if 'task-label' in torchpack:
112 |         task_detail = torchpack['task-label']
113 |     elif 'label' in torchpack:
114 |         task_detail = {'label': torchpack['label']}
115 | 
116 |     model = model_class.Model(tokenizer=tokenizer, pretrained=pretrained, tasks_detail=task_detail,
117 |                               maxlen=maxlen)
118 |     model.load_state_dict(models_state[type_ind], strict=False)
119 |     model = model.to(device)
120 | 
121 |     preprocessor = model_class.Preprocessor(tokenizer)
122 | 
123 |     print("finished loading")
124 |     return model, task_type, model_class, model_info, preprocessor
125 | 
126 | 
127 | def save_model(models, input_arg, models_tag, epoch, fname, logger, accelerator, add_tokens=None):
128 |     accelerator.wait_for_everyone()
129 |     model_pack = {
130 |         'models': [accelerator.get_state_dict(m) for m in models],
131 |         'model_config': input_arg.get('config'),
132 |         'add_tokens': add_tokens,
133 |         'tags': models_tag,
134 |         'type': input_arg.get('task'),
135 |         'maxlen': input_arg.get('maxlen'),
136 |         'epoch': epoch
137 |     }
138 | 
139 |     for ind, m in enumerate(input_arg.get('task')):
140 |         if 'tag' in m:
141 |             model_pack['label'] = models[ind].labels
142 |         if "clas" in m:
143 |             model_pack['task-label'] = models[ind].tasks_detail
144 | 
145 |     torch.save(model_pack, f"{fname}.pt")
146 |     logger.write_log(f"weights were saved to {fname}.pt")
147 | 
148 | 
149 | def tie_encoder_decoder_weights(encoder, decoder, base_model_prefix):
150 |     uninitialized_encoder_weights: List[str] = []
151 |     if decoder.__class__ != encoder.__class__:
152 |         print(
153 |             f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
154 |         )
155 | 
156 |     def tie_encoder_to_decoder_recursively(
157 |             decoder_pointer: nn.Module,
158 |             encoder_pointer: nn.Module,
159 |             module_name: str,
160 |             uninitialized_encoder_weights: List[str],
161 |             depth=0,
162 |     ):
163 |         assert isinstance(decoder_pointer, nn.Module) and isinstance(
164 |             encoder_pointer, nn.Module
165 |         ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
166 |         if hasattr(decoder_pointer, "weight"):
167 |             assert hasattr(encoder_pointer, "weight")
168 |             encoder_pointer.weight = decoder_pointer.weight
169 |             if hasattr(decoder_pointer, "bias"):
170 |                 assert hasattr(encoder_pointer, "bias")
171 |                 encoder_pointer.bias = decoder_pointer.bias
172 |             return
173 | 
174 |         encoder_modules = encoder_pointer._modules
175 |         decoder_modules = decoder_pointer._modules
176 |         if len(decoder_modules) > 0:
177 |             assert (
178 |                     len(encoder_modules) > 0
179 |             ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
180 | 
181 |             all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()])
182 |             encoder_layer_pos = 0
183 |             for name, module in decoder_modules.items():
184 |                 if name.isdigit():
185 |                     encoder_name = str(int(name) + encoder_layer_pos)
186 |                     decoder_name = name
187 |                     if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len(
188 |                             encoder_modules
189 |                     ) != len(decoder_modules):
190 |                         # this can happen if the name corresponds to the position in a module list of layers
191 |                         # in this case the decoder has added a cross-attention layer that the encoder does not have
192 |                         # thus skip this step and subtract one layer position from the encoder
193 |                         encoder_layer_pos -= 1
194 |                         continue
195 |                 elif name not in encoder_modules:
196 |                     continue
197 |                 elif depth > 500:
198 |                     raise ValueError(
199 |                         "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your task."
200 |                     )
201 |                 else:
202 |                     decoder_name = encoder_name = name
203 |                 tie_encoder_to_decoder_recursively(
204 |                     decoder_modules[decoder_name],
205 |                     encoder_modules[encoder_name],
206 |                     module_name + "/" + name,
207 |                     uninitialized_encoder_weights,
208 |                     depth=depth + 1,
209 |                 )
210 |                 all_encoder_weights.remove(module_name + "/" + encoder_name)
211 | 
212 |             uninitialized_encoder_weights += list(all_encoder_weights)
213 | 
214 |     # tie weights recursively
215 |     tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights)
216 |     if len(uninitialized_encoder_weights) > 0:
217 |         print(
218 |             f"The following encoder weights were not tied to the decoder {uninitialized_encoder_weights}"
219 |         )
220 |     else:
221 |         print("All encoder weights tied to the decoder")
--------------------------------------------------------------------------------
/tfkit/utility/tok.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | 
3 | import nlp2
4 | from tqdm import tqdm
5 | from transformers import AutoTokenizer
6 | 
7 | UNIVERSAL_SEP = "///"
8 | 
9 | 
10 | def tok_begin(tokenizer):
11 |     if tokenizer.special_tokens_map.get('bos_token') is not None:
12 |         return tokenizer.special_tokens_map.get('bos_token')
13 |     elif tokenizer.special_tokens_map.get('cls_token') is not None:
14 |         return tokenizer.special_tokens_map.get('cls_token')
15 |     return 'cls'
16 | 
17 | 
18 | def tok_begin_id(tokenizer):
19 |     return tokenizer.convert_tokens_to_ids(tok_begin(tokenizer))
20 | 
21 | 
22 | def tok_sep(tokenizer):
23 |     if tokenizer.special_tokens_map.get('sep_token') is not None:
24 |         return tokenizer.special_tokens_map.get('sep_token')
25 |     elif tokenizer.special_tokens_map.get('eos_token') is not None:
26 |         return tokenizer.special_tokens_map.get('eos_token')
27 |     return 'sep'
28 | 
29 | 
30 | def tok_sep_id(tokenizer):
31 |     return tokenizer.convert_tokens_to_ids(tok_sep(tokenizer))
32 | 
33 | 
34 | def tok_mask(tokenizer):
35 |     if tokenizer.special_tokens_map.get('mask_token'):
36 |         return tokenizer.special_tokens_map.get('mask_token')
37 |     return 'msk'
38 | 
39 | 
40 | def tok_mask_id(tokenizer):
41 |     return tokenizer.convert_tokens_to_ids(tok_mask(tokenizer))
42 | 
43 | 
44 | def tok_pad(tokenizer):
45 |     if tokenizer.special_tokens_map.get('pad_token'):
46 |         return tokenizer.special_tokens_map.get('pad_token')
47 |     return 'pad'
48 | 
49 | 
50 | def tok_pad_id(tokenizer):
51 |     return tokenizer.convert_tokens_to_ids(tok_pad(tokenizer))
52 | 
53 | 
54 | def get_all_tok_from_config(config):
55 |     tokenizer = AutoTokenizer.from_pretrained(config)
56 |     return list(tokenizer.get_vocab().keys())
57 | 
58 | 
59 | def handle_exceed(tokenizer, seq, maxlen, mode=['noop', 'remove', 'slide', 'start_slice', 'end_slice'],
60 |                   keep_after_sep=True):
61 |     if isinstance(seq, list):
62 |         return seq, [[len(seq)]]
63 |     mode = mode[0] if isinstance(mode, list) else mode
64 |     sep_tok = tok_sep(tokenizer)
65 |     sep_split = seq.split(sep_tok)
66 |     ext_seq = [sep_tok] + tokenizer.tokenize(sep_tok.join(sep_split[1:])) \
67 |         if len(sep_split) > 1 and keep_after_sep else []
68 |     t_seq = tokenizer.tokenize(sep_split[0])
69 |     if mode == 'noop':
70 |         return [t_seq + ext_seq], [[0, len(t_seq + ext_seq)]]
71 |     if mode == 'remove':
72 |         if len(t_seq + ext_seq) <= maxlen:
73 |             return [t_seq + ext_seq], [[0, len(t_seq + ext_seq)]]
74 |         else:
75 |             return [], [[0, 0]]
76 |     if mode == 'slide':
77 |         return nlp2.sliding_windows(t_seq, maxlen - len(ext_seq), append_seq=ext_seq)
78 |     if mode == 'start_slice':
79 |         slices = t_seq[:maxlen - len(ext_seq)]
80 |         slices.extend(ext_seq)
81 |         return [slices], [[0, maxlen - len(ext_seq)]]
82 |     if mode == 'end_slice':
83 |         start_pos = len(t_seq) + len(ext_seq) - maxlen
84 |         slices = t_seq[start_pos:]
85 |         slices.extend(ext_seq)
86 |         return [slices], [[max(0, start_pos), len(t_seq)]]
87 | 
88 | 
89 | def get_topP_unk_token(tokenizer, file_paths: list, topP: float):
90 |     unk_count_dict = OrderedDict()
91 |     for path in file_paths:
92 |         for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
93 |             for tok in nlp2.split_sentence_to_array(input_sent):
94 |                 if tokenizer._unk_token in tokenizer.tokenize(tok):
95 |                     unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
96 |     top_range = int((len(unk_count_dict) + 1) * topP * 100)
97 |     return list(unk_count_dict.keys())[:top_range]
98 | 
99 | 
100 | def get_freqK_unk_token(tokenizer, file_paths: list, freqK: int):
101 |     unk_count_dict = OrderedDict()
102 |     for path in file_paths:
103 |         for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
104 |             for tok in nlp2.split_sentence_to_array(input_sent):
105 |                 if tokenizer._unk_token in tokenizer.tokenize(tok):
106 |                     unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
107 |     return [key for key, value in unk_count_dict.items() if value >= freqK]
--------------------------------------------------------------------------------
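A minimal usage sketch (an annotation, not repository code): truncating an over-long input with `handle_exceed`. The checkpoint name and text are illustrative; `start_slice` returns a single window together with its [start, end] token range.

    from transformers import AutoTokenizer
    from tfkit.utility.tok import handle_exceed

    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')  # illustrative checkpoint
    windows, spans = handle_exceed(tokenizer, 'a very long passage that no longer fits',
                                   maxlen=5, mode='start_slice')
    print(windows, spans)  # one window of at most 5 tokens, plus its [start, end] range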