├── .flake8 ├── .github └── workflows │ └── publish-to-pypi.yml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── VERSION.txt ├── container ├── Dockerfile ├── batch.Dockerfile ├── bert │ ├── download_pretrained_models.py │ ├── nginx.conf │ ├── predictor.py │ ├── serve │ ├── train │ └── wsgi.py ├── bert_batch │ ├── nginx.conf │ ├── predictor.py │ ├── train │ └── wsgi.py ├── build_and_push.sh ├── build_and_push_batch.sh └── pytorch_build_and_push.sh ├── container_lm ├── Dockerfile ├── bert │ ├── download_pretrained_models.py │ ├── nginx.conf │ ├── train │ └── wsgi.py └── build_and_push.sh ├── container_ner ├── Dockerfile ├── bert │ ├── download_pretrained_models.py │ ├── nginx.conf │ ├── predictor.py │ ├── serve │ ├── train │ └── wsgi.py └── build_and_push.sh ├── container_t5 ├── Dockerfile ├── build_and_push.sh ├── requirements.txt └── t5 │ ├── download_pretrained_models.py │ ├── nginx.conf │ ├── predictor.py │ ├── serve │ ├── train │ └── wsgi.py ├── deploy_pip.sh ├── fast_bert ├── __init__.py ├── bert_layers.py ├── data.py ├── data_abs.py ├── data_cls.py ├── data_lm.py ├── data_ner.py ├── data_qa.py ├── learner_abs.py ├── learner_cls copy.py ├── learner_cls.py ├── learner_lm.py ├── learner_ner.py ├── learner_qa.py ├── learner_util.py ├── metrics.py ├── modeling.py ├── onnx_helper.py ├── optimization.py ├── prediction.py ├── prediction_ner.py ├── summarisation │ ├── __init__.py │ ├── configuration_bertabs.py │ └── modeling_bertabs.py ├── utils │ ├── __init__.py │ └── spellcheck.py └── utils_squad_evaluate.py ├── images └── lr_finder.png ├── requirements.txt ├── sample_data ├── imdb_movie_reviews │ ├── data │ │ ├── train_sample.csv │ │ └── val_sample.csv │ └── label │ │ └── labels.csv └── multi_label_toxic_comments │ ├── data │ ├── train_sample.csv │ └── val_sample.csv │ └── label │ └── labels.csv ├── sample_notebooks ├── gpu_util.ipynb ├── new-toxic-multilabel.ipynb ├── new-toxic-predict.ipynb └── toxic_comments_sagemaker.ipynb ├── setup.py ├── tag_release.sh └── test ├── multi_class.ipynb ├── summarisation.ipynb └── tokenizer_vocab └── bert-base-uncased-vocab.txt /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503, F403, F401 3 | max-line-length = 79 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distribution 📦 to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v[0-9].[0-9]+.[0-9]+*" # This ensures the action only runs on version tags 7 | 8 | jobs: 9 | build-n-publish: 10 | name: Build and publish to PyPI 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: '3.10' # Specify the Python version 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | 26 | - name: Build package 27 | run: | 28 | python setup.py sdist bdist_wheel 29 | 30 | - name: Publish package to PyPI 31 | uses: pypa/gh-action-pypi-publish@master 32 | with: 33 | user: __token__ 34 | password: ${{ secrets.PYPI_API_TOKEN }} 35 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # Pycharm project settings 101 | .idea 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | .output 110 | cache/* 111 | cached* 112 | 113 | # OS related 114 | .DS_Store -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "black", 3 | "python.pythonPath": "/home/ubuntu/anaconda3/bin/python", 4 | "python.linting.pylintEnabled": false, 5 | "python.linting.flake8Enabled": true, 6 | "python.linting.enabled": true 7 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /VERSION.txt: -------------------------------------------------------------------------------- 1 | 2.0.26 -------------------------------------------------------------------------------- /container/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | ARG ARCH=gpu 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | ARG py_version=3 5 | 6 | # Validate that arguments are specified 7 | RUN test $py_version || exit 1 8 | 9 | RUN echo $py_version 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | ca-certificates \ 17 | libjpeg-dev \ 18 | nginx \ 19 | jq \ 20 | libsm6 \ 21 | libxext6 \ 22 | libxrender-dev \ 23 | nginx \ 24 | libpng-dev && \ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 28 | chmod +x ~/miniconda.sh && \ 29 | ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh && \ 31 | /opt/conda/bin/conda install -y python=3.7 numpy pyyaml scipy ipython mkl mkl-include ninja cython && \ 32 | /opt/conda/bin/conda install -y -c pytorch magma-cuda100 && \ 33 | /opt/conda/bin/conda clean -ya 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | RUN pip install --upgrade pip 37 | 38 | RUN python --version 39 | RUN pip --version 40 | 41 | # #RUN df -a 42 | 43 | RUN pip install --trusted-host pypi.python.org -v --log /tmp/pip.log torch torchvision 44 | 45 | 46 | # Python won’t try to write .pyc or .pyo files on the import of source modules 47 | # Force stdin, stdout and stderr to be totally unbuffered. Good for logging 48 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 49 | 50 | RUN nvcc --version 51 | RUN which nvcc 52 | 53 | RUN pip --no-cache-dir install \ 54 | flask \ 55 | pathlib \ 56 | gevent \ 57 | gunicorn \ 58 | scipy \ 59 | scikit-learn \ 60 | pandas \ 61 | fastprogress \ 62 | python-box \ 63 | tensorboardX 64 | 65 | 66 | # RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 67 | 68 | # RUN pip --no-cache-dir install fast-bert 69 | RUN pip install fast-bert==1.9.15 70 | 71 | RUN pip install cryptography --upgrade && \ 72 | pip install urllib3 --upgrade 73 | 74 | ENV PATH="/opt/ml/code:${PATH}" 75 | COPY /bert /opt/ml/code 76 | 77 | WORKDIR /opt/ml/code 78 | 79 | RUN cd $WORKDIR 80 | 81 | RUN python download_pretrained_models.py --location_dir ./pretrained_models/ --models bert-base-uncased roberta-base distilbert-base-uncased distilroberta-base 82 | 83 | RUN rm -rf /opt/ml/input/data/training/cache/ 84 | -------------------------------------------------------------------------------- /container/batch.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | ARG ARCH=gpu 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | ARG py_version=3 5 | 6 | # Validate that arguments are specified 7 | RUN test $py_version || exit 1 8 | 9 | RUN echo $py_version 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | ca-certificates \ 17 | libjpeg-dev \ 18 | nginx \ 19 | jq \ 20 | libsm6 \ 21 | libxext6 \ 22 | libxrender-dev \ 23 | nginx \ 24 | libpng-dev && \ 25 | rm -rf 
/var/lib/apt/lists/* 26 | 27 | RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 28 | chmod +x ~/miniconda.sh && \ 29 | ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh && \ 31 | /opt/conda/bin/conda install -y python=3.7 numpy pyyaml scipy ipython mkl mkl-include ninja cython && \ 32 | /opt/conda/bin/conda install -y -c pytorch magma-cuda100 && \ 33 | /opt/conda/bin/conda clean -ya 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | RUN pip install --upgrade pip 37 | 38 | RUN python --version 39 | RUN pip --version 40 | 41 | # #RUN df -a 42 | 43 | RUN pip install --trusted-host pypi.python.org -v --log /tmp/pip.log torch torchvision 44 | 45 | 46 | # Python won’t try to write .pyc or .pyo files on the import of source modules 47 | # Force stdin, stdout and stderr to be totally unbuffered. Good for logging 48 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 49 | 50 | RUN nvcc --version 51 | RUN which nvcc 52 | 53 | RUN pip --no-cache-dir install \ 54 | flask \ 55 | pathlib \ 56 | gevent \ 57 | gunicorn \ 58 | scipy \ 59 | scikit-learn \ 60 | pandas \ 61 | fastprogress \ 62 | python-box \ 63 | tensorboardX 64 | 65 | 66 | # RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 67 | 68 | # RUN pip --no-cache-dir install fast-bert 69 | RUN pip install fast-bert==1.9.15 70 | 71 | RUN pip install cryptography --upgrade && \ 72 | pip install urllib3 --upgrade 73 | 74 | ENV PATH="/opt/ml/code:${PATH}" 75 | COPY /bert_batch /opt/ml/code 76 | 77 | WORKDIR /opt/ml/code 78 | 79 | RUN cd $WORKDIR -------------------------------------------------------------------------------- /container/bert/download_pretrained_models.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from tqdm import tqdm 4 | import requests 5 | import urllib3 6 | from transformers import AutoModel, AutoTokenizer 7 | 8 | 9 | def download_pretrained_files(model_name, location): 10 | try: 11 | model_path = model_name.replace("/", ":") 12 | model = AutoModel.from_pretrained(model_name) 13 | model.save_pretrained(location / model_path) 14 | tokenizer = AutoTokenizer.from_pretrained(model_name) 15 | tokenizer.save_pretrained(location / model_path) 16 | except Exception as e: 17 | print(e) 18 | print("error downloading model {}".format(model_name)) 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | 24 | parser.add_argument( 25 | "--location_dir", 26 | default=None, 27 | type=str, 28 | required=True, 29 | help="The location where pretrained model needs to be stored", 30 | ) 31 | 32 | parser.add_argument( 33 | "--models", 34 | default=None, 35 | type=str, 36 | required=True, 37 | nargs="*", 38 | help="download the pretrained models", 39 | ) 40 | 41 | args = parser.parse_args() 42 | print(args) 43 | Path(args.location_dir).mkdir(exist_ok=True) 44 | 45 | # [download_pretrained_files(k, location=Path(args.location_dir)) 46 | # for k, v in BERT_PRETRAINED_MODEL_ARCHIVE_MAP.items()] 47 | [ 48 | download_pretrained_files(item, location=Path(args.location_dir)) 49 | for item in args.models 50 | ] 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /container/bert/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 4; 2 | daemon off; 
# Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | 27 | location ~ ^/(ping|execution-parameters|invocations) { 28 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 29 | proxy_set_header Host $http_host; 30 | proxy_redirect off; 31 | proxy_pass http://gunicorn; 32 | } 33 | 34 | location / { 35 | return 404 "{}"; 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /container/bert/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import json 4 | import pickle 5 | import sys 6 | import signal 7 | import traceback 8 | import re 9 | import flask 10 | import pandas as pd 11 | import torch 12 | from collections import OrderedDict 13 | 14 | from fast_bert.prediction import BertClassificationPredictor 15 | 16 | from fast_bert.utils.spellcheck import BingSpellCheck 17 | from pathlib import Path 18 | 19 | import warnings 20 | 21 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 22 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 23 | 24 | prefix = "/opt/ml/" 25 | 26 | # PATH = Path(os.path.join(prefix, "model")) 27 | PATH = os.path.join(prefix, "model") 28 | 29 | MODEL_PATH = os.path.join(PATH, "pytorch_model.bin") 30 | 31 | # request_text = None 32 | 33 | 34 | class ScoringService(object): 35 | model = None # Where we keep the model when it's loaded 36 | 37 | @classmethod 38 | def get_predictor_model(cls): 39 | 40 | # print(cls.searching_all_files(PATH)) 41 | # Get model predictor 42 | if cls.model is None: 43 | with open(os.path.join(PATH, "model_config.json")) as f: 44 | model_config = json.load(f) 45 | 46 | predictor = BertClassificationPredictor( 47 | os.path.join(PATH, "model_out"), 48 | label_path=PATH, 49 | multi_label=bool(model_config["multi_label"]), 50 | model_type=model_config["model_type"], 51 | do_lower_case=bool(model_config["do_lower_case"]), 52 | ) 53 | cls.model = predictor 54 | 55 | return cls.model 56 | 57 | @classmethod 58 | def predict(cls, text, bing_key=None): 59 | """For the input, do the predictions and return them. 60 | Args: 61 | input (a pandas dataframe): The data on which to do the predictions. There will be 62 | one prediction per row in the dataframe""" 63 | predictor_model = cls.get_predictor_model() 64 | if bing_key: 65 | spellChecker = BingSpellCheck(bing_key) 66 | text = spellChecker.spell_check(text) 67 | prediction = predictor_model.predict(text) 68 | 69 | return prediction 70 | 71 | @classmethod 72 | def predict_batch(cls, texts): 73 | """For the input, do the predictions and return them. 74 | Args: 75 | input (a pandas dataframe): The data on which to do the predictions. 
There will be 76 | one prediction per row in the dataframe""" 77 | predictor_model = cls.get_predictor_model() 78 | output_labels_count = int( 79 | os.environ.get( 80 | "OUTPUT_LABELS_COUNT", len(predictor_model.learner.data.labels) 81 | ) 82 | ) 83 | 84 | print("output_labels_count", output_labels_count) 85 | 86 | predictions = predictor_model.predict_batch(texts) 87 | return cls.process_batch_results( 88 | texts, predictions, labels_count=output_labels_count 89 | ) 90 | 91 | @classmethod 92 | def searching_all_files(cls, directory: Path): 93 | file_list = [] # A list for storing files existing in directories 94 | 95 | for x in directory.iterdir(): 96 | if x.is_file(): 97 | file_list.append(str(x)) 98 | else: 99 | file_list.append(cls.searching_all_files(x)) 100 | 101 | return file_list 102 | 103 | @classmethod 104 | def process_batch_results(cls, texts, results, labels_count=None): 105 | processed_results = [] 106 | for i, result in enumerate(results): 107 | processed = OrderedDict() 108 | processed["text"] = texts[i] 109 | result = result[:labels_count] if labels_count else result 110 | for index, label in enumerate(result): 111 | processed["label_{}".format(index + 1)] = label[0] 112 | processed["confidence_{}".format(index + 1)] = label[1] 113 | processed_results.append(processed) 114 | 115 | return processed_results 116 | 117 | 118 | # The flask app for serving predictions 119 | app = flask.Flask(__name__) 120 | 121 | 122 | @app.route("/ping", methods=["GET"]) 123 | def ping(): 124 | """Determine if the container is working and healthy. In this sample container, we declare 125 | it healthy if we can load the model successfully.""" 126 | health = ( 127 | ScoringService.get_predictor_model() is not None 128 | ) # You can insert a health check here 129 | 130 | status = 200 if health else 404 131 | return flask.Response(response="\n", status=status, mimetype="application/json") 132 | 133 | 134 | @app.route("/execution-parameters", methods=["GET"]) 135 | def get_execution_parameters(): 136 | params = { 137 | "MaxConcurrentTransforms": 3, 138 | "BatchStrategy": "MULTI_RECORD", 139 | "MaxPayloadInMB": 6, 140 | } 141 | return flask.Response( 142 | response=json.dumps(params), status="200", mimetype="application/json" 143 | ) 144 | 145 | 146 | @app.route("/invocations", methods=["POST"]) 147 | def transformation(): 148 | """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert 149 | it to a pandas data frame for internal use and then convert the predictions back to CSV (which really 150 | just means one prediction per line, since there's a single column. 
151 | """ 152 | data = None 153 | text = None 154 | 155 | if flask.request.content_type == "application/json": 156 | print("calling json launched") 157 | data = flask.request.get_json(silent=True) 158 | 159 | text = data["text"] 160 | try: 161 | bing_key = data["bing_key"] 162 | except Exception: 163 | bing_key = None 164 | 165 | # Do the prediction 166 | predictions = ScoringService.predict(text, bing_key) 167 | result = json.dumps(predictions[:10]) 168 | return flask.Response(response=result, status=200, mimetype="application/json") 169 | 170 | elif flask.request.content_type == "text/csv": 171 | data = flask.request.data.decode("utf-8") 172 | df = pd.read_csv(io.StringIO(data), header="infer") 173 | predictions = ScoringService.predict_batch(list(df["text"].values)) 174 | 175 | out = io.StringIO() 176 | pd.DataFrame(predictions).to_csv(out, index=False) 177 | result = out.getvalue() 178 | return flask.Response(response=result, status=200, mimetype="text/csv") 179 | 180 | elif flask.request.content_type == "text/plain": 181 | data = flask.request.data.decode("utf-8") 182 | s = io.StringIO(data) 183 | # convert txt file into list of texts 184 | texts = [] 185 | for line in s: 186 | texts.append(line) 187 | predictions = ScoringService.predict_batch(texts) 188 | out = io.StringIO() 189 | pd.DataFrame(predictions).to_csv(out, index=False) 190 | result = out.getvalue() 191 | return flask.Response(response=result, status=200, mimetype="text/csv") 192 | 193 | else: 194 | return flask.Response( 195 | response="This predictor only supports JSON, txt or CSV data", 196 | status=415, 197 | mimetype="text/plain", 198 | ) 199 | -------------------------------------------------------------------------------- /container/bert/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 
6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | from __future__ import print_function 17 | import multiprocessing 18 | import os 19 | import signal 20 | import subprocess 21 | import sys 22 | 23 | cpu_count = multiprocessing.cpu_count() 24 | 25 | model_server_timeout = os.environ.get("MODEL_SERVER_TIMEOUT", 18000) 26 | model_server_workers = int(os.environ.get("MODEL_SERVER_WORKERS", 4)) 27 | 28 | 29 | def sigterm_handler(nginx_pid, gunicorn_pid): 30 | try: 31 | os.kill(nginx_pid, signal.SIGQUIT) 32 | except OSError: 33 | pass 34 | try: 35 | os.kill(gunicorn_pid, signal.SIGTERM) 36 | except OSError: 37 | pass 38 | 39 | sys.exit(0) 40 | 41 | 42 | def start_server(): 43 | print("Starting the inference server with {} workers.".format(model_server_workers)) 44 | 45 | # link the log streams to stdout/err so they will be logged to the container logs 46 | subprocess.check_call(["ln", "-sf", "/dev/stdout", "/var/log/nginx/access.log"]) 47 | subprocess.check_call(["ln", "-sf", "/dev/stderr", "/var/log/nginx/error.log"]) 48 | 49 | nginx = subprocess.Popen(["nginx", "-c", "/opt/ml/code/nginx.conf"]) 50 | gunicorn = subprocess.Popen( 51 | [ 52 | "gunicorn", 53 | "--timeout", 54 | str(model_server_timeout), 55 | "-k", 56 | "gevent", 57 | "-b", 58 | "unix:/tmp/gunicorn.sock", 59 | "-w", 60 | str(model_server_workers), 61 | "wsgi:app", 62 | ] 63 | ) 64 | 65 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 66 | 67 | # If either subprocess exits, so do we. 68 | pids = set([nginx.pid, gunicorn.pid]) 69 | while True: 70 | pid, _ = os.wait() 71 | if pid in pids: 72 | break 73 | 74 | sigterm_handler(nginx.pid, gunicorn.pid) 75 | print("Inference server exiting") 76 | 77 | 78 | # The main routine just invokes the start function. 
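#
# A minimal usage sketch (assumption: the values shown are illustrative, not defaults
# shipped with any particular image): the two environment variables read near the top
# of this file can be set in the container environment before this script runs, e.g.
#
#   MODEL_SERVER_WORKERS=2 MODEL_SERVER_TIMEOUT=300 ./serve
#
# Left unset, the code above falls back to 4 gunicorn workers and an 18000-second
# request timeout.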
79 | 80 | if __name__ == "__main__": 81 | start_server() 82 | -------------------------------------------------------------------------------- /container/bert/train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import json 5 | import pickle 6 | import sys 7 | import traceback 8 | import pandas as pd 9 | import numpy as np 10 | import random 11 | import datetime 12 | from pathlib import Path 13 | import logging 14 | import torch 15 | import shutil 16 | from transformers import AutoTokenizer 17 | 18 | from fast_bert.data_cls import BertDataBunch 19 | from fast_bert.learner_cls import BertLearner 20 | from fast_bert.metrics import ( 21 | accuracy, 22 | accuracy_multilabel, 23 | accuracy_thresh, 24 | fbeta, 25 | roc_auc, 26 | ) 27 | 28 | run_start_time = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | channel_name = "training" 31 | 32 | prefix = "/opt/ml/" 33 | input_path = prefix + "input/data" # opt/ml/input/data 34 | code_path = prefix + "code" # opt/ml/code 35 | pretrained_model_path = ( 36 | code_path + "/pretrained_models" 37 | ) # opt/ml/code/pretrained_models 38 | 39 | finetuned_path = input_path + "/{}/finetuned".format( 40 | channel_name 41 | ) # opt/ml/input/data/training/finetuned 42 | 43 | output_path = os.path.join(prefix, "output") # opt/ml/output 44 | model_path = os.path.join(prefix, "model") # opt/ml/model 45 | 46 | training_config_path = os.path.join( 47 | input_path, "{}/config".format(channel_name) 48 | ) # opt/ml/input/data/training/config 49 | 50 | hyperparam_path = os.path.join( 51 | prefix, "input/config/hyperparameters.json" 52 | ) # opt/ml/input/config/hyperparameters.json 53 | config_path = os.path.join( 54 | training_config_path, "training_config.json" 55 | ) # opt/ml/input/data/training/config/training_config.json 56 | 57 | 58 | # This algorithm has a single channel of input data called 'training'. Since we run in 59 | # File mode, the input files are copied to the directory specified here. 60 | 61 | training_path = os.path.join(input_path, channel_name) # opt/ml/input/data/training 62 | 63 | 64 | def searching_all_files(directory: Path): 65 | file_list = [] # A list for storing files existing in directories 66 | 67 | for x in directory.iterdir(): 68 | if x.is_file(): 69 | file_list.append(str(x)) 70 | else: 71 | file_list.append(searching_all_files(x)) 72 | 73 | return file_list 74 | 75 | 76 | # The function to execute the training. 
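# A sketch of the two JSON files that train() below expects (assumption: the values
# are placeholders for illustration only; the keys mirror the lookups made inside
# train()):
#
#   opt/ml/input/data/training/config/training_config.json
#     {"model_name": "bert-base-uncased", "model_type": "bert",
#      "multi_label": "False", "fp16": "False", "fp16_opt_level": "O1",
#      "text_col": "text", "label_col": "label",
#      "train_file": "train.csv", "val_file": "val.csv", "label_file": "labels.csv",
#      "grad_accumulation_steps": "1", "logging_steps": "50"}
#
#   opt/ml/input/config/hyperparameters.json
#     {"epochs": "4", "lr": "5e-5", "train_batch_size": "16",
#      "max_seq_length": "256", "warmup_steps": "500",
#      "lr_schedule": "warmup_cosine", "optimizer_type": "adamw"}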
77 | def train(): 78 | 79 | print("Starting the training.") 80 | 81 | DATA_PATH = Path(training_path) 82 | LABEL_PATH = Path(training_path) 83 | 84 | try: 85 | print(config_path) 86 | with open(config_path, "r") as f: 87 | training_config = json.load(f) 88 | print(training_config) 89 | 90 | with open(hyperparam_path, "r") as tc: 91 | hyperparameters = json.load(tc) 92 | print(hyperparameters) 93 | 94 | # convert string bools to booleans 95 | training_config["multi_label"] = training_config["multi_label"] == "True" 96 | training_config["fp16"] = training_config["fp16"] == "True" 97 | training_config["text_col"] = training_config.get("text_col", "text") 98 | training_config["label_col"] = training_config.get("label_col", "label") 99 | training_config["train_file"] = training_config.get("train_file", "train.csv") 100 | training_config["val_file"] = training_config.get("val_file", "val.csv") 101 | training_config["label_file"] = training_config.get("label_file", "labels.csv") 102 | training_config["random_state"] = training_config.get("random_state", None) 103 | 104 | if training_config["random_state"] is not None: 105 | print("setting random state {}".format(training_config["random_state"])) 106 | random_seed(int(training_config["random_state"])) 107 | 108 | # Logger 109 | # logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, training_config["run_text"])) 110 | logging.basicConfig( 111 | level=logging.INFO, 112 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 113 | datefmt="%m/%d/%Y %H:%M:%S", 114 | handlers=[ 115 | # logging.FileHandler(logfile), 116 | logging.StreamHandler(sys.stdout) 117 | ], 118 | ) 119 | 120 | logger = logging.getLogger() 121 | 122 | # Define pretrained model path 123 | PRETRAINED_PATH = Path(pretrained_model_path) / training_config[ 124 | "model_name" 125 | ].replace("/", ":") 126 | if PRETRAINED_PATH.is_dir(): 127 | logger.info("model path used {}".format(PRETRAINED_PATH)) 128 | model_name_path = str(PRETRAINED_PATH) 129 | else: 130 | model_name_path = training_config["model_name"] 131 | logger.info( 132 | "model {} is not preloaded. Will try to download.".format( 133 | model_name_path 134 | ) 135 | ) 136 | 137 | finetuned_model_name = training_config.get("finetuned_model", None) 138 | if finetuned_model_name is not None: 139 | finetuned_model = os.path.join(finetuned_path, finetuned_model_name) 140 | logger.info("finetuned model loaded from {}".format(finetuned_model)) 141 | else: 142 | logger.info( 143 | "finetuned model not available - loading standard pretrained model" 144 | ) 145 | finetuned_model = None 146 | 147 | # use auto-tokenizer 148 | tokenizer = AutoTokenizer.from_pretrained(model_name_path, use_fast=True) 149 | 150 | device = torch.device("cuda") 151 | if torch.cuda.device_count() > 1: 152 | multi_gpu = True 153 | else: 154 | multi_gpu = False 155 | 156 | logger.info("Number of GPUs: {}".format(torch.cuda.device_count())) 157 | 158 | if training_config["multi_label"] is True: 159 | label_col = json.loads(training_config["label_col"]) 160 | else: 161 | label_col = training_config["label_col"] 162 | 163 | logger.info("label columns: {}".format(label_col)) 164 | test_data = None 165 | test_df = None 166 | if training_config.get("test_file", None): 167 | try: 168 | test_df = pd.read_csv(DATA_PATH / training_config["test_file"]) 169 | except Exception: 170 | test_df = pd.read_csv( 171 | DATA_PATH / training_config["test_file"], encoding="latin1" 172 | ) 173 | test_data = list(test_df["text"]) 174 | logger.info("Test file available. 
Test count {}".format(len(test_df))) 175 | 176 | # Create databunch 177 | databunch = BertDataBunch( 178 | DATA_PATH, 179 | LABEL_PATH, 180 | tokenizer, 181 | train_file=training_config["train_file"], 182 | val_file=training_config["val_file"], 183 | label_file=training_config["label_file"], 184 | text_col=training_config["text_col"], 185 | test_data=test_data, 186 | label_col=label_col, 187 | batch_size_per_gpu=int(hyperparameters["train_batch_size"]), 188 | max_seq_length=int(hyperparameters["max_seq_length"]), 189 | multi_gpu=multi_gpu, 190 | multi_label=training_config["multi_label"], 191 | model_type=training_config["model_type"], 192 | logger=logger, 193 | no_cache=True, 194 | ) 195 | 196 | metrics = [] 197 | if training_config["multi_label"] is False: 198 | metrics.append({"name": "accuracy", "function": accuracy}) 199 | else: 200 | metrics.append({"name": "accuracy_thresh", "function": accuracy_thresh}) 201 | metrics.append({"name": "roc_auc", "function": roc_auc}) 202 | metrics.append({"name": "fbeta", "function": fbeta}) 203 | 204 | logger.info("databunch labels: {}".format(len(databunch.labels))) 205 | 206 | # Initialise the learner 207 | learner = BertLearner.from_pretrained_model( 208 | databunch, 209 | model_name_path, 210 | metrics=metrics, 211 | device=device, 212 | logger=logger, 213 | output_dir=Path(model_path), 214 | finetuned_wgts_path=finetuned_model, 215 | is_fp16=training_config["fp16"], 216 | fp16_opt_level=training_config["fp16_opt_level"], 217 | warmup_steps=int(hyperparameters["warmup_steps"]), 218 | grad_accumulation_steps=int(training_config["grad_accumulation_steps"]), 219 | multi_gpu=multi_gpu, 220 | multi_label=training_config["multi_label"], 221 | logging_steps=int(training_config["logging_steps"]), 222 | ) 223 | 224 | learner.fit( 225 | int(hyperparameters["epochs"]), 226 | float(hyperparameters["lr"]), 227 | schedule_type=hyperparameters["lr_schedule"], 228 | optimizer_type=hyperparameters["optimizer_type"], 229 | ) 230 | 231 | results = learner.validate(return_preds=True) 232 | logger.info("y_pred: {}".format(json.dumps(results["y_preds"].tolist()))) 233 | logger.info("y_true: {}".format(json.dumps(results["y_true"].tolist()))) 234 | logger.info("labels: {}".format(json.dumps(databunch.labels))) 235 | 236 | if test_data is not None: 237 | predictions = learner.predict_batch() 238 | results = [] 239 | for index, row in test_df.iterrows(): 240 | preds = predictions[index][:3] 241 | result = {"text": row.get("text"), "ground_truth": row.get("label")} 242 | for i, pred in enumerate(preds): 243 | result["label_{}".format(i + 1)] = pred[0] 244 | result["confidence_{}".format(i + 1)] = pred[1] 245 | 246 | results.append(result) 247 | 248 | # save test results with model outcome 249 | pd.DataFrame(results).to_csv( 250 | os.path.join(model_path, "test_result.csv"), index=None 251 | ) 252 | 253 | pd.DataFrame(results).to_csv( 254 | os.path.join(output_path, "test_result.csv"), index=None 255 | ) 256 | 257 | # save model and tokenizer artefacts 258 | learner.save_model() 259 | 260 | # save model config file 261 | with open(os.path.join(model_path, "model_config.json"), "w") as f: 262 | json.dump(training_config, f) 263 | 264 | # save label file 265 | with open(os.path.join(model_path, "labels.csv"), "w") as f: 266 | f.write("\n".join(databunch.labels)) 267 | 268 | # save label_metadata csv file from LABEL_PATH to model_path 269 | shutil.copyfile( 270 | os.path.join(LABEL_PATH, "labels_metadata.json"), 271 | os.path.join(model_path, "labels_metadata.json"), 272 | ) 
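        # Note: the copy above assumes labels_metadata.json exists alongside the label
        # file in LABEL_PATH; if it is absent, the exception handler below records the
        # failure and the job exits with a non-zero code.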
273 | 274 | except Exception as e: 275 | # Write out an error file. This will be returned as the failureReason in the 276 | # DescribeTrainingJob result. 277 | trc = traceback.format_exc() 278 | with open(os.path.join(output_path, "failure"), "w") as s: 279 | s.write("Exception during training: " + str(e) + "\n" + trc) 280 | # Printing this causes the exception to be in the training job logs, as well. 281 | print("Exception during training: " + str(e) + "\n" + trc, file=sys.stderr) 282 | # A non-zero exit code causes the training job to be marked as Failed. 283 | sys.exit(255) 284 | 285 | 286 | def random_seed(seed_value): 287 | random.seed(seed_value) # Python 288 | np.random.seed(seed_value) # cpu vars 289 | 290 | torch.manual_seed(seed_value) # cpu vars 291 | 292 | if torch.cuda.is_available(): 293 | torch.cuda.manual_seed(seed_value) 294 | torch.cuda.manual_seed_all(seed_value) # gpu vars 295 | torch.backends.cudnn.deterministic = True # needed 296 | torch.backends.cudnn.benchmark = False 297 | 298 | 299 | if __name__ == "__main__": 300 | train() 301 | 302 | # A zero exit code causes the job to be marked a Succeeded. 303 | sys.exit(0) 304 | -------------------------------------------------------------------------------- /container/bert/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 6 | 7 | app = myapp.app -------------------------------------------------------------------------------- /container/bert_batch/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 4; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | 27 | location ~ ^/(ping|execution-parameters|invocations) { 28 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 29 | proxy_set_header Host $http_host; 30 | proxy_redirect off; 31 | proxy_pass http://gunicorn; 32 | } 33 | 34 | location / { 35 | return 404 "{}"; 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /container/bert_batch/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import json 4 | import pickle 5 | import sys 6 | import signal 7 | import traceback 8 | import re 9 | import flask 10 | import pandas as pd 11 | import torch 12 | from collections import OrderedDict 13 | 14 | from fast_bert.prediction import BertClassificationPredictor 15 | 16 | from fast_bert.utils.spellcheck import BingSpellCheck 17 | from pathlib import Path 18 | 19 | import warnings 20 | 21 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 22 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 23 | 24 | prefix = "/opt/ml/" 25 | 26 | # PATH = Path(os.path.join(prefix, "model")) 27 | PATH = os.path.join(prefix, "model") 28 | 29 | MODEL_PATH = os.path.join(PATH, "pytorch_model.bin") 30 | 31 | # 
request_text = None 32 | 33 | 34 | class ScoringService(object): 35 | model = None # Where we keep the model when it's loaded 36 | 37 | @classmethod 38 | def get_predictor_model(cls): 39 | 40 | # print(cls.searching_all_files(PATH)) 41 | # Get model predictor 42 | if cls.model is None: 43 | with open(os.path.join(PATH, "model_config.json")) as f: 44 | model_config = json.load(f) 45 | 46 | predictor = BertClassificationPredictor( 47 | os.path.join(PATH, "model_out"), 48 | label_path=PATH, 49 | multi_label=bool(model_config["multi_label"]), 50 | model_type=model_config["model_type"], 51 | do_lower_case=bool(model_config["do_lower_case"]), 52 | ) 53 | cls.model = predictor 54 | 55 | return cls.model 56 | 57 | @classmethod 58 | def predict(cls, text, bing_key=None): 59 | """For the input, do the predictions and return them. 60 | Args: 61 | input (a pandas dataframe): The data on which to do the predictions. There will be 62 | one prediction per row in the dataframe""" 63 | predictor_model = cls.get_predictor_model() 64 | if bing_key: 65 | spellChecker = BingSpellCheck(bing_key) 66 | text = spellChecker.spell_check(text) 67 | prediction = predictor_model.predict(text) 68 | 69 | return prediction 70 | 71 | @classmethod 72 | def predict_batch(cls, texts): 73 | """For the input, do the predictions and return them. 74 | Args: 75 | input (a pandas dataframe): The data on which to do the predictions. There will be 76 | one prediction per row in the dataframe""" 77 | predictor_model = cls.get_predictor_model() 78 | output_labels_count = int( 79 | os.environ.get( 80 | "OUTPUT_LABELS_COUNT", len(predictor_model.learner.data.labels) 81 | ) 82 | ) 83 | 84 | print("output_labels_count", output_labels_count) 85 | 86 | predictions = predictor_model.predict_batch(texts) 87 | return cls.process_batch_results( 88 | texts, predictions, labels_count=output_labels_count 89 | ) 90 | 91 | @classmethod 92 | def searching_all_files(cls, directory: Path): 93 | file_list = [] # A list for storing files existing in directories 94 | 95 | for x in directory.iterdir(): 96 | if x.is_file(): 97 | file_list.append(str(x)) 98 | else: 99 | file_list.append(cls.searching_all_files(x)) 100 | 101 | return file_list 102 | 103 | @classmethod 104 | def process_batch_results(cls, texts, results, labels_count=None): 105 | processed_results = [] 106 | for i, result in enumerate(results): 107 | processed = OrderedDict() 108 | processed["text"] = texts[i] 109 | result = result[:labels_count] if labels_count else result 110 | for index, label in enumerate(result): 111 | processed["label_{}".format(index + 1)] = label[0] 112 | processed["confidence_{}".format(index + 1)] = label[1] 113 | processed_results.append(processed) 114 | 115 | return processed_results 116 | 117 | 118 | # The flask app for serving predictions 119 | app = flask.Flask(__name__) 120 | 121 | 122 | @app.route("/ping", methods=["GET"]) 123 | def ping(): 124 | """Determine if the container is working and healthy. 
In this sample container, we declare 125 | it healthy if we can load the model successfully.""" 126 | health = ( 127 | ScoringService.get_predictor_model() is not None 128 | ) # You can insert a health check here 129 | 130 | status = 200 if health else 404 131 | return flask.Response(response="\n", status=status, mimetype="application/json") 132 | 133 | 134 | @app.route("/execution-parameters", methods=["GET"]) 135 | def get_execution_parameters(): 136 | params = { 137 | "MaxConcurrentTransforms": 3, 138 | "BatchStrategy": "MULTI_RECORD", 139 | "MaxPayloadInMB": 6, 140 | } 141 | return flask.Response( 142 | response=json.dumps(params), status="200", mimetype="application/json" 143 | ) 144 | 145 | 146 | @app.route("/invocations", methods=["POST"]) 147 | def transformation(): 148 | """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert 149 | it to a pandas data frame for internal use and then convert the predictions back to CSV (which really 150 | just means one prediction per line, since there's a single column. 151 | """ 152 | data = None 153 | text = None 154 | 155 | if flask.request.content_type == "application/json": 156 | print("calling json launched") 157 | data = flask.request.get_json(silent=True) 158 | 159 | text = data["text"] 160 | try: 161 | bing_key = data["bing_key"] 162 | except Exception: 163 | bing_key = None 164 | 165 | # Do the prediction 166 | predictions = ScoringService.predict(text, bing_key) 167 | result = json.dumps(predictions[:10]) 168 | return flask.Response(response=result, status=200, mimetype="application/json") 169 | 170 | elif flask.request.content_type == "text/csv": 171 | data = flask.request.data.decode("utf-8") 172 | df = pd.read_csv(io.StringIO(data), header="infer") 173 | predictions = ScoringService.predict_batch(list(df["text"].values)) 174 | 175 | out = io.StringIO() 176 | pd.DataFrame(predictions).to_csv(out, index=False) 177 | result = out.getvalue() 178 | return flask.Response(response=result, status=200, mimetype="text/csv") 179 | 180 | elif flask.request.content_type == "text/plain": 181 | data = flask.request.data.decode("utf-8") 182 | s = io.StringIO(data) 183 | # convert txt file into list of texts 184 | texts = [] 185 | for line in s: 186 | texts.append(line) 187 | predictions = ScoringService.predict_batch(texts) 188 | out = io.StringIO() 189 | pd.DataFrame(predictions).to_csv(out, index=False) 190 | result = out.getvalue() 191 | return flask.Response(response=result, status=200, mimetype="text/csv") 192 | 193 | else: 194 | return flask.Response( 195 | response="This predictor only supports JSON, txt or CSV data", 196 | status=415, 197 | mimetype="text/plain", 198 | ) 199 | -------------------------------------------------------------------------------- /container/bert_batch/train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import json 5 | import logging 6 | import sys 7 | import traceback 8 | import pandas as pd 9 | import numpy as np 10 | import random 11 | import datetime 12 | from pathlib import Path 13 | import torch 14 | import shutil 15 | import tarfile 16 | from collections import OrderedDict 17 | from transformers import AutoTokenizer 18 | 19 | from fast_bert.data_cls import BertDataBunch 20 | from fast_bert.learner_cls import BertLearner 21 | from fast_bert.metrics import ( 22 | accuracy, 23 | accuracy_multilabel, 24 | accuracy_thresh, 25 | fbeta, 26 | roc_auc, 27 | ) 28 | 29 | run_start_time = 
datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S") 30 | # Logger 31 | # logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, training_config["run_text"])) 32 | logging.basicConfig( 33 | level=logging.INFO, 34 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 35 | datefmt="%m/%d/%Y %H:%M:%S", 36 | handlers=[ 37 | # logging.FileHandler(logfile), 38 | logging.StreamHandler(sys.stdout) 39 | ], 40 | ) 41 | 42 | logger = logging.getLogger() 43 | 44 | channel_name = "training" 45 | 46 | prefix = "/opt/ml/" 47 | input_path = prefix + "input/data" # opt/ml/input/data 48 | output_path = os.path.join(prefix, "model") 49 | 50 | finetuned_path = input_path + "/{}/finetuned".format( 51 | channel_name 52 | ) # opt/ml/input/data/training/finetuned 53 | 54 | training_config_path = os.path.join( 55 | input_path, "{}/config".format(channel_name) 56 | ) # opt/ml/input/data/training/config 57 | 58 | hyperparam_path = os.path.join( 59 | prefix, "input/config/hyperparameters.json" 60 | ) # opt/ml/input/config/hyperparameters.json 61 | config_path = os.path.join( 62 | training_config_path, "training_config.json" 63 | ) # opt/ml/input/data/training/config/training_config.json 64 | 65 | 66 | # This algorithm has a single channel of input data called 'training'. Since we run in 67 | # File mode, the input files are copied to the directory specified here. 68 | 69 | training_path = os.path.join(input_path, channel_name) # opt/ml/input/data/training 70 | 71 | 72 | # The function to execute the training. 73 | def train(): 74 | logger.info("Starting batch inference...") 75 | 76 | DATA_PATH = Path(training_path) 77 | MODEL_PATH = DATA_PATH / "model" 78 | ARTIFACTS_PATH = MODEL_PATH / "model_out" 79 | 80 | # untar model.tar.gz to model directory 81 | with tarfile.open(DATA_PATH / "model.tar.gz", "r:gz") as tar: 82 | tar.extractall(MODEL_PATH) 83 | tar.close() 84 | 85 | try: 86 | with open(config_path, "r") as f: 87 | training_config = json.load(f) 88 | logger.info(training_config) 89 | 90 | with open(hyperparam_path, "r") as tc: 91 | hyperparameters = json.load(tc) 92 | logger.info(hyperparameters) 93 | 94 | # convert string bools to booleans 95 | training_config["multi_label"] = training_config["multi_label"] == "True" 96 | training_config["fp16"] = training_config["fp16"] == "True" 97 | training_config["text_col"] = training_config.get("text_col", "text") 98 | training_config["label_col"] = training_config.get("label_col", "label") 99 | training_config["train_file"] = training_config.get("train_file", "train.csv") 100 | training_config["val_file"] = training_config.get("val_file", "val.csv") 101 | training_config["label_file"] = training_config.get("label_file", "labels.csv") 102 | training_config["random_state"] = training_config.get("random_state", None) 103 | training_config["labels_count"] = int(training_config.get("labels_count", 10)) 104 | if training_config["random_state"] is not None: 105 | print("setting random state {}".format(training_config["random_state"])) 106 | random_seed(int(training_config["random_state"])) 107 | 108 | # use auto-tokenizer 109 | tokenizer = AutoTokenizer.from_pretrained(str(ARTIFACTS_PATH), use_fast=True) 110 | 111 | device = torch.device("cuda") 112 | if torch.cuda.device_count() > 1: 113 | multi_gpu = True 114 | else: 115 | multi_gpu = False 116 | 117 | logger.info("Number of GPUs: {}".format(torch.cuda.device_count())) 118 | 119 | # Create databunch 120 | databunch = BertDataBunch( 121 | MODEL_PATH, 122 | MODEL_PATH, 123 | tokenizer, 124 | 
train_file=None, 125 | val_file=None, 126 | batch_size_per_gpu=int(hyperparameters["train_batch_size"]), 127 | max_seq_length=int(hyperparameters["max_seq_length"]), 128 | multi_gpu=multi_gpu, 129 | multi_label=training_config["multi_label"], 130 | model_type=training_config["model_type"], 131 | logger=logger, 132 | no_cache=True, 133 | ) 134 | 135 | # Initialise the learner 136 | learner = BertLearner.from_pretrained_model( 137 | databunch, 138 | str(ARTIFACTS_PATH), 139 | metrics=[], 140 | device=device, 141 | logger=logger, 142 | output_dir=None, 143 | is_fp16=False, 144 | multi_gpu=multi_gpu, 145 | multi_label=training_config["multi_label"], 146 | logging_steps=0, 147 | ) 148 | 149 | df = pd.read_csv(str(DATA_PATH / "data.csv"), header=None) 150 | df = df.iloc[:, 0:1] 151 | # if first row is header, remove it 152 | if df.iloc[0, 0] == "text": 153 | df = df.iloc[1:] 154 | df.columns = ["text"] 155 | df.dropna(subset=["text"], inplace=True) 156 | 157 | texts = list(df["text"].values) 158 | 159 | predictions = learner.predict_batch(texts) 160 | 161 | processed_predictions = process_batch_results( 162 | texts, results=predictions, labels_count=training_config["labels_count"] 163 | ) 164 | 165 | # save test results with model outcome 166 | pd.DataFrame(processed_predictions).to_csv( 167 | os.path.join(output_path, "out.csv"), index=None 168 | ) 169 | 170 | except Exception as e: 171 | # Write out an error file. This will be returned as the failureReason in the 172 | # DescribeTrainingJob result. 173 | trc = traceback.format_exc() 174 | with open(os.path.join(output_path, "failure"), "w") as s: 175 | s.write("Exception during batch inference: " + str(e) + "\n" + trc) 176 | # Logging this causes the exception to appear in the training job logs, as well. 177 | logger.error( 178 | "Exception during batch inference: " + str(e) + "\n" + trc 179 | ) 180 | # A non-zero exit code causes the training job to be marked as Failed. 181 | sys.exit(255) 182 | 183 | 184 | def process_batch_results(texts, results, labels_count=None): 185 | processed_results = [] 186 | for i, result in enumerate(results): 187 | processed = OrderedDict() 188 | processed["text"] = texts[i] 189 | result = result[:labels_count] if labels_count else result 190 | for index, label in enumerate(result): 191 | processed["label_{}".format(index + 1)] = label[0] 192 | processed["confidence_{}".format(index + 1)] = label[1] 193 | processed_results.append(processed) 194 | 195 | return processed_results 196 | 197 | 198 | def random_seed(seed_value): 199 | random.seed(seed_value) # Python 200 | np.random.seed(seed_value) # cpu vars 201 | 202 | torch.manual_seed(seed_value) # cpu vars 203 | 204 | if torch.cuda.is_available(): 205 | torch.cuda.manual_seed(seed_value) 206 | torch.cuda.manual_seed_all(seed_value) # gpu vars 207 | torch.backends.cudnn.deterministic = True # needed 208 | torch.backends.cudnn.benchmark = False 209 | 210 | 211 | if __name__ == "__main__": 212 | train() 213 | 214 | # A zero exit code causes the job to be marked as Succeeded. 215 | sys.exit(0) 216 | -------------------------------------------------------------------------------- /container/bert_batch/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file.
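As an aside on the batch container above: process_batch_results flattens each text's ranked (label, confidence) pairs into numbered columns before out.csv is written. A small illustrative run with made-up predictions (not real model output), reusing the same logic:

from collections import OrderedDict
import pandas as pd

def process_batch_results(texts, results, labels_count=None):
    # Same logic as the helper in the batch train script above: one row per input
    # text, with the top labels_count (label, confidence) pairs flattened into columns.
    processed_results = []
    for i, result in enumerate(results):
        processed = OrderedDict()
        processed["text"] = texts[i]
        result = result[:labels_count] if labels_count else result
        for index, label in enumerate(result):
            processed["label_{}".format(index + 1)] = label[0]
            processed["confidence_{}".format(index + 1)] = label[1]
        processed_results.append(processed)
    return processed_results

texts = ["great film", "terrible plot"]
# predict_batch returns, per text, (label, confidence) pairs ordered by confidence;
# the values below are invented purely to show the output shape.
fake_predictions = [
    [("positive", 0.97), ("negative", 0.03)],
    [("negative", 0.91), ("positive", 0.09)],
]
rows = process_batch_results(texts, fake_predictions, labels_count=2)
print(pd.DataFrame(rows))
# columns: text, label_1, confidence_1, label_2, confidence_2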
6 | 7 | app = myapp.app -------------------------------------------------------------------------------- /container/build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-fast-bert" 9 | 10 | TAG="$1" 11 | 12 | # parameters 13 | FASTAI_VERSION="1.0" 14 | PY_VERSION="py36" 15 | 16 | # Get the account number associated with the current IAM credentials 17 | account=$(aws sts get-caller-identity --query Account --output text) 18 | 19 | if [ $? -ne 0 ] 20 | then 21 | exit 255 22 | fi 23 | 24 | chmod +x bert/train 25 | chmod +x bert/serve 26 | 27 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 28 | region=$(aws configure get region) 29 | region=${region:-us-west-2} 30 | 31 | # If the repository doesn't exist in ECR, create it. 32 | 33 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 34 | 35 | if [ $? -ne 0 ] 36 | then 37 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 38 | fi 39 | 40 | # Get the login command from ECR and execute it directly 41 | $(aws ecr get-login --region ${region} --no-include-email) 42 | # aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 579360261297.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert 43 | 44 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 45 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 46 | 47 | # loop for each architecture (cpu & gpu) 48 | for arch in gpu 49 | do 50 | echo "Building image with arch=${arch}, region=${region}" 51 | 52 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}" 53 | docker build -t ${IMAGE}:${TAG} --build-arg ARCH="$arch" -f "Dockerfile" . 54 | docker tag ${IMAGE}:${TAG} ${FULLNAME} 55 | docker push ${FULLNAME} 56 | done 57 | -------------------------------------------------------------------------------- /container/build_and_push_batch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-fast-bert" 9 | 10 | TAG="$1" 11 | 12 | # parameters 13 | FASTAI_VERSION="1.0" 14 | PY_VERSION="py36" 15 | 16 | # Get the account number associated with the current IAM credentials 17 | account=$(aws sts get-caller-identity --query Account --output text) 18 | 19 | if [ $? -ne 0 ] 20 | then 21 | exit 255 22 | fi 23 | 24 | chmod +x bert_batch/train 25 | 26 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 27 | region=$(aws configure get region) 28 | region=${region:-us-west-2} 29 | 30 | # If the repository doesn't exist in ECR, create it. 31 | 32 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 33 | 34 | if [ $? 
-ne 0 ] 35 | then 36 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 37 | fi 38 | 39 | # Get the login command from ECR and execute it directly 40 | $(aws ecr get-login --region ${region} --no-include-email) 41 | # aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 579360261297.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert 42 | 43 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 44 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 45 | 46 | # loop for each architecture (cpu & gpu) 47 | for arch in gpu 48 | do 49 | echo "Building image with arch=${arch}, region=${region}" 50 | 51 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}-batch" 52 | docker build -t ${IMAGE}:${TAG}-batch --build-arg ARCH="$arch" -f "batch.Dockerfile" . 53 | docker tag ${IMAGE}:${TAG}-batch ${FULLNAME} 54 | docker push ${FULLNAME} 55 | done 56 | -------------------------------------------------------------------------------- /container/pytorch_build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-sagemaker-fast-bert" 9 | 10 | # parameters 11 | FASTAI_VERSION="1.0" 12 | PY_VERSION="py36" 13 | 14 | # Get the account number associated with the current IAM credentials 15 | account=$(aws sts get-caller-identity --query Account --output text) 16 | 17 | if [ $? -ne 0 ] 18 | then 19 | exit 255 20 | fi 21 | 22 | chmod +x bert/train.py 23 | chmod +x bert/serve.py 24 | 25 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 26 | region=$(aws configure get region) 27 | region=${region:-eu-west-1} 28 | 29 | # If the repository doesn't exist in ECR, create it. 30 | 31 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 32 | 33 | if [ $? -ne 0 ] 34 | then 35 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 36 | fi 37 | 38 | # Get the login command from ECR and execute it directly 39 | $(aws ecr get-login --region ${region} --no-include-email) 40 | 41 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 42 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 43 | 44 | # loop for each architecture (cpu & gpu) 45 | 46 | echo "Building image with arch=gpu, region=${region}" 47 | TAG="pytorch-gpu-${PY_VERSION}" 48 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}" 49 | docker build -t ${IMAGE}:${TAG} --no-cache --build-arg ARCH="gpu" -f "Dockerfile_pytorch_nvidia" . 
50 | docker tag ${IMAGE}:${TAG} ${FULLNAME} 51 | docker push ${FULLNAME} 52 | -------------------------------------------------------------------------------- /container_lm/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | ARG ARCH=gpu 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | ARG py_version=3 5 | 6 | # Validate that arguments are specified 7 | RUN test $py_version || exit 1 8 | 9 | RUN echo $py_version 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | ca-certificates \ 17 | libjpeg-dev \ 18 | nginx \ 19 | jq \ 20 | libsm6 \ 21 | libxext6 \ 22 | libxrender-dev \ 23 | nginx \ 24 | libpng-dev && \ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 28 | chmod +x ~/miniconda.sh && \ 29 | ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh && \ 31 | /opt/conda/bin/conda install -y python=3.7 numpy pyyaml scipy ipython mkl mkl-include ninja cython && \ 32 | /opt/conda/bin/conda install -y -c pytorch magma-cuda100 && \ 33 | /opt/conda/bin/conda clean -ya 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | RUN pip install --upgrade pip 37 | 38 | RUN python --version 39 | RUN pip --version 40 | 41 | # #RUN df -a 42 | 43 | RUN pip install --trusted-host pypi.python.org -v --log /tmp/pip.log torch torchvision 44 | 45 | 46 | # Python won’t try to write .pyc or .pyo files on the import of source modules 47 | # Force stdin, stdout and stderr to be totally unbuffered. Good for logging 48 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 49 | 50 | RUN nvcc --version 51 | RUN which nvcc 52 | 53 | RUN pip --no-cache-dir install \ 54 | flask \ 55 | pathlib \ 56 | gunicorn \ 57 | scipy \ 58 | scikit-learn \ 59 | pandas \ 60 | fastprogress \ 61 | python-box \ 62 | tensorboardX 63 | 64 | # RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 65 | # RUN pip --no-cache-dir install fast-bert 66 | RUN pip install fast-bert==1.9.9 67 | # RUN pip install transformers==3.2.0 68 | 69 | RUN pip install cryptography --upgrade && \ 70 | pip install urllib3 --upgrade 71 | 72 | ENV PATH="/opt/ml/code:${PATH}" 73 | COPY /bert /opt/ml/code 74 | 75 | WORKDIR /opt/ml/code 76 | 77 | RUN cd $WORKDIR 78 | 79 | -------------------------------------------------------------------------------- /container_lm/bert/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | 27 | location ~ ^/(ping|invocations) { 28 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 29 | proxy_set_header Host $http_host; 30 | proxy_redirect off; 31 | proxy_pass http://gunicorn; 32 | } 33 | 34 | location / { 35 | return 404 "{}"; 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- 
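The nginx config above only exposes /ping and /invocations and proxies both to the gunicorn-hosted Flask predictor on port 8080; these are also the two routes SageMaker calls when the image is deployed. A minimal smoke-test client, assuming one of the serving containers in this repo (for example the classification or NER image) is already running locally with port 8080 published and that the requests package is available:

import requests

BASE_URL = "http://localhost:8080"  # assumes the container was started with -p 8080:8080

# Health check: the predictors return 200 once the model has loaded successfully
ping = requests.get(BASE_URL + "/ping", timeout=10)
print("ping:", ping.status_code)

# JSON invocation: the predictors in this repo read a "text" field (plus an optional "bing_key")
payload = {"text": "This movie was surprisingly good."}
response = requests.post(BASE_URL + "/invocations", json=payload, timeout=60)
print("invocations:", response.status_code)
print(response.json())  # e.g. a list of (label, confidence) pairs for the classification image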
/container_lm/bert/train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import json 5 | import pickle 6 | import sys 7 | import traceback 8 | import pandas as pd 9 | import datetime 10 | from pathlib import Path 11 | 12 | import logging 13 | import math 14 | from dataclasses import dataclass, field 15 | from typing import Optional 16 | 17 | import torch 18 | 19 | from transformers import ( 20 | CONFIG_MAPPING, 21 | MODEL_WITH_LM_HEAD_MAPPING, 22 | AutoConfig, 23 | AutoModelWithLMHead, 24 | AutoTokenizer, 25 | DataCollatorForLanguageModeling, 26 | HfArgumentParser, 27 | LineByLineTextDataset, 28 | PreTrainedTokenizer, 29 | TextDataset, 30 | Trainer, 31 | TrainingArguments, 32 | set_seed, 33 | ) 34 | 35 | run_start_time = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S") 36 | 37 | channel_name = "training" 38 | 39 | prefix = "/opt/ml/" 40 | input_path = prefix + "input/data" # opt/ml/input/data 41 | code_path = prefix + "code" # opt/ml/code 42 | 43 | output_path = os.path.join(prefix, "output") # opt/ml/output 44 | model_path = os.path.join(prefix, "model") # opt/ml/model 45 | 46 | training_config_path = os.path.join( 47 | input_path, "{}/config".format(channel_name) 48 | ) # opt/ml/input/data/training/config 49 | 50 | hyperparam_path = os.path.join( 51 | prefix, "input/config/hyperparameters.json" 52 | ) # opt/ml/input/config/hyperparameters.json 53 | config_path = os.path.join( 54 | training_config_path, "training_config.json" 55 | ) # opt/ml/input/data/training/config/training_config.json 56 | 57 | 58 | training_path = os.path.join(input_path, channel_name) # opt/ml/input/data/training 59 | 60 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 61 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 62 | 63 | 64 | @dataclass 65 | class ModelArguments: 66 | """ 67 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 68 | """ 69 | 70 | model_name_or_path: Optional[str] = field( 71 | default=None, 72 | metadata={ 73 | "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch." 74 | }, 75 | ) 76 | model_type: Optional[str] = field( 77 | default=None, 78 | metadata={ 79 | "help": "If training from scratch, pass a model type from the list: " 80 | + ", ".join(MODEL_TYPES) 81 | }, 82 | ) 83 | config_name: Optional[str] = field( 84 | default=None, 85 | metadata={ 86 | "help": "Pretrained config name or path if not the same as model_name" 87 | }, 88 | ) 89 | tokenizer_name: Optional[str] = field( 90 | default=None, 91 | metadata={ 92 | "help": "Pretrained tokenizer name or path if not the same as model_name" 93 | }, 94 | ) 95 | cache_dir: Optional[str] = field( 96 | default=None, 97 | metadata={ 98 | "help": "Where do you want to store the pretrained models downloaded from s3" 99 | }, 100 | ) 101 | 102 | 103 | @dataclass 104 | class DataTrainingArguments: 105 | """ 106 | Arguments pertaining to what data we are going to input our model for training and eval. 107 | """ 108 | 109 | train_data_file: Optional[str] = field( 110 | default=None, metadata={"help": "The input training data file (a text file)."} 111 | ) 112 | eval_data_file: Optional[str] = field( 113 | default=None, 114 | metadata={ 115 | "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." 
116 | }, 117 | ) 118 | line_by_line: bool = field( 119 | default=False, 120 | metadata={ 121 | "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." 122 | }, 123 | ) 124 | 125 | mlm: bool = field( 126 | default=False, 127 | metadata={ 128 | "help": "Train with masked-language modeling loss instead of language modeling." 129 | }, 130 | ) 131 | mlm_probability: float = field( 132 | default=0.15, 133 | metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}, 134 | ) 135 | plm_probability: float = field( 136 | default=1 / 6, 137 | metadata={ 138 | "help": "Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling." 139 | }, 140 | ) 141 | max_span_length: int = field( 142 | default=5, 143 | metadata={ 144 | "help": "Maximum length of a span of masked tokens for permutation language modeling." 145 | }, 146 | ) 147 | 148 | block_size: int = field( 149 | default=-1, 150 | metadata={ 151 | "help": "Optional input sequence length after tokenization." 152 | "The training dataset will be truncated in block of this size for training." 153 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 154 | }, 155 | ) 156 | overwrite_cache: bool = field( 157 | default=False, 158 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 159 | ) 160 | 161 | 162 | def get_dataset( 163 | args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False 164 | ): 165 | file_path = args.eval_data_file if evaluate else args.train_data_file 166 | if args.line_by_line: 167 | return LineByLineTextDataset( 168 | tokenizer=tokenizer, file_path=file_path, block_size=args.block_size 169 | ) 170 | else: 171 | return TextDataset( 172 | tokenizer=tokenizer, 173 | file_path=file_path, 174 | block_size=args.block_size, 175 | overwrite_cache=args.overwrite_cache, 176 | ) 177 | 178 | 179 | # The function to execute the training. 
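For reference, the train() function defined just below reads two JSON files that SageMaker places inside the container: opt/ml/input/data/training/config/training_config.json and opt/ml/input/config/hyperparameters.json. A hedged sketch of a minimal pair covering the keys this script actually consumes; the values are illustrative only, and booleans are passed as the strings "True"/"False" because the script compares against "True":

import json

# Illustrative values, not recommendations; keys mirror what train() reads below.
training_config = {
    "model_name": "bert-base-uncased",
    "train_file": "train.txt",
    "val_file": "val.txt",
    "line_by_line": "True",
    "mlm": "True",
    "mlm_probability": "0.15",
    "block_size": "128",
    "fp16": "False",
    "fp16_opt_level": "O1",
    "use_fast_tokenizer": "True",
    "random_state": "42",
    "grad_accumulation_steps": "1",
    "logging_steps": "50",
}

hyperparameters = {
    "train_batch_size": "8",
    "epochs": "1",
    "lr": "5e-5",
    "warmup_steps": "100",
}

# Inside the container these live under /opt/ml/input/...; written locally here for illustration.
with open("training_config.json", "w") as f:
    json.dump(training_config, f, indent=2)
with open("hyperparameters.json", "w") as f:
    json.dump(hyperparameters, f, indent=2)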
180 | def train(): 181 | 182 | print("Starting the training.") 183 | 184 | DATA_PATH = Path(training_path) 185 | 186 | try: 187 | print(config_path) 188 | with open(config_path, "r") as f: 189 | training_config = json.load(f) 190 | print(training_config) 191 | 192 | with open(hyperparam_path, "r") as tc: 193 | hyperparameters = json.load(tc) 194 | print(hyperparameters) 195 | 196 | # convert string bools to booleans 197 | training_config["train_file"] = training_config.get("train_file", "train.csv") 198 | training_config["val_file"] = training_config.get("val_file", "val.csv") 199 | training_config["fp16"] = training_config["fp16"] == "True" 200 | training_config["line_by_line"] = training_config["line_by_line"] == "True" 201 | training_config["use_fast_tokenizer"] = ( 202 | training_config.get("use_fast_tokenizer", "True") == "True" 203 | ) 204 | training_config["mlm"] = training_config["mlm"] == "True" 205 | training_config["mlm_probability"] = float( 206 | training_config.get("mlm_probability", 0.15) 207 | ) 208 | training_config["block_size"] = int(training_config.get("block_size", -1)) 209 | 210 | training_config["random_state"] = ( 211 | int(training_config.get("random_state")) 212 | if training_config.get("random_state") 213 | else None 214 | ) 215 | 216 | training_config["train_size"] = float(training_config.get("train_size", 0.8)) 217 | 218 | data_args = DataTrainingArguments( 219 | train_data_file=str(DATA_PATH / training_config["train_file"]), 220 | eval_data_file=str(DATA_PATH / training_config["val_file"]), 221 | line_by_line=training_config["line_by_line"], 222 | mlm=training_config["mlm"], 223 | mlm_probability=training_config["mlm_probability"], 224 | block_size=training_config["block_size"], 225 | ) 226 | 227 | training_args = TrainingArguments( 228 | output_dir=model_path, 229 | overwrite_output_dir=True, 230 | do_train=True, 231 | do_eval=True, 232 | evaluate_during_training=True, 233 | per_device_train_batch_size=int(hyperparameters["train_batch_size"]), 234 | per_device_eval_batch_size=int(hyperparameters["train_batch_size"]) * 2, 235 | gradient_accumulation_steps=int(training_config["grad_accumulation_steps"]), 236 | warmup_steps=int(hyperparameters["warmup_steps"]), 237 | logging_steps=int(training_config["logging_steps"]), 238 | fp16=training_config["fp16"], 239 | fp16_opt_level=training_config["fp16_opt_level"], 240 | seed=training_config["random_state"], 241 | num_train_epochs=int(hyperparameters["epochs"]), 242 | learning_rate=float(hyperparameters["lr"]), 243 | save_steps=0, 244 | ) 245 | 246 | # Logger 247 | # logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, training_config["run_text"])) 248 | logging.basicConfig( 249 | level=logging.INFO, 250 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 251 | datefmt="%m/%d/%Y %H:%M:%S", 252 | handlers=[ 253 | # logging.FileHandler(logfile), 254 | logging.StreamHandler(sys.stdout) 255 | ], 256 | ) 257 | 258 | logger = logging.getLogger() 259 | 260 | set_seed(training_args.seed) 261 | 262 | # use auto-tokenizer 263 | tokenizer = AutoTokenizer.from_pretrained( 264 | training_config["model_name"], 265 | use_fast=training_config["use_fast_tokenizer"], 266 | ) 267 | 268 | config = AutoConfig.from_pretrained(training_config["model_name"]) 269 | 270 | model = AutoModelWithLMHead.from_pretrained( 271 | training_config["model_name"], config=config 272 | ) 273 | model.resize_token_embeddings(len(tokenizer)) 274 | 275 | if ( 276 | config.model_type in ["bert", "roberta", "distilbert", "camembert"] 277 | 
and not data_args.mlm 278 | ): 279 | raise ValueError( 280 | "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the" 281 | "--mlm flag (masked language modeling)." 282 | ) 283 | 284 | if data_args.block_size <= 0: 285 | data_args.block_size = tokenizer.max_len 286 | # Our input block size will be the max possible for the model 287 | else: 288 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 289 | 290 | # Get datasets 291 | 292 | train_dataset = get_dataset(data_args, tokenizer=tokenizer) 293 | eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) 294 | 295 | data_collator = DataCollatorForLanguageModeling( 296 | tokenizer=tokenizer, 297 | mlm=data_args.mlm, 298 | mlm_probability=data_args.mlm_probability, 299 | ) 300 | 301 | # Initialize our Trainer 302 | trainer = Trainer( 303 | model=model, 304 | args=training_args, 305 | data_collator=data_collator, 306 | train_dataset=train_dataset, 307 | eval_dataset=eval_dataset, 308 | prediction_loss_only=True, 309 | ) 310 | 311 | # Run pre-validation 312 | if training_args.do_eval: 313 | logger.info("*** Evaluate before training ***") 314 | logger.info(validate(trainer, logger)) 315 | 316 | trainer.train() 317 | 318 | trainer.save_model() 319 | # For convenience, we also re-save the tokenizer to the same directory, 320 | # so that you can share your model easily on huggingface.co/models =) 321 | if trainer.is_world_master(): 322 | tokenizer.save_pretrained(training_args.output_dir) 323 | 324 | # Run validation 325 | if training_args.do_eval: 326 | logger.info("*** Evaluate ***") 327 | # logger.info(validate(trainer, logger)) 328 | 329 | # save model config file 330 | with open(os.path.join(model_path, "model_config.json"), "w") as f: 331 | json.dump(training_config, f) 332 | 333 | except Exception as e: 334 | # Write out an error file. This will be returned as the failureReason in the 335 | # DescribeTrainingJob result. 336 | trc = traceback.format_exc() 337 | with open(os.path.join(output_path, "failure"), "w") as s: 338 | s.write("Exception during training: " + str(e) + "\n" + trc) 339 | # Printing this causes the exception to be in the training job logs, as well. 340 | print("Exception during training: " + str(e) + "\n" + trc, file=sys.stderr) 341 | # A non-zero exit code causes the training job to be marked as Failed. 342 | sys.exit(255) 343 | 344 | 345 | def validate(trainer: Trainer, logger): 346 | results = {} 347 | eval_output = trainer.evaluate() 348 | 349 | perplexity = math.exp(eval_output["eval_loss"]) 350 | result = {"perplexity": perplexity} 351 | 352 | results.update(result) 353 | 354 | return results 355 | 356 | 357 | if __name__ == "__main__": 358 | train() 359 | 360 | # A zero exit code causes the job to be marked a Succeeded. 361 | sys.exit(0) 362 | -------------------------------------------------------------------------------- /container_lm/bert/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 
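A side note on the validate() helper in the language-model train script above: it reports perplexity simply as the exponential of the evaluation loss, for example:

import math

eval_loss = 2.3  # hypothetical average cross-entropy loss from trainer.evaluate()
print(round(math.exp(eval_loss), 2))  # perplexity of roughly 9.97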
6 | 7 | app = myapp.app -------------------------------------------------------------------------------- /container_lm/build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-fast-bert-lm" 9 | 10 | TAG="$1" 11 | 12 | # parameters 13 | FASTAI_VERSION="1.0" 14 | PY_VERSION="py36" 15 | 16 | # Get the account number associated with the current IAM credentials 17 | account=$(aws sts get-caller-identity --query Account --output text) 18 | 19 | if [ $? -ne 0 ] 20 | then 21 | exit 255 22 | fi 23 | 24 | chmod +x bert/train 25 | 26 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 27 | region=$(aws configure get region) 28 | region=${region:-us-west-2} 29 | 30 | # If the repository doesn't exist in ECR, create it. 31 | 32 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 33 | 34 | if [ $? -ne 0 ] 35 | then 36 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 37 | fi 38 | 39 | # Get the login command from ECR and execute it directly 40 | $(aws ecr get-login --region ${region} --no-include-email) 41 | # aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 579360261297.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert 42 | 43 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 44 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 45 | 46 | # loop for each architecture (cpu & gpu) 47 | for arch in gpu 48 | do 49 | echo "Building image with arch=${arch}, region=${region}" 50 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}" 51 | docker build -t ${IMAGE}:${TAG} --build-arg ARCH="$arch" -f "Dockerfile" . 
52 | docker tag ${IMAGE}:${TAG} ${FULLNAME} 53 | docker push ${FULLNAME} 54 | done 55 | -------------------------------------------------------------------------------- /container_ner/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | ARG ARCH=gpu 3 | 4 | ARG py_version=3 5 | 6 | # Validate that arguments are specified 7 | RUN test $py_version || exit 1 8 | 9 | RUN echo $py_version 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | ca-certificates \ 17 | libjpeg-dev \ 18 | nginx \ 19 | jq \ 20 | libsm6 \ 21 | libxext6 \ 22 | libxrender-dev \ 23 | nginx \ 24 | libpng-dev && \ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 28 | chmod +x ~/miniconda.sh && \ 29 | ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh && \ 31 | /opt/conda/bin/conda install -y python=3.7 numpy pyyaml scipy ipython mkl mkl-include ninja cython && \ 32 | /opt/conda/bin/conda install -y -c pytorch magma-cuda100 && \ 33 | /opt/conda/bin/conda clean -ya 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | RUN pip install --upgrade pip 37 | 38 | RUN python --version 39 | RUN pip --version 40 | 41 | # #RUN df -a 42 | 43 | RUN pip install --trusted-host pypi.python.org -v --log /tmp/pip.log torch torchvision 44 | 45 | 46 | # Python won’t try to write .pyc or .pyo files on the import of source modules 47 | # Force stdin, stdout and stderr to be totally unbuffered. Good for logging 48 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 49 | 50 | RUN nvcc --version 51 | RUN which nvcc 52 | 53 | RUN pip --no-cache-dir install \ 54 | flask \ 55 | pathlib \ 56 | gunicorn \ 57 | gevent \ 58 | scipy \ 59 | scikit-learn \ 60 | pandas \ 61 | fastprogress \ 62 | python-box \ 63 | tensorboardX \ 64 | fastai 65 | 66 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 67 | 68 | # RUN pip --no-cache-dir install fast-bert 69 | RUN pip install fast-bert 70 | 71 | ENV PATH="/opt/ml/code:${PATH}" 72 | COPY /bert /opt/ml/code 73 | 74 | WORKDIR /opt/ml/code 75 | 76 | RUN cd $WORKDIR 77 | 78 | RUN python download_pretrained_models.py --location_dir ./pretrained_models/ --models bert-base-uncased roberta-base distilbert-base-uncased distilroberta-base 79 | 80 | RUN rm -rf /opt/ml/input/data/training/cache/ 81 | -------------------------------------------------------------------------------- /container_ner/bert/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | 27 | location ~ ^/(ping|invocations) { 28 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 29 | proxy_set_header Host $http_host; 30 | proxy_redirect off; 31 | proxy_pass http://gunicorn; 32 | } 33 | 34 | location / { 35 | return 404 "{}"; 36 | } 37 | 
} 38 | } -------------------------------------------------------------------------------- /container_ner/bert/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import sys 5 | import signal 6 | import traceback 7 | import re 8 | import flask 9 | 10 | import torch 11 | 12 | from fast_bert.prediction_ner import BertNERPredictor 13 | 14 | from fast_bert.utils.spellcheck import BingSpellCheck 15 | from pathlib import Path 16 | 17 | import warnings 18 | 19 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 20 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 21 | 22 | prefix = "/opt/ml/" 23 | 24 | # PATH = Path(os.path.join(prefix, "model")) 25 | PATH = os.path.join(prefix, "model") 26 | 27 | MODEL_PATH = os.path.join(PATH, "pytorch_model.bin") 28 | 29 | # request_text = None 30 | 31 | 32 | class ScoringService(object): 33 | model = None # Where we keep the model when it's loaded 34 | 35 | @classmethod 36 | def get_predictor_model(cls): 37 | 38 | # print(cls.searching_all_files(PATH)) 39 | # Get model predictor 40 | if cls.model is None: 41 | with open(os.path.join(PATH, "model_config.json")) as f: 42 | model_config = json.load(f) 43 | 44 | predictor = BertNERPredictor( 45 | os.path.join(PATH, "model_out"), 46 | label_path=PATH, 47 | model_type=model_config["model_type"], 48 | do_lower_case=model_config.get("do_lower_case", "True") == "True", 49 | use_fast_tokenizer=model_config.get("use_fast_tokenizer", "True") 50 | == "True", 51 | ) 52 | cls.model = predictor 53 | 54 | return cls.model 55 | 56 | @classmethod 57 | def predict(cls, text, bing_key=None): 58 | """For the input, do the predictions and return them. 59 | Args: 60 | input (a pandas dataframe): The data on which to do the predictions. There will be 61 | one prediction per row in the dataframe""" 62 | predictor_model = cls.get_predictor_model() 63 | if bing_key: 64 | spellChecker = BingSpellCheck(bing_key) 65 | text = spellChecker.spell_check(text) 66 | prediction = predictor_model.predict(text) 67 | 68 | return prediction 69 | 70 | @classmethod 71 | def searching_all_files(cls, directory: Path): 72 | file_list = [] # A list for storing files existing in directories 73 | 74 | for x in directory.iterdir(): 75 | if x.is_file(): 76 | file_list.append(str(x)) 77 | else: 78 | file_list.append(cls.searching_all_files(x)) 79 | 80 | return file_list 81 | 82 | 83 | # The flask app for serving predictions 84 | app = flask.Flask(__name__) 85 | 86 | 87 | @app.route("/ping", methods=["GET"]) 88 | def ping(): 89 | """Determine if the container is working and healthy. In this sample container, we declare 90 | it healthy if we can load the model successfully.""" 91 | health = ( 92 | ScoringService.get_predictor_model() is not None 93 | ) # You can insert a health check here 94 | 95 | status = 200 if health else 404 96 | return flask.Response(response="\n", status=status, mimetype="application/json") 97 | 98 | 99 | @app.route("/invocations", methods=["POST"]) 100 | def transformation(): 101 | """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert 102 | it to a pandas data frame for internal use and then convert the predictions back to CSV (which really 103 | just means one prediction per line, since there's a single column. 
104 | """ 105 | data = None 106 | text = None 107 | 108 | if flask.request.content_type == "application/json": 109 | print("calling json launched") 110 | data = flask.request.get_json(silent=True) 111 | 112 | text = data["text"] 113 | try: 114 | bing_key = data["bing_key"] 115 | except Exception: 116 | bing_key = None 117 | 118 | else: 119 | return flask.Response( 120 | response="This predictor only supports JSON data", 121 | status=415, 122 | mimetype="text/plain", 123 | ) 124 | 125 | print("Invoked with text: {}.".format(text.encode("utf-8"))) 126 | 127 | # Do the prediction 128 | predictions = ScoringService.predict(text, bing_key) 129 | 130 | result = json.dumps(predictions[:10]) 131 | 132 | return flask.Response(response=result, status=200, mimetype="application/json") 133 | -------------------------------------------------------------------------------- /container_ner/bert/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | from __future__ import print_function 17 | import multiprocessing 18 | import os 19 | import signal 20 | import subprocess 21 | import sys 22 | 23 | cpu_count = multiprocessing.cpu_count() 24 | 25 | model_server_timeout = os.environ.get('MODEL_SERVER_TIMEOUT', 60) 26 | model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', cpu_count)) 27 | 28 | def sigterm_handler(nginx_pid, gunicorn_pid): 29 | try: 30 | os.kill(nginx_pid, signal.SIGQUIT) 31 | except OSError: 32 | pass 33 | try: 34 | os.kill(gunicorn_pid, signal.SIGTERM) 35 | except OSError: 36 | pass 37 | 38 | sys.exit(0) 39 | 40 | def start_server(): 41 | print('Starting the inference server with {} workers.'.format(model_server_workers)) 42 | 43 | 44 | # link the log streams to stdout/err so they will be logged to the container logs 45 | subprocess.check_call(['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log']) 46 | subprocess.check_call(['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log']) 47 | 48 | nginx = subprocess.Popen(['nginx', '-c', '/opt/ml/code/nginx.conf']) 49 | gunicorn = subprocess.Popen(['gunicorn', 50 | '--timeout', str(model_server_timeout), 51 | '-k', 'gevent', 52 | '-b', 'unix:/tmp/gunicorn.sock', 53 | '-w', str(model_server_workers), 54 | 'wsgi:app']) 55 | 56 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 57 | 58 | # If either subprocess exits, so do we. 59 | pids = set([nginx.pid, gunicorn.pid]) 60 | while True: 61 | pid, _ = os.wait() 62 | if pid in pids: 63 | break 64 | 65 | sigterm_handler(nginx.pid, gunicorn.pid) 66 | print('Inference server exiting') 67 | 68 | # The main routine just invokes the start function. 
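The serve script above honours two environment variables, MODEL_SERVER_WORKERS (defaults to the CPU core count) and MODEL_SERVER_TIMEOUT (defaults to 60 seconds). A hedged sketch of launching a pushed image locally with both overridden; the image name and model path are placeholders, not values taken from this repo:

import subprocess

cmd = [
    "docker", "run", "--rm", "-p", "8080:8080",
    "-e", "MODEL_SERVER_WORKERS=2",     # overrides the CPU-core-count default in serve
    "-e", "MODEL_SERVER_TIMEOUT=120",   # gunicorn worker timeout in seconds
    "-v", "/path/to/model:/opt/ml/model",  # hypothetical local model artefacts
    "123456789012.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert-ner:latest",  # placeholder image
    "serve",  # the serve script is on PATH inside the image
]
subprocess.run(cmd, check=True)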
69 | 70 | if __name__ == '__main__': 71 | start_server() -------------------------------------------------------------------------------- /container_ner/bert/train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import json 5 | import pickle 6 | import sys 7 | import traceback 8 | import pandas as pd 9 | import datetime 10 | from pathlib import Path 11 | import logging 12 | 13 | import torch 14 | 15 | from transformers import AutoTokenizer 16 | 17 | 18 | from fast_bert.data_ner import BertNERDataBunch 19 | from fast_bert.learner_ner import BertNERLearner 20 | 21 | run_start_time = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S") 22 | 23 | channel_name = "training" 24 | 25 | prefix = "/opt/ml/" 26 | input_path = prefix + "input/data" # opt/ml/input/data 27 | code_path = prefix + "code" # opt/ml/code 28 | pretrained_model_path = ( 29 | code_path + "/pretrained_models" 30 | ) # opt/ml/code/pretrained_models 31 | 32 | finetuned_path = input_path + "/{}/finetuned".format( 33 | channel_name 34 | ) # opt/ml/input/data/training/finetuned 35 | 36 | output_path = os.path.join(prefix, "output") # opt/ml/output 37 | model_path = os.path.join(prefix, "model") # opt/ml/model 38 | 39 | training_config_path = os.path.join( 40 | input_path, "{}/config".format(channel_name) 41 | ) # opt/ml/input/data/training/config 42 | 43 | hyperparam_path = os.path.join( 44 | prefix, "input/config/hyperparameters.json" 45 | ) # opt/ml/input/config/hyperparameters.json 46 | config_path = os.path.join( 47 | training_config_path, "training_config.json" 48 | ) # opt/ml/input/data/training/config/training_config.json 49 | 50 | 51 | # This algorithm has a single channel of input data called 'training'. Since we run in 52 | # File mode, the input files are copied to the directory specified here. 53 | 54 | training_path = os.path.join(input_path, channel_name) # opt/ml/input/data/training 55 | 56 | 57 | def searching_all_files(directory: Path): 58 | file_list = [] # A list for storing files existing in directories 59 | 60 | for x in directory.iterdir(): 61 | if x.is_file(): 62 | file_list.append(str(x)) 63 | else: 64 | file_list.append(searching_all_files(x)) 65 | 66 | return file_list 67 | 68 | 69 | # The function to execute the training. 
70 | def train(): 71 | 72 | print("Starting the training.") 73 | 74 | DATA_PATH = Path(training_path) 75 | 76 | try: 77 | print(config_path) 78 | with open(config_path, "r") as f: 79 | training_config = json.load(f) 80 | print(training_config) 81 | 82 | with open(hyperparam_path, "r") as tc: 83 | hyperparameters = json.load(tc) 84 | print(hyperparameters) 85 | 86 | # convert string bools to booleans 87 | training_config["fp16"] = training_config["fp16"] == "True" 88 | training_config["use_fast_tokenizer"] = ( 89 | training_config.get("use_fast_tokenizer", "True") == "True" 90 | ) 91 | training_config["jsonl_file"] = training_config.get("jsonl_file", "data.jsonl") 92 | 93 | training_config["random_state"] = ( 94 | int(training_config.get("random_state")) 95 | if training_config.get("random_state") 96 | else None 97 | ) 98 | 99 | training_config["train_size"] = float(training_config.get("train_size", 0.8)) 100 | 101 | # Logger 102 | # logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, training_config["run_text"])) 103 | logging.basicConfig( 104 | level=logging.INFO, 105 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 106 | datefmt="%m/%d/%Y %H:%M:%S", 107 | handlers=[ 108 | # logging.FileHandler(logfile), 109 | logging.StreamHandler(sys.stdout) 110 | ], 111 | ) 112 | 113 | logger = logging.getLogger() 114 | 115 | # Define pretrained model path 116 | PRETRAINED_PATH = Path(pretrained_model_path) / training_config["model_name"] 117 | if PRETRAINED_PATH.is_dir(): 118 | logger.info("model path used {}".format(PRETRAINED_PATH)) 119 | model_name_path = str(PRETRAINED_PATH) 120 | else: 121 | model_name_path = training_config["model_name"] 122 | logger.info( 123 | "model {} is not preloaded. Will try to download.".format( 124 | model_name_path 125 | ) 126 | ) 127 | 128 | finetuned_model_name = training_config.get("finetuned_model", None) 129 | if finetuned_model_name is not None: 130 | finetuned_model = os.path.join(finetuned_path, finetuned_model_name) 131 | logger.info("finetuned model loaded from {}".format(finetuned_model)) 132 | else: 133 | logger.info( 134 | "finetuned model not available - loading standard pretrained model" 135 | ) 136 | finetuned_model = None 137 | 138 | # use auto-tokenizer 139 | tokenizer = AutoTokenizer.from_pretrained(model_name_path, use_fast=True) 140 | 141 | device = torch.device("cuda") 142 | if torch.cuda.device_count() > 1: 143 | multi_gpu = True 144 | else: 145 | multi_gpu = False 146 | 147 | logger.info("Number of GPUs: {}".format(torch.cuda.device_count())) 148 | 149 | # Create databunch 150 | databunch = BertNERDataBunch.from_jsonl( 151 | DATA_PATH, 152 | training_config["jsonl_file"], 153 | tokenizer, 154 | clear_cache=True, 155 | batch_size_per_gpu=int(hyperparameters["train_batch_size"]), 156 | max_seq_length=int(hyperparameters["max_seq_length"]), 157 | multi_gpu=multi_gpu, 158 | model_type=training_config["model_type"], 159 | logger=logger, 160 | use_fast_tokenizer=training_config["use_fast_tokenizer"], 161 | train_size=training_config["train_size"], 162 | random_state=training_config["random_state"], 163 | ) 164 | 165 | logger.info("databunch labels: {}".format(len(databunch.labels))) 166 | 167 | # Initialise the learner 168 | learner = BertNERLearner.from_pretrained_model( 169 | databunch, 170 | model_name_path, 171 | output_dir=Path(model_path), 172 | device=device, 173 | logger=logger, 174 | finetuned_wgts_path=finetuned_model, 175 | is_fp16=training_config["fp16"], 176 | fp16_opt_level=training_config["fp16_opt_level"], 177 
| warmup_steps=int(hyperparameters["warmup_steps"]), 178 | grad_accumulation_steps=int(training_config["grad_accumulation_steps"]), 179 | multi_gpu=multi_gpu, 180 | logging_steps=int(training_config["logging_steps"]), 181 | save_steps=int(training_config.get("save_steps", 0)), 182 | ) 183 | 184 | learner.fit(int(hyperparameters["epochs"]), float(hyperparameters["lr"])) 185 | 186 | # Run validation 187 | logger.info(learner.validate()) 188 | 189 | # save model and tokenizer artefacts 190 | learner.save_model() 191 | 192 | # save model config file 193 | with open(os.path.join(model_path, "model_config.json"), "w") as f: 194 | json.dump(training_config, f) 195 | 196 | # save label file 197 | with open(os.path.join(model_path, "labels.txt"), "w") as f: 198 | f.write("\n".join(databunch.labels)) 199 | 200 | except Exception as e: 201 | # Write out an error file. This will be returned as the failureReason in the 202 | # DescribeTrainingJob result. 203 | trc = traceback.format_exc() 204 | with open(os.path.join(output_path, "failure"), "w") as s: 205 | s.write("Exception during training: " + str(e) + "\n" + trc) 206 | # Printing this causes the exception to be in the training job logs, as well. 207 | print("Exception during training: " + str(e) + "\n" + trc, file=sys.stderr) 208 | # A non-zero exit code causes the training job to be marked as Failed. 209 | sys.exit(255) 210 | 211 | 212 | if __name__ == "__main__": 213 | train() 214 | 215 | # A zero exit code causes the job to be marked a Succeeded. 216 | sys.exit(0) 217 | -------------------------------------------------------------------------------- /container_ner/bert/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 6 | 7 | app = myapp.app -------------------------------------------------------------------------------- /container_ner/build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-fast-bert-ner" 9 | 10 | # parameters 11 | FASTAI_VERSION="1.0" 12 | PY_VERSION="py36" 13 | 14 | # Get the account number associated with the current IAM credentials 15 | account=$(aws sts get-caller-identity --query Account --output text) 16 | 17 | if [ $? -ne 0 ] 18 | then 19 | exit 255 20 | fi 21 | 22 | chmod +x bert/train 23 | chmod +x bert/serve 24 | 25 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 26 | region=$(aws configure get region) 27 | region=${region:-us-west-2} 28 | 29 | # If the repository doesn't exist in ECR, create it. 30 | 31 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 32 | 33 | if [ $? 
-ne 0 ] 34 | then 35 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 36 | fi 37 | 38 | # Get the login command from ECR and execute it directly 39 | $(aws ecr get-login --region ${region} --no-include-email) 40 | # aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 579360261297.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert 41 | 42 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 43 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 44 | 45 | # loop for each architecture (cpu & gpu) 46 | for arch in gpu 47 | do 48 | echo "Building image with arch=${arch}, region=${region}" 49 | TAG="${FASTAI_VERSION}-${arch}-${PY_VERSION}" 50 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}" 51 | docker build -t ${IMAGE}:${TAG} --no-cache --build-arg ARCH="$arch" -f "Dockerfile" . 52 | docker tag ${IMAGE}:${TAG} ${FULLNAME} 53 | docker push ${FULLNAME} 54 | done 55 | -------------------------------------------------------------------------------- /container_t5/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | ARG ARCH=gpu 3 | 4 | ARG py_version=3 5 | 6 | # Validate that arguments are specified 7 | RUN test $py_version || exit 1 8 | 9 | RUN echo $py_version 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | ca-certificates \ 17 | libjpeg-dev \ 18 | nginx \ 19 | jq \ 20 | libsm6 \ 21 | libxext6 \ 22 | libxrender-dev \ 23 | nginx \ 24 | libpng-dev && \ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 28 | chmod +x ~/miniconda.sh && \ 29 | ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh && \ 31 | /opt/conda/bin/conda install -y python=3.7 numpy pyyaml scipy ipython mkl mkl-include ninja cython typing && \ 32 | /opt/conda/bin/conda install -y -c pytorch magma-cuda100 && \ 33 | /opt/conda/bin/conda clean -ya 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | RUN pip install --upgrade pip 37 | 38 | RUN python --version 39 | RUN pip --version 40 | 41 | # #RUN df -a 42 | RUN pip install --trusted-host pypi.python.org -v --log /tmp/pip.log torch torchvision 43 | 44 | 45 | # Python won’t try to write .pyc or .pyo files on the import of source modules 46 | # Force stdin, stdout and stderr to be totally unbuffered. Good for logging 47 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 48 | 49 | RUN nvcc --version 50 | RUN which nvcc 51 | 52 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 53 | 54 | COPY requirements.txt ./requirements.txt 55 | RUN pip install -r requirements.txt 56 | 57 | ENV PATH="/opt/ml/code:${PATH}" 58 | COPY /t5 /opt/ml/code 59 | 60 | WORKDIR /opt/ml/code 61 | -------------------------------------------------------------------------------- /container_t5/build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. 
This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-fast-bert-t5" 9 | 10 | # parameters 11 | FASTAI_VERSION="1.0" 12 | PY_VERSION="py36" 13 | 14 | # Get the account number associated with the current IAM credentials 15 | account=$(aws sts get-caller-identity --query Account --output text) 16 | 17 | if [ $? -ne 0 ] 18 | then 19 | exit 255 20 | fi 21 | 22 | chmod +x t5/train 23 | chmod +x t5/serve 24 | 25 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 26 | region=$(aws configure get region) 27 | region=${region:-eu-west-1} 28 | 29 | # If the repository doesn't exist in ECR, create it. 30 | 31 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 32 | 33 | if [ $? -ne 0 ] 34 | then 35 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 36 | fi 37 | 38 | # Get the login command from ECR and execute it directly 39 | $(aws ecr get-login --region ${region} --no-include-email) 40 | # aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 579360261297.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert 41 | 42 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 43 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 44 | 45 | # loop for each architecture (cpu & gpu) 46 | for arch in gpu 47 | do 48 | echo "Building image with arch=${arch}, region=${region}" 49 | TAG="${FASTAI_VERSION}-${arch}-${PY_VERSION}" 50 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}" 51 | docker build -t ${IMAGE}:${TAG} --build-arg ARCH="$arch" . 52 | docker tag ${IMAGE}:${TAG} ${FULLNAME} 53 | docker push ${FULLNAME} 54 | done 55 | -------------------------------------------------------------------------------- /container_t5/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | pathlib 3 | gunicorn 4 | gevent 5 | scipy 6 | scikit-learn 7 | pandas 8 | fastprogress 9 | python-box 10 | tensorboardX 11 | transformers==2.11.0 12 | pytorch_lightning -------------------------------------------------------------------------------- /container_t5/t5/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | 27 | location ~ ^/(ping|invocations) { 28 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 29 | proxy_set_header Host $http_host; 30 | proxy_redirect off; 31 | proxy_pass http://gunicorn; 32 | } 33 | 34 | location / { 35 | return 404 "{}"; 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /container_t5/t5/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import sys 5 | import signal 6 | import traceback 7 | import re 8 | import flask 9 | 10 | import torch 11 | 12 | from 
fast_bert.prediction import BertClassificationPredictor 13 | 14 | from fast_bert.utils.spellcheck import BingSpellCheck 15 | from pathlib import Path 16 | 17 | import warnings 18 | 19 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 20 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 21 | 22 | prefix = "/opt/ml/" 23 | 24 | # PATH = Path(os.path.join(prefix, "model")) 25 | PATH = os.path.join(prefix, "model") 26 | 27 | MODEL_PATH = os.path.join(PATH, "pytorch_model.bin") 28 | 29 | # request_text = None 30 | 31 | 32 | class ScoringService(object): 33 | model = None # Where we keep the model when it's loaded 34 | 35 | @classmethod 36 | def get_predictor_model(cls): 37 | 38 | # print(cls.searching_all_files(PATH)) 39 | # Get model predictor 40 | if cls.model is None: 41 | with open(os.path.join(PATH, "model_config.json")) as f: 42 | model_config = json.load(f) 43 | 44 | predictor = BertClassificationPredictor( 45 | os.path.join(PATH, "model_out"), 46 | label_path=PATH, 47 | multi_label=bool(model_config["multi_label"]), 48 | model_type=model_config["model_type"], 49 | do_lower_case=bool(model_config["do_lower_case"]), 50 | ) 51 | cls.model = predictor 52 | 53 | return cls.model 54 | 55 | @classmethod 56 | def predict(cls, text, bing_key=None): 57 | """For the input, do the predictions and return them. 58 | Args: 59 | input (a pandas dataframe): The data on which to do the predictions. There will be 60 | one prediction per row in the dataframe""" 61 | predictor_model = cls.get_predictor_model() 62 | if bing_key: 63 | spellChecker = BingSpellCheck(bing_key) 64 | text = spellChecker.spell_check(text) 65 | prediction = predictor_model.predict(text) 66 | 67 | return prediction 68 | 69 | @classmethod 70 | def searching_all_files(cls, directory: Path): 71 | file_list = [] # A list for storing files existing in directories 72 | 73 | for x in directory.iterdir(): 74 | if x.is_file(): 75 | file_list.append(str(x)) 76 | else: 77 | file_list.append(cls.searching_all_files(x)) 78 | 79 | return file_list 80 | 81 | 82 | # The flask app for serving predictions 83 | app = flask.Flask(__name__) 84 | 85 | 86 | @app.route("/ping", methods=["GET"]) 87 | def ping(): 88 | """Determine if the container is working and healthy. In this sample container, we declare 89 | it healthy if we can load the model successfully.""" 90 | health = ( 91 | ScoringService.get_predictor_model() is not None 92 | ) # You can insert a health check here 93 | 94 | status = 200 if health else 404 95 | return flask.Response(response="\n", status=status, mimetype="application/json") 96 | 97 | 98 | # @app.route("/execution-parameters", method=["GET"]) 99 | # def get_execution_parameters(): 100 | # params = { 101 | # "MaxConcurrentTransforms": 3, 102 | # "BatchStrategy": "MULTI_RECORD", 103 | # "MaxPayloadInMB": 6, 104 | # } 105 | # return flask.Response( 106 | # response=json.dumps(params), status="200", mimetype="application/json" 107 | # ) 108 | 109 | 110 | @app.route("/invocations", methods=["POST"]) 111 | def transformation(): 112 | """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert 113 | it to a pandas data frame for internal use and then convert the predictions back to CSV (which really 114 | just means one prediction per line, since there's a single column. 
115 | """ 116 | data = None 117 | text = None 118 | 119 | if flask.request.content_type == "application/json": 120 | print("calling json launched") 121 | data = flask.request.get_json(silent=True) 122 | 123 | text = data["text"] 124 | try: 125 | bing_key = data["bing_key"] 126 | except Exception: 127 | bing_key = None 128 | 129 | else: 130 | return flask.Response( 131 | response="This predictor only supports JSON data", 132 | status=415, 133 | mimetype="text/plain", 134 | ) 135 | 136 | print("Invoked with text: {}.".format(text.encode("utf-8"))) 137 | 138 | # Do the prediction 139 | predictions = ScoringService.predict(text, bing_key) 140 | 141 | result = json.dumps(predictions[:10]) 142 | 143 | return flask.Response(response=result, status=200, mimetype="application/json") 144 | -------------------------------------------------------------------------------- /container_t5/t5/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | from __future__ import print_function 17 | import multiprocessing 18 | import os 19 | import signal 20 | import subprocess 21 | import sys 22 | 23 | cpu_count = multiprocessing.cpu_count() 24 | 25 | model_server_timeout = os.environ.get('MODEL_SERVER_TIMEOUT', 60) 26 | model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', cpu_count)) 27 | 28 | def sigterm_handler(nginx_pid, gunicorn_pid): 29 | try: 30 | os.kill(nginx_pid, signal.SIGQUIT) 31 | except OSError: 32 | pass 33 | try: 34 | os.kill(gunicorn_pid, signal.SIGTERM) 35 | except OSError: 36 | pass 37 | 38 | sys.exit(0) 39 | 40 | def start_server(): 41 | print('Starting the inference server with {} workers.'.format(model_server_workers)) 42 | 43 | 44 | # link the log streams to stdout/err so they will be logged to the container logs 45 | subprocess.check_call(['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log']) 46 | subprocess.check_call(['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log']) 47 | 48 | nginx = subprocess.Popen(['nginx', '-c', '/opt/ml/code/nginx.conf']) 49 | gunicorn = subprocess.Popen(['gunicorn', 50 | '--timeout', str(model_server_timeout), 51 | '-k', 'gevent', 52 | '-b', 'unix:/tmp/gunicorn.sock', 53 | '-w', str(model_server_workers), 54 | 'wsgi:app']) 55 | 56 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 57 | 58 | # If either subprocess exits, so do we. 59 | pids = set([nginx.pid, gunicorn.pid]) 60 | while True: 61 | pid, _ = os.wait() 62 | if pid in pids: 63 | break 64 | 65 | sigterm_handler(nginx.pid, gunicorn.pid) 66 | print('Inference server exiting') 67 | 68 | # The main routine just invokes the start function. 
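# For illustration (hypothetical invocation, assuming the usual SageMaker
# convention of running the image with the "serve" argument), the two
# environment variables documented above can be overridden at run time:
#
#   docker run -e MODEL_SERVER_WORKERS=2 -e MODEL_SERVER_TIMEOUT=120 \
#       fluent-fast-bert-t5:1.0-gpu-py36 serve
#
# With no overrides, gunicorn starts one worker per CPU core and times requests
# out after 60 seconds, as read from os.environ above.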
69 | 70 | if __name__ == '__main__': 71 | start_server() -------------------------------------------------------------------------------- /container_t5/t5/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 6 | 7 | app = myapp.app -------------------------------------------------------------------------------- /deploy_pip.sh: -------------------------------------------------------------------------------- 1 | rm -rf dist 2 | python3 setup.py sdist bdist_wheel 3 | twine upload dist/* -------------------------------------------------------------------------------- /fast_bert/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import BertForMultiLabelSequenceClassification 2 | 3 | # from .data import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features 4 | from .data_cls import ( 5 | BertDataBunch, 6 | InputExample, 7 | InputFeatures, 8 | MultiLabelTextProcessor, 9 | convert_examples_to_features, 10 | ) 11 | 12 | 13 | from .learner_cls import BertLearner 14 | 15 | 16 | # from .prediction import BertClassificationPredictor 17 | from .utils.spellcheck import BingSpellCheck 18 | 19 | 20 | 21 | from .onnx_helper import * 22 | -------------------------------------------------------------------------------- /fast_bert/bert_layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class BertLayerNorm(nn.Module): 6 | def __init__(self, hidden_size, eps=1e-12): 7 | """Construct a layernorm module in the TF style (epsilon inside the square root). 8 | """ 9 | super(BertLayerNorm, self).__init__() 10 | self.weight = nn.Parameter(torch.ones(hidden_size)) 11 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 12 | self.variance_epsilon = eps 13 | 14 | def forward(self, x): 15 | u = x.mean(-1, keepdim=True) 16 | s = (x - u).pow(2).mean(-1, keepdim=True) 17 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 18 | return self.weight * x + self.bias 19 | -------------------------------------------------------------------------------- /fast_bert/data_abs.py: -------------------------------------------------------------------------------- 1 | import re 2 | import html 3 | import logging 4 | import pandas as pd 5 | import os 6 | import random 7 | import torch 8 | from pathlib import Path 9 | import pickle 10 | import shutil 11 | from collections import deque, namedtuple 12 | from torch.utils.data import Dataset, DataLoader, SequentialSampler 13 | from tokenizers import BertWordPieceTokenizer 14 | from transformers import BertTokenizer 15 | 16 | Batch = namedtuple( 17 | "Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"] 18 | ) 19 | 20 | 21 | class SummarizationDataset(Dataset): 22 | """ Abstracts the dataset used to train seq2seq models. 23 | The class will process the documents that are located in the specified 24 | folder. The preprocessing will work on any document that is reasonably 25 | formatted. On the CNN/DailyMail dataset it will extract both the story 26 | and the summary. 27 | CNN/Daily News: 28 | The CNN/Daily News raw datasets are downloaded from [1]. 
The stories are 29 | stored in different files; the summary appears at the end of the story as 30 | sentences that are prefixed by the special `@highlight` line. To process 31 | the data, untar both datasets in the same folder, and pass the path to this 32 | folder as the "data_dir argument. The formatting code was inspired by [2]. 33 | [1] https://cs.nyu.edu/~kcho/ 34 | [2] https://github.com/abisee/cnn-dailymail/ 35 | """ 36 | 37 | def __init__(self, path="", prefix="train"): 38 | """ We initialize the class by listing all the documents to summarize. 39 | Files are not read in memory due to the size of some datasets (like CNN/DailyMail). 40 | """ 41 | assert os.path.isdir(path) 42 | 43 | self.documents = [] 44 | filenames_list = os.listdir(path) 45 | for filename in filenames_list: 46 | if "summary" in filename: 47 | continue 48 | path_to_text = os.path.join(path, filename) 49 | if not os.path.isfile(path_to_text): 50 | continue 51 | self.documents.append(path_to_text) 52 | 53 | def __len__(self): 54 | """ Returns the number of documents. """ 55 | return len(self.documents) 56 | 57 | def __getitem__(self, idx): 58 | document_path = self.documents[idx] 59 | document_name = document_path.split("/")[-1] 60 | with open(document_path, encoding="utf-8") as source: 61 | raw_doc = source.read() 62 | doc_lines = process_document(raw_doc) 63 | return document_name, doc_lines, [] 64 | 65 | 66 | class SummarizationInMemoryDataset(Dataset): 67 | """ Abstracts the dataset used to train seq2seq models. 68 | The class will process the documents that are located in the specified 69 | folder. The preprocessing will work on any document that is reasonably 70 | formatted. On the CNN/DailyMail dataset it will extract both the story 71 | and the summary. 72 | CNN/Daily News: 73 | The CNN/Daily News raw datasets are downloaded from [1]. The stories are 74 | stored in different files; the summary appears at the end of the story as 75 | sentences that are prefixed by the special `@highlight` line. To process 76 | the data, untar both datasets in the same folder, and pass the path to this 77 | folder as the "data_dir argument. The formatting code was inspired by [2]. 78 | [1] https://cs.nyu.edu/~kcho/ 79 | [2] https://github.com/abisee/cnn-dailymail/ 80 | """ 81 | 82 | def __init__(self, texts=[]): 83 | """ We initialize the class by listing all the documents to summarize. 84 | Files are not read in memory due to the size of some datasets (like CNN/DailyMail). 85 | """ 86 | self.documents = texts 87 | 88 | def __len__(self): 89 | """ Returns the number of documents. """ 90 | return len(self.documents) 91 | 92 | def __getitem__(self, idx): 93 | raw_doc = self.documents[idx] 94 | doc_lines = process_document(raw_doc) 95 | 96 | return None, doc_lines, [] 97 | 98 | 99 | def process_document(raw_doc): 100 | """ Extract the story and summary from a story file. 101 | Attributes: 102 | raw_story (str): content of the story file as an utf-8 encoded string. 103 | Raises: 104 | IndexError: If the stoy is empty or contains no highlights. 
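    Example (illustrative):
        process_document("The cat sat\n\nIt purred\n@highlight\nA cat story")
        returns ["The cat sat.", "It purred."]: everything up to the first
        `@highlight` line is kept, with missing end-of-sentence periods added.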
105 | """ 106 | nonempty_lines = list( 107 | filter(lambda x: len(x) != 0, [line.strip() for line in raw_doc.split("\n")]) 108 | ) 109 | 110 | # for some unknown reason some lines miss a period, add it 111 | nonempty_lines = [_add_missing_period(line) for line in nonempty_lines] 112 | 113 | # gather article lines 114 | doc_lines = [] 115 | lines = deque(nonempty_lines) 116 | while True: 117 | try: 118 | element = lines.popleft() 119 | if element.startswith("@highlight"): 120 | break 121 | doc_lines.append(element) 122 | except IndexError: 123 | # if "@highlight" is absent from the file we pop 124 | # all elements until there is None, raising an exception. 125 | return doc_lines 126 | 127 | return doc_lines 128 | 129 | 130 | def _add_missing_period(line): 131 | END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"] 132 | if line.startswith("@highlight"): 133 | return line 134 | if line[-1] in END_TOKENS: 135 | return line 136 | return line + "." 137 | 138 | 139 | # Abstractive databunch 140 | class BertAbsDataBunch(object): 141 | def __init__( 142 | self, 143 | tokenizer, 144 | device, 145 | data_dir=None, 146 | test_data=None, 147 | batch_size_per_gpu=16, 148 | max_seq_length=512, 149 | multi_gpu=True, 150 | multi_label=False, 151 | model_type="bert", 152 | logger=None, 153 | clear_cache=False, 154 | no_cache=False, 155 | ): 156 | 157 | # just in case someone passes string instead of Path 158 | if isinstance(data_dir, str): 159 | data_dir = Path(data_dir) 160 | 161 | if isinstance(tokenizer, str): 162 | # instantiate the new tokeniser object using the tokeniser name 163 | tokenizer = BertTokenizer.from_pretrained( 164 | "bert-base-uncased", do_lower_case=True 165 | ) 166 | self.tokenizer = tokenizer 167 | 168 | if type(self.tokenizer) == BertWordPieceTokenizer: 169 | 170 | self.tokenizer.cls_token_id = self.tokenizer.token_to_id("[CLS]") 171 | self.tokenizer.pad_token_id = self.tokenizer.token_to_id("[PAD]") 172 | 173 | self.max_seq_length = max_seq_length 174 | self.batch_size_per_gpu = batch_size_per_gpu 175 | self.device = device 176 | if data_dir: 177 | self.data_dir = data_dir 178 | self.cache_dir = data_dir / "lm_cache" 179 | # Create folder if it doesn't exist 180 | self.cache_dir.mkdir(exist_ok=True) 181 | self.no_cache = no_cache 182 | if clear_cache: 183 | shutil.rmtree(self.cache_dir, ignore_errors=True) 184 | else: 185 | self.no_cache = True 186 | self.data_dir = None 187 | 188 | self.model_type = model_type 189 | if logger is None: 190 | logger = logging.getLogger() 191 | self.logger = logger 192 | self.n_gpu = 1 193 | if multi_gpu: 194 | self.n_gpu = torch.cuda.device_count() 195 | 196 | # get dataset 197 | if self.data_dir: 198 | dataset = SummarizationDataset(self.data_dir) 199 | elif test_data: 200 | dataset = SummarizationInMemoryDataset(test_data) 201 | else: 202 | dataset = None 203 | 204 | if dataset: 205 | sampler = SequentialSampler(dataset) 206 | 207 | collate_fn = lambda data: collate( 208 | data, self.tokenizer, block_size=self.max_seq_length, device=self.device 209 | ) 210 | 211 | self.test_dl = DataLoader( 212 | dataset, 213 | sampler=sampler, 214 | batch_size=self.batch_size_per_gpu, 215 | collate_fn=collate_fn, 216 | ) 217 | else: 218 | self.test_dl = None 219 | 220 | def get_dl_from_texts(self, texts): 221 | 222 | dataset = SummarizationInMemoryDataset(texts) 223 | 224 | sampler = SequentialSampler(dataset) 225 | 226 | collate_fn = lambda data: collate( 227 | data, self.tokenizer, block_size=self.max_seq_length, device=self.device 228 | 
) 229 | return DataLoader( 230 | dataset, 231 | sampler=sampler, 232 | batch_size=self.batch_size_per_gpu, 233 | collate_fn=collate_fn, 234 | ) 235 | 236 | 237 | def collate(data, tokenizer, block_size, device): 238 | """ Collate formats the data passed to the data loader. 239 | In particular we tokenize the data batch after batch to avoid keeping them 240 | all in memory. We output the data as a namedtuple to fit the original BertAbs's 241 | API. 242 | """ 243 | data = [x for x in data if not len(x[1]) == 0] # remove empty_files 244 | names = [name for name, _, _ in data] 245 | summaries = [" ".join(summary_list) for _, _, summary_list in data] 246 | 247 | if type(tokenizer) == BertWordPieceTokenizer: 248 | encoded_text = [ 249 | encode_for_summarization_new_tokenizer(story, summary, tokenizer) 250 | for _, story, summary in data 251 | ] 252 | else: 253 | encoded_text = [ 254 | encode_for_summarization(story, summary, tokenizer) 255 | for _, story, summary in data 256 | ] 257 | encoded_stories = torch.tensor( 258 | [ 259 | fit_to_block_size(story, block_size, tokenizer.pad_token_id) 260 | for story, _ in encoded_text 261 | ] 262 | ) 263 | encoder_token_type_ids = compute_token_type_ids( 264 | encoded_stories, tokenizer.cls_token_id 265 | ) 266 | encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id) 267 | 268 | batch = Batch( 269 | document_names=names, 270 | batch_size=len(encoded_stories), 271 | src=encoded_stories.to(device), 272 | segs=encoder_token_type_ids.to(device), 273 | mask_src=encoder_mask.to(device), 274 | tgt_str=summaries, 275 | ) 276 | 277 | return batch 278 | 279 | 280 | def encode_for_summarization(story_lines, summary_lines, tokenizer): 281 | """ Encode the story and summary lines, and join them 282 | as specified in [1] by using `[SEP] [CLS]` tokens to separate 283 | sentences. 284 | """ 285 | story_lines_token_ids = [tokenizer.encode(line) for line in story_lines] 286 | story_token_ids = [ 287 | token for sentence in story_lines_token_ids for token in sentence 288 | ] 289 | summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines] 290 | summary_token_ids = [ 291 | token for sentence in summary_lines_token_ids for token in sentence 292 | ] 293 | 294 | return story_token_ids, summary_token_ids 295 | 296 | 297 | def encode_for_summarization_new_tokenizer(story_lines, summary_lines, tokenizer): 298 | """ Encode the story and summary lines, and join them 299 | as specified in [1] by using `[SEP] [CLS]` tokens to separate 300 | sentences. 301 | """ 302 | story_lines_token_ids = [tokenizer.encode(line).ids for line in story_lines] 303 | story_token_ids = [ 304 | token for sentence in story_lines_token_ids for token in sentence 305 | ] 306 | summary_lines_token_ids = [tokenizer.encode(line).ids for line in summary_lines] 307 | summary_token_ids = [ 308 | token for sentence in summary_lines_token_ids for token in sentence 309 | ] 310 | 311 | return story_token_ids, summary_token_ids 312 | 313 | 314 | def fit_to_block_size(sequence, block_size, pad_token_id): 315 | """ Adapt the source and target sequences' lengths to the block size. 316 | If the sequence is shorter we append padding token to the right of the sequence. 317 | """ 318 | if len(sequence) > block_size: 319 | return sequence[:block_size] 320 | else: 321 | sequence.extend([pad_token_id] * (block_size - len(sequence))) 322 | return sequence 323 | 324 | 325 | def build_mask(sequence, pad_token_id): 326 | """ Builds the mask. 
The attention mechanism will only attend to positions 327 | with value 1. """ 328 | mask = torch.ones_like(sequence) 329 | idx_pad_tokens = sequence == pad_token_id 330 | mask[idx_pad_tokens] = 0 331 | return mask 332 | 333 | 334 | def compute_token_type_ids(batch, separator_token_id): 335 | """ Segment embeddings as described in [1] 336 | The values {0,1} were found in the repository [2]. 337 | Attributes: 338 | batch: torch.Tensor, size [batch_size, block_size] 339 | Batch of input. 340 | separator_token_id: int 341 | The value of the token that separates the segments. 342 | [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders." 343 | arXiv preprint arXiv:1908.08345 (2019). 344 | [2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217) 345 | """ 346 | batch_embeddings = [] 347 | for sequence in batch: 348 | sentence_num = -1 349 | embeddings = [] 350 | for s in sequence: 351 | if s == separator_token_id: 352 | sentence_num += 1 353 | embeddings.append(sentence_num % 2) 354 | batch_embeddings.append(embeddings) 355 | return torch.tensor(batch_embeddings) 356 | -------------------------------------------------------------------------------- /fast_bert/data_lm.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | import re 3 | import html 4 | import logging 5 | import pandas as pd 6 | import os 7 | import random 8 | import torch 9 | from pathlib import Path 10 | import pickle 11 | import shutil 12 | import itertools 13 | import more_itertools 14 | 15 | 16 | from torch.utils.data import ( 17 | TensorDataset, 18 | DataLoader, 19 | RandomSampler, 20 | SequentialSampler, 21 | Dataset, 22 | ) 23 | from torch.utils.data.distributed import DistributedSampler 24 | 25 | from tqdm import tqdm, trange 26 | from fastprogress.fastprogress import master_bar, progress_bar 27 | 28 | from transformers import ( 29 | WEIGHTS_NAME, 30 | BertConfig, 31 | BertForSequenceClassification, 32 | BertTokenizer, 33 | XLMConfig, 34 | XLMForSequenceClassification, 35 | XLMTokenizer, 36 | XLNetConfig, 37 | XLNetForSequenceClassification, 38 | XLNetTokenizer, 39 | RobertaConfig, 40 | RobertaForSequenceClassification, 41 | RobertaTokenizer, 42 | DistilBertConfig, 43 | DistilBertForSequenceClassification, 44 | DistilBertTokenizer, 45 | CamembertConfig, 46 | CamembertForSequenceClassification, 47 | CamembertTokenizer, 48 | ElectraConfig, 49 | ElectraForSequenceClassification, 50 | ElectraTokenizer, 51 | ) 52 | 53 | MODEL_CLASSES = { 54 | "bert": (BertConfig, BertForSequenceClassification, BertTokenizer), 55 | "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 56 | "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer), 57 | "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), 58 | "distilbert": ( 59 | DistilBertConfig, 60 | DistilBertForSequenceClassification, 61 | DistilBertTokenizer, 62 | ), 63 | "camembert-base": ( 64 | CamembertConfig, 65 | CamembertForSequenceClassification, 66 | CamembertTokenizer, 67 | ), 68 | "electra": (ElectraConfig, ElectraForSequenceClassification, ElectraTokenizer), 69 | } 70 | 71 | # Create text corpus suitable for language model training 72 | 73 | 74 | def create_corpus(text_list, target_path, logger=None): 75 | 76 | with open(target_path, "w") as f: 77 | # Split sentences for each document 78 | logger.info("Formatting corpus for {}".format(target_path)) 79 | for text in progress_bar(text_list): 
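            # Note: the helpers below (defined further down in this module) fix
            # common HTML escape artefacts, collapse repeated newlines and
            # repeated spaces, and pad "/", "#" and newlines with spaces so they
            # survive tokenisation; each cleaned document is then written out as
            # a single line of the corpus file.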
80 | 81 | text = fix_html(text) 82 | text = replace_multi_newline(text) 83 | text = spec_add_spaces(text) 84 | text = rm_useless_spaces(text) 85 | text = text.strip() 86 | 87 | f.write(text + "\n") 88 | 89 | 90 | # text_lines = [re.sub(r"\n(\s)*","",str(sent)) for i, sent in enumerate(nlp(str(text)).sents)] 91 | # text_lines = [text_line for text_line in text_lines if re.search(r'[a-zA-Z]', text_line)] 92 | 93 | # f.write('\n'.join(text_lines)) 94 | # f.write("\n \n") 95 | 96 | 97 | def spec_add_spaces(t: str) -> str: 98 | "Add spaces around / and # in `t`. \n" 99 | return re.sub(r"([/#\n])", r" \1 ", t) 100 | 101 | 102 | def rm_useless_spaces(t: str) -> str: 103 | "Remove multiple spaces in `t`." 104 | return re.sub(" {2,}", " ", t) 105 | 106 | 107 | def replace_multi_newline(t: str) -> str: 108 | return re.sub(r"(\n(\s)*){2,}", "\n", t) 109 | 110 | 111 | def fix_html(x: str) -> str: 112 | "List of replacements from html strings in `x`." 113 | re1 = re.compile(r" +") 114 | x = ( 115 | x.replace("#39;", "'") 116 | .replace("amp;", "&") 117 | .replace("#146;", "'") 118 | .replace("nbsp;", " ") 119 | .replace("#36;", "$") 120 | .replace("\\n", "\n") 121 | .replace("quot;", "'") 122 | .replace("
", "\n") 123 | .replace('\\"', '"') 124 | .replace(" @.@ ", ".") 125 | .replace(" @-@ ", "-") 126 | .replace(" @,@ ", ",") 127 | .replace("\\", " \\ ") 128 | ) 129 | return re1.sub(" ", html.unescape(x)) 130 | 131 | 132 | class TextDataset(Dataset): 133 | def __init__(self, tokenizer, file_path, cache_path, logger, block_size=512): 134 | assert os.path.isfile(file_path) 135 | 136 | if os.path.exists(cache_path): 137 | logger.info("Loading features from cached file %s", cache_path) 138 | with open(cache_path, "rb") as handle: 139 | self.examples = pickle.load(handle) 140 | else: 141 | logger.info("Creating features from dataset file %s", file_path) 142 | 143 | self.examples = [] 144 | text = (line.strip() for line in open(file_path, encoding="utf-8")) 145 | text = progress_bar(list(text)) 146 | text = map( 147 | lambda x: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)), text 148 | ) 149 | text = itertools.chain.from_iterable(text) 150 | text = more_itertools.chunked(text, block_size) 151 | self.examples = list(text)[:-1] 152 | # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) 153 | # If your dataset is small, first you should loook for a bigger one :-) and second you 154 | # can change this behavior by adding (model specific) padding. 155 | 156 | logger.info("Saving features into cached file %s", cache_path) 157 | with open(cache_path, "wb") as handle: 158 | pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) 159 | 160 | def __len__(self): 161 | return len(self.examples) 162 | 163 | def __getitem__(self, item): 164 | return torch.tensor(self.examples[item]) 165 | 166 | 167 | # DataBunch object for language models 168 | class BertLMDataBunch(object): 169 | @staticmethod 170 | def from_raw_corpus( 171 | data_dir, 172 | text_list, 173 | tokenizer, 174 | batch_size_per_gpu=32, 175 | max_seq_length=512, 176 | multi_gpu=True, 177 | test_size=0.1, 178 | model_type="bert", 179 | logger=None, 180 | clear_cache=False, 181 | no_cache=False, 182 | ): 183 | 184 | train_file = "lm_train.txt" 185 | val_file = "lm_val.txt" 186 | 187 | train_list, val_list = train_test_split( 188 | text_list, test_size=test_size, shuffle=True 189 | ) 190 | # Create train corpus 191 | create_corpus(train_list, str(data_dir / train_file), logger=logger) 192 | 193 | # Create val corpus 194 | create_corpus(val_list, str(data_dir / val_file), logger=logger) 195 | 196 | return BertLMDataBunch( 197 | data_dir, 198 | tokenizer, 199 | train_file=train_file, 200 | val_file=val_file, 201 | batch_size_per_gpu=batch_size_per_gpu, 202 | max_seq_length=max_seq_length, 203 | multi_gpu=multi_gpu, 204 | model_type=model_type, 205 | logger=logger, 206 | clear_cache=clear_cache, 207 | no_cache=no_cache, 208 | ) 209 | 210 | def __init__( 211 | self, 212 | data_dir, 213 | tokenizer, 214 | train_file="lm_train.txt", 215 | val_file="lm_val.txt", 216 | batch_size_per_gpu=32, 217 | max_seq_length=512, 218 | multi_gpu=True, 219 | model_type="bert", 220 | logger=None, 221 | clear_cache=False, 222 | no_cache=False, 223 | ): 224 | 225 | # just in case someone passes string instead of Path 226 | if isinstance(data_dir, str): 227 | data_dir = Path(data_dir) 228 | 229 | # Instantiate correct tokenizer if the tokenizer name is passed instead of object 230 | if isinstance(tokenizer, str): 231 | _, _, tokenizer_class = MODEL_CLASSES[model_type] 232 | # instantiate the new tokeniser object using the tokeniser name 233 | tokenizer = tokenizer_class.from_pretrained( 234 | tokenizer, 
do_lower_case=("uncased" in tokenizer) 235 | ) 236 | 237 | # Bug workaround for RoBERTa 238 | if model_type == "roberta": 239 | tokenizer.max_len_single_sentence = tokenizer.max_len - 2 240 | 241 | self.tokenizer = tokenizer 242 | self.max_seq_length = max_seq_length 243 | self.batch_size_per_gpu = batch_size_per_gpu 244 | self.train_dl = None 245 | self.val_dl = None 246 | self.data_dir = data_dir 247 | self.cache_dir = data_dir / "lm_cache" 248 | self.no_cache = no_cache 249 | self.model_type = model_type 250 | if logger is None: 251 | logger = logging.getLogger() 252 | self.logger = logger 253 | self.n_gpu = 1 254 | if multi_gpu: 255 | self.n_gpu = torch.cuda.device_count() 256 | 257 | if clear_cache: 258 | shutil.rmtree(self.cache_dir, ignore_errors=True) 259 | 260 | # Create folder if it doesn't exist 261 | self.cache_dir.mkdir(exist_ok=True) 262 | 263 | if train_file: 264 | # Train DataLoader 265 | # train_examples = None 266 | cached_features_file = os.path.join( 267 | self.cache_dir, 268 | "cached_{}_{}_{}".format( 269 | self.model_type, "train", str(self.max_seq_length) 270 | ), 271 | ) 272 | 273 | train_filepath = str(self.data_dir / train_file) 274 | train_dataset = TextDataset( 275 | self.tokenizer, 276 | train_filepath, 277 | cached_features_file, 278 | self.logger, 279 | block_size=self.tokenizer.max_len_single_sentence, 280 | ) 281 | 282 | self.train_batch_size = self.batch_size_per_gpu * max(1, self.n_gpu) 283 | 284 | train_sampler = RandomSampler(train_dataset) 285 | self.train_dl = DataLoader( 286 | train_dataset, sampler=train_sampler, batch_size=self.train_batch_size 287 | ) 288 | 289 | if val_file: 290 | # Val DataLoader 291 | # val_examples = None 292 | cached_features_file = os.path.join( 293 | self.cache_dir, 294 | "cached_{}_{}_{}".format( 295 | self.model_type, "dev", str(self.max_seq_length) 296 | ), 297 | ) 298 | 299 | val_filepath = str(self.data_dir / val_file) 300 | val_dataset = TextDataset( 301 | self.tokenizer, 302 | val_filepath, 303 | cached_features_file, 304 | self.logger, 305 | block_size=self.tokenizer.max_len_single_sentence, 306 | ) 307 | 308 | self.val_batch_size = self.batch_size_per_gpu * 2 * max(1, self.n_gpu) 309 | 310 | val_sampler = RandomSampler(val_dataset) 311 | self.val_dl = DataLoader( 312 | val_dataset, sampler=val_sampler, batch_size=self.val_batch_size 313 | ) 314 | 315 | # Mask tokens 316 | 317 | def mask_tokens(self, inputs, mlm_probability=0.15): 318 | """Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.""" 319 | labels = inputs.clone() 320 | # We sample a few tokens in each sequence for masked-LM training (with probability mlm_probability defaults to 0.15 in Bert/RoBERTa) 321 | 322 | masked_indices = torch.bernoulli( 323 | torch.full(labels.shape, mlm_probability) 324 | ).bool() 325 | # do not mask special tokens 326 | masked_indices[:, 0] = False 327 | masked_indices[:, -1] = False 328 | 329 | labels[~masked_indices] = -100 # We only compute loss on masked tokens 330 | 331 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 332 | indices_replaced = ( 333 | torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices 334 | ) 335 | inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids( 336 | self.tokenizer.mask_token 337 | ) 338 | 339 | # 10% of the time, we replace masked input tokens with random word 340 | indices_random = ( 341 | torch.bernoulli(torch.full(labels.shape, 0.5)).bool() 342 | & masked_indices 343 | & 
~indices_replaced 344 | ) 345 | random_words = torch.randint( 346 | len(self.tokenizer), labels.shape, dtype=torch.long 347 | ) 348 | inputs[indices_random] = random_words[indices_random] 349 | 350 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 351 | return inputs, labels 352 | 353 | def save(self, filename="databunch.pkl"): 354 | tmp_path = self.data_dir / "tmp" 355 | tmp_path.mkdir(exist_ok=True) 356 | with open(str(tmp_path / filename), "wb") as f: 357 | pickle.dump(self, f) 358 | -------------------------------------------------------------------------------- /fast_bert/learner_abs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .data_abs import BertAbsDataBunch 3 | from .learner_util import Learner 4 | from torch import nn 5 | from typing import List 6 | import torch 7 | from box import Box 8 | from tokenizers import BertWordPieceTokenizer 9 | 10 | from .summarisation import BertAbs, build_predictor 11 | from .summarisation import BertAbsConfig 12 | from fastprogress.fastprogress import master_bar, progress_bar 13 | import numpy as np 14 | import pandas as pd 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | MODEL_CLASSES = {"bert": (BertAbsConfig, BertAbs)} 21 | 22 | 23 | class BertAbsLearner(Learner): 24 | @staticmethod 25 | def from_pretrained_model( 26 | databunch, 27 | pretrained_path, 28 | device, 29 | logger, 30 | metrics=None, 31 | finetuned_wgts_path=None, 32 | multi_gpu=True, 33 | is_fp16=True, 34 | loss_scale=0, 35 | warmup_steps=0, 36 | fp16_opt_level="O1", 37 | grad_accumulation_steps=1, 38 | max_grad_norm=1.0, 39 | adam_epsilon=1e-8, 40 | logging_steps=100, 41 | alpha=0.95, 42 | beam_size=5, 43 | min_length=50, 44 | max_length=200, 45 | block_trigram=True, 46 | ): 47 | 48 | model_state_dict = None 49 | 50 | model_type = databunch.model_type 51 | 52 | config_class, model_class = MODEL_CLASSES[model_type] 53 | 54 | if torch.cuda.is_available(): 55 | map_location = lambda storage, loc: storage.cuda() 56 | else: 57 | map_location = 'cpu' 58 | 59 | if finetuned_wgts_path: 60 | model_state_dict = torch.load(finetuned_wgts_path, map_location=map_location) 61 | else: 62 | model_state_dict = None 63 | 64 | model = model_class.from_pretrained( 65 | str(pretrained_path), state_dict=model_state_dict 66 | ) 67 | 68 | model.to(device) 69 | 70 | return BertAbsLearner( 71 | databunch, 72 | model, 73 | str(pretrained_path), 74 | device, 75 | logger, 76 | metrics, 77 | multi_gpu, 78 | is_fp16, 79 | loss_scale, 80 | warmup_steps, 81 | fp16_opt_level, 82 | grad_accumulation_steps, 83 | max_grad_norm, 84 | adam_epsilon, 85 | logging_steps, 86 | alpha, 87 | beam_size, 88 | min_length, 89 | max_length, 90 | block_trigram, 91 | ) 92 | 93 | def __init__( 94 | self, 95 | data: BertAbsDataBunch, 96 | model: nn.Module, 97 | pretrained_model_path, 98 | device, 99 | logger, 100 | metrics=None, 101 | multi_gpu=True, 102 | is_fp16=True, 103 | loss_scale=0, 104 | warmup_steps=0, 105 | fp16_opt_level="O1", 106 | grad_accumulation_steps=1, 107 | max_grad_norm=1.0, 108 | adam_epsilon=1e-8, 109 | logging_steps=100, 110 | alpha=0.95, 111 | beam_size=5, 112 | min_length=50, 113 | max_length=200, 114 | block_trigram=True, 115 | ): 116 | 117 | super(BertAbsLearner, self).__init__( 118 | data, 119 | model, 120 | pretrained_model_path, 121 | None, 122 | device, 123 | logger, 124 | multi_gpu, 125 | is_fp16, 126 | warmup_steps, 127 | fp16_opt_level, 128 | grad_accumulation_steps, 129 | max_grad_norm, 130 | 
adam_epsilon, 131 | logging_steps, 132 | ) 133 | 134 | # Classification specific attributes 135 | self.metrics = metrics 136 | 137 | # Summarisation specific features 138 | if type(self.data.tokenizer) == BertWordPieceTokenizer: 139 | symbols = { 140 | "BOS": self.data.tokenizer.token_to_id("[unused0]"), 141 | "EOS": self.data.tokenizer.token_to_id("[unused1]"), 142 | "PAD": self.data.tokenizer.token_to_id("[PAD]"), 143 | } 144 | else: 145 | symbols = { 146 | "BOS": self.data.tokenizer.vocab["[unused0]"], 147 | "EOS": self.data.tokenizer.vocab["[unused1]"], 148 | "PAD": self.data.tokenizer.vocab["[PAD]"], 149 | } 150 | 151 | self.predictor_args = Box( 152 | { 153 | "alpha": alpha, 154 | "beam_size": beam_size, 155 | "min_length": min_length, 156 | "max_length": max_length, 157 | "block_trigram": block_trigram, 158 | } 159 | ) 160 | 161 | # predictor object 162 | self.predictor = build_predictor( 163 | self.predictor_args, self.data.tokenizer, symbols, self.model 164 | ) 165 | 166 | ### Train the model ### 167 | def fit( 168 | self, 169 | epochs, 170 | lr, 171 | validate=True, 172 | schedule_type="warmup_cosine", 173 | optimizer_type="lamb", 174 | ): 175 | self.logger.info( 176 | "Irony...fit is not implmented yet. This is a pretrained-only inference model" 177 | ) 178 | 179 | ### Evaluate the model 180 | def validate(self): 181 | self.logger.info( 182 | "Irony...fit is not implmented yet. This is a pretrained-only inference model" 183 | ) 184 | 185 | ### Return Predictions ### 186 | def predict_batch(self, texts=None): 187 | 188 | if texts: 189 | dl = self.data.get_dl_from_texts(texts) 190 | else: 191 | dl = self.data.test_dl 192 | 193 | all_summaries = [] 194 | 195 | self.model.eval() 196 | for step, batch in enumerate(dl): 197 | # batch = tuple(t.to(self.device) for t in batch) 198 | 199 | batch_data = self.predictor.translate_batch(batch) 200 | translations = self.predictor.from_batch(batch_data) 201 | 202 | summaries = [format_summary(t) for t in translations] 203 | all_summaries.extend(summaries) 204 | 205 | return all_summaries 206 | 207 | 208 | def format_summary(translation): 209 | """ Transforms the output of the `from_batch` function 210 | into nicely formatted summaries. 211 | """ 212 | raw_summary, _, _ = translation 213 | summary = ( 214 | raw_summary.replace("[unused0]", "") 215 | .replace("[unused3]", "") 216 | .replace("[PAD]", "") 217 | .replace("[unused1]", "") 218 | .replace(r" +", " ") 219 | .replace(" [unused2] ", ". 
") 220 | .replace("[unused2]", "") 221 | .strip() 222 | ) 223 | 224 | return summary 225 | -------------------------------------------------------------------------------- /fast_bert/learner_lm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from packaging import version 4 | from pathlib import Path 5 | import numpy as np 6 | 7 | from fastprogress.fastprogress import master_bar, progress_bar 8 | from tensorboardX import SummaryWriter 9 | 10 | from .learner_util import Learner 11 | 12 | from .data_lm import BertLMDataBunch 13 | 14 | from transformers import ( 15 | WEIGHTS_NAME, 16 | BertConfig, 17 | BertForMaskedLM, 18 | RobertaConfig, 19 | RobertaForMaskedLM, 20 | DistilBertConfig, 21 | DistilBertForMaskedLM, 22 | CamembertConfig, 23 | CamembertForMaskedLM, 24 | ElectraConfig, 25 | ElectraForMaskedLM, 26 | ) 27 | 28 | from torch.optim.lr_scheduler import _LRScheduler, Optimizer 29 | 30 | MODEL_CLASSES = { 31 | "bert": (BertConfig, BertForMaskedLM), 32 | "roberta": (RobertaConfig, RobertaForMaskedLM), 33 | "distilbert": (DistilBertConfig, DistilBertForMaskedLM), 34 | "camembert-base": (CamembertConfig, CamembertForMaskedLM), 35 | "electra": (ElectraConfig, ElectraForMaskedLM), 36 | } 37 | 38 | if version.parse(torch.__version__) >= version.parse("1.6"): 39 | IS_AMP_AVAILABLE = True 40 | from torch.cuda.amp import autocast 41 | else: 42 | IS_AMP_AVAILABLE = False 43 | 44 | 45 | class BertLMLearner(Learner): 46 | @staticmethod 47 | def from_pretrained_model( 48 | dataBunch, 49 | pretrained_path, 50 | output_dir, 51 | metrics, 52 | device, 53 | logger, 54 | multi_gpu=True, 55 | is_fp16=True, 56 | warmup_steps=0, 57 | fp16_opt_level="O1", 58 | grad_accumulation_steps=1, 59 | max_grad_norm=1.0, 60 | adam_epsilon=1e-8, 61 | logging_steps=100, 62 | ): 63 | 64 | if is_fp16 and (IS_AMP_AVAILABLE is False): 65 | logger.debug("Apex not installed. 
switching off FP16 training") 66 | is_fp16 = False 67 | 68 | model_type = dataBunch.model_type 69 | 70 | config_class, model_class = MODEL_CLASSES[model_type] 71 | 72 | config = config_class.from_pretrained(pretrained_path) 73 | model = model_class.from_pretrained(pretrained_path, config=config) 74 | model.to(device) 75 | 76 | return BertLMLearner( 77 | dataBunch, 78 | model, 79 | pretrained_path, 80 | output_dir, 81 | metrics, 82 | device, 83 | logger, 84 | multi_gpu, 85 | is_fp16, 86 | warmup_steps, 87 | fp16_opt_level, 88 | grad_accumulation_steps, 89 | max_grad_norm, 90 | adam_epsilon, 91 | logging_steps, 92 | ) 93 | 94 | # Learner initialiser 95 | def __init__( 96 | self, 97 | data: BertLMDataBunch, 98 | model: torch.nn.Module, 99 | pretrained_model_path, 100 | output_dir, 101 | metrics, 102 | device, 103 | logger, 104 | multi_gpu=True, 105 | is_fp16=True, 106 | warmup_steps=0, 107 | fp16_opt_level="O1", 108 | grad_accumulation_steps=1, 109 | max_grad_norm=1.0, 110 | adam_epsilon=1e-8, 111 | logging_steps=100, 112 | ): 113 | 114 | if isinstance(output_dir, str): 115 | output_dir = Path(output_dir) 116 | 117 | self.data = data 118 | self.model = model 119 | self.pretrained_model_path = pretrained_model_path 120 | self.metrics = metrics 121 | self.multi_gpu = multi_gpu 122 | self.is_fp16 = is_fp16 123 | self.fp16_opt_level = fp16_opt_level 124 | self.adam_epsilon = adam_epsilon 125 | self.warmup_steps = warmup_steps 126 | self.grad_accumulation_steps = grad_accumulation_steps 127 | self.device = device 128 | self.logger = logger 129 | self.optimizer = None 130 | self.n_gpu = 0 131 | self.max_grad_norm = max_grad_norm 132 | self.logging_steps = logging_steps 133 | self.max_steps = -1 134 | self.weight_decay = 0.0 135 | self.model_type = data.model_type 136 | 137 | self.output_dir = output_dir 138 | 139 | self.scaler = torch.cuda.amp.GradScaler() if is_fp16 is True else None 140 | 141 | if self.multi_gpu: 142 | self.n_gpu = torch.cuda.device_count() 143 | 144 | ### Train the model ### 145 | def fit( 146 | self, 147 | epochs, 148 | lr, 149 | validate=True, 150 | schedule_type="warmup_cosine", 151 | optimizer_type="lamb", 152 | ): 153 | 154 | tensorboard_dir = self.output_dir / "tensorboard" 155 | tensorboard_dir.mkdir(exist_ok=True) 156 | 157 | # Train the model 158 | tb_writer = SummaryWriter(tensorboard_dir) 159 | 160 | train_dataloader = self.data.train_dl 161 | if self.max_steps > 0: 162 | t_total = self.max_steps 163 | self.epochs = ( 164 | self.max_steps // len(train_dataloader) // self.grad_accumulation_steps 165 | + 1 166 | ) 167 | else: 168 | t_total = len(train_dataloader) // self.grad_accumulation_steps * epochs 169 | 170 | # Prepare optimiser and schedule 171 | optimizer = self.get_optimizer(lr, optimizer_type=optimizer_type) 172 | 173 | # get the base model if its already wrapped around DataParallel 174 | if hasattr(self.model, "module"): 175 | self.model = self.model.module 176 | 177 | # Get scheduler 178 | scheduler = self.get_scheduler( 179 | optimizer, t_total=t_total, schedule_type=schedule_type 180 | ) 181 | 182 | # Parallelize the model architecture 183 | if self.multi_gpu is True: 184 | self.model = torch.nn.DataParallel(self.model) 185 | 186 | # Start Training 187 | self.logger.info("***** Running training *****") 188 | self.logger.info(" Num examples = %d", len(train_dataloader.dataset)) 189 | self.logger.info(" Num Epochs = %d", epochs) 190 | self.logger.info( 191 | " Total train batch size (w. 
parallel, distributed & accumulation) = %d", 192 | self.data.train_batch_size * self.grad_accumulation_steps, 193 | ) 194 | self.logger.info( 195 | " Gradient Accumulation steps = %d", self.grad_accumulation_steps 196 | ) 197 | self.logger.info(" Total optimization steps = %d", t_total) 198 | 199 | global_step = 0 200 | epoch_step = 0 201 | tr_loss, logging_loss, epoch_loss = 0.0, 0.0, 0.0 202 | self.model.zero_grad() 203 | pbar = master_bar(range(epochs)) 204 | 205 | for epoch in pbar: 206 | epoch_step = 0 207 | epoch_loss = 0.0 208 | for step, batch in enumerate(progress_bar(train_dataloader, parent=pbar)): 209 | inputs, labels = self.data.mask_tokens(batch) 210 | cpu_device = torch.device("cpu") 211 | loss = self.training_step(batch) 212 | 213 | tr_loss += loss.item() 214 | epoch_loss += loss.item() 215 | 216 | batch.to(cpu_device) 217 | inputs.to(cpu_device) 218 | labels.to(cpu_device) 219 | torch.cuda.empty_cache() 220 | 221 | if (step + 1) % self.grad_accumulation_steps == 0: 222 | # gradient clipping 223 | torch.nn.utils.clip_grad_norm_( 224 | self.model.parameters(), self.max_grad_norm 225 | ) 226 | 227 | if self.is_fp16: 228 | # AMP: gradients need unscaling 229 | self.scaler.unscale_(optimizer) 230 | 231 | if self.is_fp16: 232 | self.scaler.step(optimizer) 233 | self.scaler.update() 234 | else: 235 | optimizer.step() 236 | scheduler.step() 237 | 238 | self.model.zero_grad() 239 | global_step += 1 240 | epoch_step += 1 241 | 242 | if self.logging_steps > 0 and global_step % self.logging_steps == 0: 243 | if validate: 244 | # evaluate model 245 | results = self.validate() 246 | for key, value in results.items(): 247 | tb_writer.add_scalar( 248 | "eval_{}".format(key), value, global_step 249 | ) 250 | self.logger.info( 251 | "eval_{} after step {}: {}: ".format( 252 | key, global_step, value 253 | ) 254 | ) 255 | 256 | # Log metrics 257 | self.logger.info( 258 | "lr after step {}: {}".format( 259 | global_step, scheduler.get_lr()[0] 260 | ) 261 | ) 262 | self.logger.info( 263 | "train_loss after step {}: {}".format( 264 | global_step, 265 | (tr_loss - logging_loss) / self.logging_steps, 266 | ) 267 | ) 268 | tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) 269 | tb_writer.add_scalar( 270 | "loss", 271 | (tr_loss - logging_loss) / self.logging_steps, 272 | global_step, 273 | ) 274 | 275 | logging_loss = tr_loss 276 | 277 | # Evaluate the model after every epoch 278 | if validate: 279 | results = self.validate() 280 | for key, value in results.items(): 281 | self.logger.info( 282 | "eval_{} after epoch {}: {}: ".format(key, (epoch + 1), value) 283 | ) 284 | 285 | # Log metrics 286 | self.logger.info( 287 | "lr after epoch {}: {}".format((epoch + 1), scheduler.get_lr()[0]) 288 | ) 289 | self.logger.info( 290 | "train_loss after epoch {}: {}".format( 291 | (epoch + 1), epoch_loss / epoch_step 292 | ) 293 | ) 294 | self.logger.info("\n") 295 | 296 | tb_writer.close() 297 | return global_step, tr_loss / global_step 298 | 299 | ### Training step 300 | def training_step(self, batch): 301 | inputs, labels = self.data.mask_tokens(batch) 302 | 303 | inputs = inputs.to(self.device) 304 | labels = labels.to(self.device) 305 | 306 | self.model.train() 307 | 308 | if self.is_fp16: 309 | with autocast(): 310 | outputs = self.model(inputs, masked_lm_labels=labels) 311 | else: 312 | outputs = self.model(inputs, masked_lm_labels=labels) 313 | 314 | loss = outputs[0] 315 | 316 | if self.n_gpu > 1: 317 | loss = loss.mean() 318 | if self.grad_accumulation_steps > 1: 319 | loss = loss / 
self.grad_accumulation_steps 320 | 321 | if self.is_fp16: 322 | self.scaler.scale(loss).backward() 323 | else: 324 | loss.backward() 325 | 326 | return loss 327 | 328 | ### Evaluate the model 329 | def validate(self): 330 | self.logger.info("Running evaluation") 331 | 332 | self.logger.info("Num examples = %d", len(self.data.val_dl.dataset)) 333 | self.logger.info("Validation Batch size = %d", self.data.val_batch_size) 334 | 335 | eval_loss = 0 336 | nb_eval_steps = 0 337 | 338 | validation_scores = {metric["name"]: 0.0 for metric in self.metrics} 339 | 340 | for step, batch in enumerate(progress_bar(self.data.val_dl)): 341 | self.model.eval() 342 | batch = batch.to(self.device) 343 | 344 | with torch.no_grad(): 345 | outputs = self.model(batch, masked_lm_labels=batch) 346 | tmp_eval_loss = outputs[0] 347 | eval_loss += tmp_eval_loss.mean().item() 348 | 349 | cpu_device = torch.device("cpu") 350 | batch.to(cpu_device) 351 | torch.cuda.empty_cache() 352 | 353 | nb_eval_steps += 1 354 | 355 | eval_loss = eval_loss / nb_eval_steps 356 | perplexity = torch.exp(torch.tensor(eval_loss)) 357 | 358 | results = {"loss": eval_loss, "perplexity": float(perplexity)} 359 | 360 | results.update(validation_scores) 361 | 362 | return results 363 | 364 | def save_model(self, path=None): 365 | 366 | if not path: 367 | path = self.output_dir / "model_out" 368 | 369 | path.mkdir(exist_ok=True) 370 | 371 | torch.cuda.empty_cache() 372 | # Save a trained model 373 | model_to_save = ( 374 | self.model.module if hasattr(self.model, "module") else self.model 375 | ) # Only save the model it-self 376 | model_to_save.save_pretrained(path) 377 | 378 | # save the tokenizer 379 | self.data.tokenizer.save_pretrained(path) 380 | -------------------------------------------------------------------------------- /fast_bert/learner_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | import logging 4 | 5 | from transformers import ( 6 | AdamW, 7 | get_constant_schedule, 8 | get_constant_schedule_with_warmup, 9 | get_linear_schedule_with_warmup, 10 | get_cosine_schedule_with_warmup, 11 | get_cosine_with_hard_restarts_schedule_with_warmup, 12 | ) 13 | 14 | from pytorch_lamb import Lamb 15 | 16 | 17 | class Learner(object): 18 | def __init__( 19 | self, 20 | data, 21 | model, 22 | pretrained_model_path, 23 | output_dir, 24 | device, 25 | logger=logging.getLogger(__name__), 26 | multi_gpu=True, 27 | is_fp16=True, 28 | warmup_steps=0, 29 | fp16_opt_level="O1", 30 | grad_accumulation_steps=1, 31 | max_grad_norm=1.0, 32 | adam_epsilon=1e-8, 33 | logging_steps=100, 34 | ): 35 | 36 | if isinstance(output_dir, str): 37 | output_dir = Path(output_dir) 38 | 39 | self.data = data 40 | self.model = model 41 | self.pretrained_model_path = pretrained_model_path 42 | self.multi_gpu = multi_gpu 43 | self.is_fp16 = is_fp16 44 | self.fp16_opt_level = fp16_opt_level 45 | self.adam_epsilon = adam_epsilon 46 | self.warmup_steps = warmup_steps 47 | self.grad_accumulation_steps = grad_accumulation_steps 48 | self.device = device 49 | self.logger = logger 50 | self.layer_groups = None 51 | self.optimizer = None 52 | self.n_gpu = 0 53 | self.max_grad_norm = max_grad_norm 54 | self.logging_steps = logging_steps 55 | self.max_steps = -1 56 | self.weight_decay = 0.0 57 | self.model_type = data.model_type 58 | 59 | self.output_dir = output_dir 60 | 61 | if self.multi_gpu: 62 | self.n_gpu = torch.cuda.device_count() 63 | 64 | # Get the optimiser object 65 | def 
get_optimizer(self, lr, optimizer_type="lamb"): 66 | 67 | # Prepare optimiser and schedule 68 | no_decay = ["bias", "LayerNorm.weight"] 69 | 70 | optimizer_grouped_parameters = [ 71 | { 72 | "params": [ 73 | p 74 | for n, p in self.model.named_parameters() 75 | if not any(nd in n for nd in no_decay) 76 | ], 77 | "weight_decay": self.weight_decay, 78 | }, 79 | { 80 | "params": [ 81 | p 82 | for n, p in self.model.named_parameters() 83 | if any(nd in n for nd in no_decay) 84 | ], 85 | "weight_decay": 0.0, 86 | }, 87 | ] 88 | 89 | if optimizer_type == "lamb": 90 | optimizer = Lamb( 91 | optimizer_grouped_parameters, weight_decay=0.1, lr=lr, eps=1e-12 92 | ) 93 | elif optimizer_type == "adamw": 94 | optimizer = AdamW( 95 | optimizer_grouped_parameters, lr=lr, eps=self.adam_epsilon 96 | ) 97 | 98 | return optimizer 99 | 100 | # Get learning rate scheduler 101 | def get_scheduler(self, optimizer, t_total, schedule_type="warmup_cosine"): 102 | 103 | SCHEDULES = { 104 | None: get_constant_schedule, 105 | "none": get_constant_schedule, 106 | "warmup_cosine": get_cosine_schedule_with_warmup, 107 | "warmup_constant": get_constant_schedule_with_warmup, 108 | "warmup_linear": get_linear_schedule_with_warmup, 109 | "warmup_cosine_hard_restarts": get_cosine_with_hard_restarts_schedule_with_warmup, 110 | } 111 | 112 | if schedule_type is None or schedule_type == "none": 113 | return SCHEDULES[schedule_type](optimizer) 114 | 115 | elif schedule_type == "warmup_constant": 116 | return SCHEDULES[schedule_type]( 117 | optimizer, num_warmup_steps=self.warmup_steps 118 | ) 119 | 120 | else: 121 | return SCHEDULES[schedule_type]( 122 | optimizer, 123 | num_warmup_steps=self.warmup_steps, 124 | num_training_steps=t_total, 125 | ) 126 | 127 | def save_model(self, path=None): 128 | 129 | if not path: 130 | path = self.output_dir / "model_out" 131 | 132 | path.mkdir(exist_ok=True) 133 | 134 | # Convert path to str for save_pretrained calls 135 | path = str(path) 136 | 137 | torch.cuda.empty_cache() 138 | # Save a trained model 139 | model_to_save = ( 140 | self.model.module if hasattr(self.model, "module") else self.model 141 | ) # Only save the model it-self 142 | model_to_save.save_pretrained(path) 143 | 144 | # save the tokenizer 145 | self.data.tokenizer.save_pretrained(path) 146 | -------------------------------------------------------------------------------- /fast_bert/metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import ( 2 | roc_curve, 3 | auc, 4 | hamming_loss, 5 | accuracy_score, 6 | confusion_matrix as sklearn_confusion_matrix, 7 | ) 8 | import numpy as np 9 | from torch import Tensor 10 | 11 | import pdb 12 | import logging 13 | 14 | logger = logging.getLogger() 15 | 16 | CLASSIFICATION_THRESHOLD: float = 0.5 # Best keep it in [0.0, 1.0] range 17 | 18 | # def accuracy(out, labels): 19 | # outputs = np.argmax(out, axis=1) 20 | # return np.sum(outputs == labels) 21 | 22 | 23 | def accuracy(y_pred: Tensor, y_true: Tensor, **kwargs): 24 | y_pred = y_pred.cpu() 25 | outputs = np.argmax(y_pred, axis=1) 26 | return np.mean(outputs.numpy() == y_true.detach().cpu().numpy()) 27 | 28 | 29 | def accuracy_multilabel(y_pred: Tensor, y_true: Tensor, sigmoid: bool = True, **kwargs): 30 | if sigmoid: 31 | y_pred = y_pred.sigmoid() 32 | y_pred = y_pred.cpu() 33 | y_true = y_true.cpu() 34 | outputs = np.argmax(y_pred, axis=1) 35 | real_vals = np.argmax(y_true, axis=1) 36 | return np.mean(outputs.numpy() == real_vals.numpy()) 37 | 38 | 39 | def 
accuracy_thresh( 40 | y_pred: Tensor, 41 | y_true: Tensor, 42 | thresh: float = CLASSIFICATION_THRESHOLD, 43 | sigmoid: bool = True, 44 | **kwargs 45 | ): 46 | "Compute accuracy when `y_pred` and `y_true` are the same size." 47 | if sigmoid: 48 | y_pred = y_pred.sigmoid() 49 | return ((y_pred > thresh) == y_true.bool()).float().mean().item() 50 | 51 | 52 | # return np.mean(((y_pred>thresh)==y_true.byte()).float().cpu().numpy(), axis=1).sum() 53 | 54 | 55 | def fbeta( 56 | y_pred: Tensor, 57 | y_true: Tensor, 58 | thresh: float = 0.3, 59 | beta: float = 2, 60 | eps: float = 1e-9, 61 | sigmoid: bool = True, 62 | **kwargs 63 | ): 64 | "Computes the f_beta between `preds` and `targets`" 65 | beta2 = beta ** 2 66 | if sigmoid: 67 | y_pred = y_pred.sigmoid() 68 | y_pred = (y_pred > thresh).float() 69 | y_true = y_true.float() 70 | TP = (y_pred * y_true).sum(dim=1) 71 | prec = TP / (y_pred.sum(dim=1) + eps) 72 | rec = TP / (y_true.sum(dim=1) + eps) 73 | res = (prec * rec) / (prec * beta2 + rec + eps) * (1 + beta2) 74 | return res.mean().item() 75 | 76 | 77 | def roc_auc(y_pred: Tensor, y_true: Tensor, **kwargs): 78 | # ROC-AUC calcualation 79 | # Compute ROC curve and ROC area for each class 80 | fpr = dict() 81 | tpr = dict() 82 | roc_auc = dict() 83 | 84 | y_true = y_true.detach().cpu().numpy() 85 | y_pred = y_pred.detach().cpu().numpy() 86 | 87 | # Compute micro-average ROC curve and ROC area 88 | fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred.ravel()) 89 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 90 | 91 | return roc_auc["micro"] 92 | 93 | 94 | def Hamming_loss( 95 | y_pred: Tensor, 96 | y_true: Tensor, 97 | sigmoid: bool = True, 98 | thresh: float = CLASSIFICATION_THRESHOLD, 99 | sample_weight=None, 100 | **kwargs 101 | ): 102 | if sigmoid: 103 | y_pred = y_pred.sigmoid() 104 | y_pred = (y_pred > thresh).float() 105 | return hamming_loss(y_true, y_pred, sample_weight=sample_weight) 106 | 107 | 108 | def Exact_Match_Ratio( 109 | y_pred: Tensor, 110 | y_true: Tensor, 111 | sigmoid: bool = True, 112 | thresh: float = CLASSIFICATION_THRESHOLD, 113 | normalize: bool = True, 114 | sample_weight=None, 115 | **kwargs 116 | ): 117 | if sigmoid: 118 | y_pred = y_pred.sigmoid() 119 | y_pred = (y_pred > thresh).float() 120 | return accuracy_score( 121 | y_true, y_pred, normalize=normalize, sample_weight=sample_weight 122 | ) 123 | 124 | 125 | def F1( 126 | y_pred: Tensor, 127 | y_true: Tensor, 128 | threshold: float = CLASSIFICATION_THRESHOLD, 129 | **kwargs 130 | ): 131 | return fbeta(y_pred, y_true, thresh=threshold, beta=1) 132 | 133 | 134 | def confusion_matrix(y_pred: Tensor, y_true: Tensor, **kwargs): 135 | try: 136 | y_pred = np.argmax(y_pred.detach().cpu().numpy(), axis=1) 137 | return sklearn_confusion_matrix( 138 | y_true.detach().cpu().numpy(), y_pred, labels=kwargs.get("labels"), 139 | ) 140 | except Exception as e: 141 | logger.error(e) 142 | 143 | -------------------------------------------------------------------------------- /fast_bert/onnx_helper.py: -------------------------------------------------------------------------------- 1 | from onnxruntime import ( 2 | GraphOptimizationLevel, 3 | InferenceSession, 4 | SessionOptions, 5 | get_all_providers, 6 | ) 7 | 8 | import logging 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | logger = logging.getLogger() 13 | 14 | 15 | def create_model_for_provider(model_path: str, provider: str) -> InferenceSession: 16 | 17 | assert ( 18 | provider in get_all_providers() 19 | ), f"provider {provider} not 
found, {get_all_providers()}" 20 | 21 | # Few properties that might have an impact on performances (provided by MS) 22 | options = SessionOptions() 23 | options.intra_op_num_threads = 1 24 | options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL 25 | 26 | # Load the model as a graph and prepare the CPU backend 27 | session = InferenceSession(model_path, options, providers=[provider]) 28 | session.disable_fallback() 29 | 30 | return session 31 | 32 | 33 | def load_model(model_path: Path): 34 | try: 35 | quantized_model = create_model_for_provider( 36 | model_path.as_posix(), "CPUExecutionProvider" 37 | ) 38 | return quantized_model 39 | except Exception as e: 40 | logger.error(e) 41 | raise e 42 | -------------------------------------------------------------------------------- /fast_bert/prediction.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | 4 | from .onnx_helper import load_model 5 | 6 | 7 | from transformers import AutoTokenizer 8 | import numpy as np 9 | 10 | import warnings 11 | 12 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 13 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 14 | 15 | 16 | class BertClassificationPredictor(object): 17 | def __init__( 18 | self, 19 | model_path, 20 | label_path, 21 | multi_label=False, 22 | model_type="bert", 23 | use_fast_tokenizer=True, 24 | do_lower_case=True, 25 | device=None, 26 | ): 27 | if device is None: 28 | device = ( 29 | torch.device("cuda") 30 | if torch.cuda.is_available() 31 | else torch.device("cpu") 32 | ) 33 | 34 | self.model_path = model_path 35 | self.label_path = label_path 36 | self.multi_label = multi_label 37 | self.model_type = model_type 38 | self.do_lower_case = do_lower_case 39 | self.device = device 40 | 41 | # Use auto-tokenizer 42 | self.tokenizer = AutoTokenizer.from_pretrained( 43 | self.model_path, use_fast=use_fast_tokenizer 44 | ) 45 | 46 | self.learner = self.get_learner() 47 | 48 | def get_learner(self): 49 | from .learner_cls import BertLearner 50 | from .data_cls import BertDataBunch 51 | 52 | databunch = BertDataBunch( 53 | self.label_path, 54 | self.label_path, 55 | self.tokenizer, 56 | train_file=None, 57 | val_file=None, 58 | batch_size_per_gpu=32, 59 | max_seq_length=512, 60 | multi_gpu=False, 61 | multi_label=self.multi_label, 62 | model_type=self.model_type, 63 | no_cache=True, 64 | ) 65 | 66 | learner = BertLearner.from_pretrained_model( 67 | databunch, 68 | self.model_path, 69 | metrics=[], 70 | device=self.device, 71 | logger=None, 72 | output_dir=None, 73 | warmup_steps=0, 74 | multi_gpu=False, 75 | is_fp16=False, 76 | multi_label=self.multi_label, 77 | logging_steps=0, 78 | ) 79 | 80 | return learner 81 | 82 | def predict_batch(self, texts, verbose=False): 83 | return self.learner.predict_batch(texts, verbose=verbose) 84 | 85 | def predict(self, text, verbose=False): 86 | predictions = self.predict_batch([text], verbose=verbose)[0] 87 | return predictions 88 | 89 | 90 | class BertOnnxClassificationPredictor(object): 91 | def __init__( 92 | self, 93 | model_path, 94 | label_path, 95 | model_name="model.onnx", 96 | multi_label=False, 97 | model_type="bert", 98 | use_fast_tokenizer=True, 99 | do_lower_case=True, 100 | device=None, 101 | ): 102 | if device is None: 103 | device = ( 104 | torch.device("cuda") 105 | if torch.cuda.is_available() 106 | else torch.device("cpu") 107 | ) 108 | 109 | self.model_path = model_path 110 | self.label_path = 
label_path 111 | self.multi_label = multi_label 112 | self.model_type = model_type 113 | self.do_lower_case = do_lower_case 114 | self.device = device 115 | self.labels = [] 116 | 117 | # Use auto-tokenizer 118 | self.tokenizer = AutoTokenizer.from_pretrained( 119 | self.model_path, use_fast=use_fast_tokenizer 120 | ) 121 | 122 | with open(label_path / "labels.csv", "r") as f: 123 | self.labels = f.read().split("\n") 124 | 125 | self.model = load_model(Path(self.model_path) / model_name) 126 | 127 | def predict(self, text, verbose=False): 128 | # Inputs are provided through numpy array 129 | model_inputs = self.tokenizer(text, return_tensors="pt") 130 | inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()} 131 | outputs = self.model.run(None, inputs_onnx) 132 | softmax_preds = softmax(outputs[0]) 133 | preds = list(zip(self.labels, softmax_preds[0])) 134 | return sorted(preds, key=lambda x: x[1], reverse=True) 135 | 136 | 137 | 138 | def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: 139 | x_max = np.max(x, axis=axis, keepdims=True) 140 | tmp = np.exp(x - x_max) 141 | s = np.sum(tmp, axis=axis, keepdims=True) 142 | return tmp / s -------------------------------------------------------------------------------- /fast_bert/prediction_ner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | 4 | from .onnx_helper import load_model 5 | 6 | from .learner_ner import group_entities 7 | from .data_ner import get_labels 8 | 9 | from transformers import AutoTokenizer 10 | import numpy as np 11 | 12 | import warnings 13 | 14 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 15 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 16 | 17 | 18 | class BertNERPredictor(object): 19 | def __init__( 20 | self, 21 | model_path, 22 | label_path, 23 | model_type="bert", 24 | use_fast_tokenizer=True, 25 | do_lower_case=True, 26 | device=None, 27 | ): 28 | if device is None: 29 | device = ( 30 | torch.device("cuda") 31 | if torch.cuda.is_available() 32 | else torch.device("cpu") 33 | ) 34 | 35 | self.model_path = model_path 36 | self.label_path = label_path 37 | self.model_type = model_type 38 | self.do_lower_case = do_lower_case 39 | self.device = device 40 | 41 | # Use auto-tokenizer 42 | self.tokenizer = AutoTokenizer.from_pretrained( 43 | self.model_path, use_fast=use_fast_tokenizer 44 | ) 45 | 46 | self.learner = self.get_learner() 47 | 48 | def get_learner(self): 49 | from .data_ner import BertNERDataBunch 50 | from .learner_ner import BertNERLearner 51 | 52 | databunch = BertNERDataBunch( 53 | self.label_path, 54 | self.tokenizer, 55 | train_file=None, 56 | val_file=None, 57 | batch_size_per_gpu=32, 58 | max_seq_length=512, 59 | multi_gpu=False, 60 | model_type=self.model_type, 61 | no_cache=True, 62 | ) 63 | 64 | learner = BertNERLearner.from_pretrained_model( 65 | databunch, 66 | self.model_path, 67 | device=self.device, 68 | logger=None, 69 | output_dir=None, 70 | warmup_steps=0, 71 | multi_gpu=False, 72 | is_fp16=False, 73 | logging_steps=0, 74 | ) 75 | 76 | return learner 77 | 78 | def predict_batch(self, texts, group=True, exclude_entities=["O"]): 79 | predictions = [] 80 | 81 | for text in texts: 82 | pred = self.predict(text, group=group, exclude_entities=exclude_entities) 83 | if pred: 84 | predictions.append({"text": text, "results": pred}) 85 | 86 | def predict(self, text, group=True, exclude_entities=["O"]): 87 | predictions = 
self.learner.predict( 88 | text, group=group, exclude_entities=exclude_entities 89 | ) 90 | return predictions 91 | 92 | 93 | class BertOnnxNERPredictor(object): 94 | def __init__( 95 | self, 96 | model_path, 97 | label_path, 98 | model_name="model.onnx", 99 | model_type="bert", 100 | use_fast_tokenizer=True, 101 | do_lower_case=True, 102 | device=None, 103 | ): 104 | if device is None: 105 | device = ( 106 | torch.device("cuda") 107 | if torch.cuda.is_available() 108 | else torch.device("cpu") 109 | ) 110 | 111 | self.model_path = model_path 112 | self.label_path = label_path 113 | self.model_type = model_type 114 | self.do_lower_case = do_lower_case 115 | self.device = device 116 | self.labels = [] 117 | 118 | # Use auto-tokenizer 119 | self.tokenizer = AutoTokenizer.from_pretrained( 120 | self.model_path, use_fast=use_fast_tokenizer 121 | ) 122 | 123 | self.labels = get_labels(str(label_path / "labels.txt")) 124 | 125 | self.model = load_model(Path(self.model_path) / model_name) 126 | 127 | def predict(self, text, group=True, exclude_entities=["O"]): 128 | # Inputs are provided through numpy array 129 | tokens = self.tokenizer.tokenize( 130 | self.tokenizer.decode(self.tokenizer.encode(text)) 131 | ) 132 | 133 | model_inputs = self.tokenizer(text, return_tensors="pt") 134 | inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()} 135 | outputs = self.model.run(None, inputs_onnx)[0] 136 | outputs = softmax(outputs) 137 | 138 | predictions = np.argmax(outputs, axis=2) 139 | 140 | preds = [ 141 | (token, self.labels[prediction], output[prediction]) 142 | for token, output, prediction in zip(tokens, outputs[0], predictions[0]) 143 | ][1:-1] 144 | 145 | preds = [ 146 | { 147 | "index": index, 148 | "word": prediction[0], 149 | "entity": prediction[1], 150 | "score": prediction[2], 151 | } 152 | for index, prediction in enumerate(preds) 153 | ] 154 | 155 | if group is True: 156 | preds = group_entities(preds) 157 | 158 | out_preds = [] 159 | for pred in preds: 160 | if pred["entity"] not in exclude_entities: 161 | try: 162 | pred["entity"] = pred["entity"].split("-")[1] 163 | except Exception: 164 | pass 165 | 166 | out_preds.append(pred) 167 | 168 | return out_preds 169 | 170 | def predict_batch(self, texts, group=True, exclude_entities=["O"]): 171 | predictions = [] 172 | 173 | for text in texts: 174 | pred = self.predict(text, group=group, exclude_entities=exclude_entities) 175 | if pred: 176 | predictions.append({"text": text, "results": pred}) 177 | 178 | 179 | def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: 180 | x_max = np.max(x, axis=axis, keepdims=True) 181 | tmp = np.exp(x - x_max) 182 | s = np.sum(tmp, axis=axis, keepdims=True) 183 | return tmp / s 184 | -------------------------------------------------------------------------------- /fast_bert/summarisation/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_bertabs import * 2 | from .modeling_bertabs import * 3 | -------------------------------------------------------------------------------- /fast_bert/summarisation/configuration_bertabs.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ BertAbs configuration """ 17 | import json 18 | import logging 19 | import sys 20 | 21 | from transformers import PretrainedConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | BERTABS_FINETUNED_CONFIG_MAP = { 28 | "bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-config.json", 29 | } 30 | 31 | 32 | class BertAbsConfig(PretrainedConfig): 33 | r""" Class to store the configuration of the BertAbs model. 34 | 35 | Arguments: 36 | vocab_size: int 37 | Number of tokens in the vocabulary. 38 | max_pos: int 39 | The maximum sequence length that this model will be used with. 40 | enc_layers: int 41 | The number of hidden layers in the Transformer encoder. 42 | enc_hidden_size: int 43 | The size of the encoder's layers. 44 | enc_heads: int 45 | The number of attention heads for each attention layer in the encoder. 46 | enc_ff_size: int 47 | The size of the encoder's feed-forward layers. 48 | enc_dropout: float 49 | The dropout probability for all fully connected layers in the 50 | embeddings, layers, pooler and also the attention probabilities in 51 | the encoder. 52 | dec_layers: int 53 | The number of hidden layers in the decoder. 54 | dec_hidden_size: int 55 | The size of the decoder's layers. 56 | dec_heads: int 57 | The number of attention heads for each attention layer in the decoder. 58 | dec_ff_size: int 59 | The size of the decoder's feed-forward layers. 60 | dec_dropout: float 61 | The dropout probability for all fully connected layers in the 62 | embeddings, layers, pooler and also the attention probabilities in 63 | the decoder.
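Example (an illustrative sketch only; the values shown are simply the defaults from the constructor below):

    config = BertAbsConfig(vocab_size=30522, max_pos=512)
    assert config.enc_hidden_size == 512 and config.dec_hidden_size == 768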
64 | """ 65 | 66 | pretrained_config_archive_map = BERTABS_FINETUNED_CONFIG_MAP 67 | 68 | def __init__( 69 | self, 70 | vocab_size=30522, 71 | max_pos=512, 72 | enc_layers=6, 73 | enc_hidden_size=512, 74 | enc_heads=8, 75 | enc_ff_size=512, 76 | enc_dropout=0.2, 77 | dec_layers=6, 78 | dec_hidden_size=768, 79 | dec_heads=8, 80 | dec_ff_size=2048, 81 | dec_dropout=0.2, 82 | **kwargs, 83 | ): 84 | super(BertAbsConfig, self).__init__(**kwargs) 85 | 86 | self.vocab_size = vocab_size 87 | self.max_pos = max_pos 88 | 89 | self.enc_layers = enc_layers 90 | self.enc_hidden_size = enc_hidden_size 91 | self.enc_heads = enc_heads 92 | self.enc_ff_size = enc_ff_size 93 | self.enc_dropout = enc_dropout 94 | 95 | self.dec_layers = dec_layers 96 | self.dec_hidden_size = dec_hidden_size 97 | self.dec_heads = dec_heads 98 | self.dec_ff_size = dec_ff_size 99 | self.dec_dropout = dec_dropout 100 | -------------------------------------------------------------------------------- /fast_bert/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .spellcheck import BingSpellCheck 2 | -------------------------------------------------------------------------------- /fast_bert/utils/spellcheck.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | 4 | 5 | class BingSpellCheck(object): 6 | def __init__(self, key): 7 | self.api_key = key 8 | self.endpoint = "https://api.cognitive.microsoft.com/bing/v7.0/SpellCheck" 9 | 10 | def spell_check(self, text, mode='spell'): 11 | data = {'text': text} 12 | 13 | params = { 14 | 'mkt': 'en-us', 15 | 'mode': mode 16 | } 17 | 18 | headers = { 19 | 'Content-Type': 'application/x-www-form-urlencoded', 20 | 'Ocp-Apim-Subscription-Key': self.api_key, 21 | } 22 | response = requests.post( 23 | self.endpoint, headers=headers, params=params, data=data) 24 | 25 | corrected_spells = response.json() 26 | 27 | flaggedTokens = corrected_spells['flaggedTokens'] 28 | 29 | for flagged in flaggedTokens: 30 | text = text.replace( 31 | flagged['token'], flagged['suggestions'][0]['suggestion']) 32 | 33 | return text 34 | -------------------------------------------------------------------------------- /fast_bert/utils_squad_evaluate.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for SQuAD version 2.0. 2 | Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 3 | In addition to basic functionality, we also compute additional statistics and 4 | plot precision-recall curves if an additional na_prob.json file is provided. 5 | This file is expected to map question ID's to the model's predicted probability 6 | that a question is unanswerable. 
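Typical command-line usage (the file names below are placeholders, not files shipped with this repository):

    python utils_squad_evaluate.py data.json pred.json --na-prob-file na_prob.json --out-file eval.json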
7 | """ 8 | import argparse 9 | import collections 10 | import json 11 | import numpy as np 12 | import os 13 | import re 14 | import string 15 | import sys 16 | 17 | class EVAL_OPTS(): 18 | def __init__(self, data_file, pred_file, out_file="", 19 | na_prob_file="na_prob.json", na_prob_thresh=1.0, 20 | out_image_dir=None, verbose=False): 21 | self.data_file = data_file 22 | self.pred_file = pred_file 23 | self.out_file = out_file 24 | self.na_prob_file = na_prob_file 25 | self.na_prob_thresh = na_prob_thresh 26 | self.out_image_dir = out_image_dir 27 | self.verbose = verbose 28 | 29 | OPTS = None 30 | 31 | def parse_args(): 32 | parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.') 33 | parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.') 34 | parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.') 35 | parser.add_argument('--out-file', '-o', metavar='eval.json', 36 | help='Write accuracy metrics to file (default is stdout).') 37 | parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json', 38 | help='Model estimates of probability of no answer.') 39 | parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0, 40 | help='Predict "" if no-answer probability exceeds this (default = 1.0).') 41 | parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None, 42 | help='Save precision-recall curves to directory.') 43 | parser.add_argument('--verbose', '-v', action='store_true') 44 | if len(sys.argv) == 1: 45 | parser.print_help() 46 | sys.exit(1) 47 | return parser.parse_args() 48 | 49 | def make_qid_to_has_ans(dataset): 50 | qid_to_has_ans = {} 51 | for article in dataset: 52 | for p in article['paragraphs']: 53 | for qa in p['qas']: 54 | qid_to_has_ans[qa['id']] = bool(qa['answers']) 55 | return qid_to_has_ans 56 | 57 | def normalize_answer(s): 58 | """Lower text and remove punctuation, articles and extra whitespace.""" 59 | def remove_articles(text): 60 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) 61 | return re.sub(regex, ' ', text) 62 | def white_space_fix(text): 63 | return ' '.join(text.split()) 64 | def remove_punc(text): 65 | exclude = set(string.punctuation) 66 | return ''.join(ch for ch in text if ch not in exclude) 67 | def lower(text): 68 | return text.lower() 69 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 70 | 71 | def get_tokens(s): 72 | if not s: return [] 73 | return normalize_answer(s).split() 74 | 75 | def compute_exact(a_gold, a_pred): 76 | return int(normalize_answer(a_gold) == normalize_answer(a_pred)) 77 | 78 | def compute_f1(a_gold, a_pred): 79 | gold_toks = get_tokens(a_gold) 80 | pred_toks = get_tokens(a_pred) 81 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks) 82 | num_same = sum(common.values()) 83 | if len(gold_toks) == 0 or len(pred_toks) == 0: 84 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise 85 | return int(gold_toks == pred_toks) 86 | if num_same == 0: 87 | return 0 88 | precision = 1.0 * num_same / len(pred_toks) 89 | recall = 1.0 * num_same / len(gold_toks) 90 | f1 = (2 * precision * recall) / (precision + recall) 91 | return f1 92 | 93 | def get_raw_scores(dataset, preds): 94 | exact_scores = {} 95 | f1_scores = {} 96 | for article in dataset: 97 | for p in article['paragraphs']: 98 | for qa in p['qas']: 99 | qid = qa['id'] 100 | gold_answers = [a['text'] for a in qa['answers'] 101 | if normalize_answer(a['text'])] 102 | if not gold_answers: 103 | # For 
unanswerable questions, only correct answer is empty string 104 | gold_answers = [''] 105 | if qid not in preds: 106 | print('Missing prediction for %s' % qid) 107 | continue 108 | a_pred = preds[qid] 109 | # Take max over all gold answers 110 | exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) 111 | f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers) 112 | return exact_scores, f1_scores 113 | 114 | def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): 115 | new_scores = {} 116 | for qid, s in scores.items(): 117 | pred_na = na_probs[qid] > na_prob_thresh 118 | if pred_na: 119 | new_scores[qid] = float(not qid_to_has_ans[qid]) 120 | else: 121 | new_scores[qid] = s 122 | return new_scores 123 | 124 | def make_eval_dict(exact_scores, f1_scores, qid_list=None): 125 | if not qid_list: 126 | total = len(exact_scores) 127 | return collections.OrderedDict([ 128 | ('exact', 100.0 * sum(exact_scores.values()) / total), 129 | ('f1', 100.0 * sum(f1_scores.values()) / total), 130 | ('total', total), 131 | ]) 132 | else: 133 | total = len(qid_list) 134 | return collections.OrderedDict([ 135 | ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), 136 | ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), 137 | ('total', total), 138 | ]) 139 | 140 | def merge_eval(main_eval, new_eval, prefix): 141 | for k in new_eval: 142 | main_eval['%s_%s' % (prefix, k)] = new_eval[k] 143 | 144 | def plot_pr_curve(precisions, recalls, out_image, title): 145 | plt.step(recalls, precisions, color='b', alpha=0.2, where='post') 146 | plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b') 147 | plt.xlabel('Recall') 148 | plt.ylabel('Precision') 149 | plt.xlim([0.0, 1.05]) 150 | plt.ylim([0.0, 1.05]) 151 | plt.title(title) 152 | plt.savefig(out_image) 153 | plt.clf() 154 | 155 | def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans, 156 | out_image=None, title=None): 157 | qid_list = sorted(na_probs, key=lambda k: na_probs[k]) 158 | true_pos = 0.0 159 | cur_p = 1.0 160 | cur_r = 0.0 161 | precisions = [1.0] 162 | recalls = [0.0] 163 | avg_prec = 0.0 164 | for i, qid in enumerate(qid_list): 165 | if qid_to_has_ans[qid]: 166 | true_pos += scores[qid] 167 | cur_p = true_pos / float(i+1) 168 | cur_r = true_pos / float(num_true_pos) 169 | if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: 170 | # i.e., if we can put a threshold after this point 171 | avg_prec += cur_p * (cur_r - recalls[-1]) 172 | precisions.append(cur_p) 173 | recalls.append(cur_r) 174 | if out_image: 175 | plot_pr_curve(precisions, recalls, out_image, title) 176 | return {'ap': 100.0 * avg_prec} 177 | 178 | def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, 179 | qid_to_has_ans, out_image_dir): 180 | if out_image_dir and not os.path.exists(out_image_dir): 181 | os.makedirs(out_image_dir) 182 | num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) 183 | if num_true_pos == 0: 184 | return 185 | pr_exact = make_precision_recall_eval( 186 | exact_raw, na_probs, num_true_pos, qid_to_has_ans, 187 | out_image=os.path.join(out_image_dir, 'pr_exact.png'), 188 | title='Precision-Recall curve for Exact Match score') 189 | pr_f1 = make_precision_recall_eval( 190 | f1_raw, na_probs, num_true_pos, qid_to_has_ans, 191 | out_image=os.path.join(out_image_dir, 'pr_f1.png'), 192 | title='Precision-Recall curve for F1 score') 193 | oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} 194 | pr_oracle 
= make_precision_recall_eval( 195 | oracle_scores, na_probs, num_true_pos, qid_to_has_ans, 196 | out_image=os.path.join(out_image_dir, 'pr_oracle.png'), 197 | title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)') 198 | merge_eval(main_eval, pr_exact, 'pr_exact') 199 | merge_eval(main_eval, pr_f1, 'pr_f1') 200 | merge_eval(main_eval, pr_oracle, 'pr_oracle') 201 | 202 | def histogram_na_prob(na_probs, qid_list, image_dir, name): 203 | if not qid_list: 204 | return 205 | x = [na_probs[k] for k in qid_list] 206 | weights = np.ones_like(x) / float(len(x)) 207 | plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0)) 208 | plt.xlabel('Model probability of no-answer') 209 | plt.ylabel('Proportion of dataset') 210 | plt.title('Histogram of no-answer probability: %s' % name) 211 | plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name)) 212 | plt.clf() 213 | 214 | def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): 215 | num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) 216 | cur_score = num_no_ans 217 | best_score = cur_score 218 | best_thresh = 0.0 219 | qid_list = sorted(na_probs, key=lambda k: na_probs[k]) 220 | for i, qid in enumerate(qid_list): 221 | if qid not in scores: continue 222 | if qid_to_has_ans[qid]: 223 | diff = scores[qid] 224 | else: 225 | if preds[qid]: 226 | diff = -1 227 | else: 228 | diff = 0 229 | cur_score += diff 230 | if cur_score > best_score: 231 | best_score = cur_score 232 | best_thresh = na_probs[qid] 233 | return 100.0 * best_score / len(scores), best_thresh 234 | 235 | def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): 236 | num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) 237 | cur_score = num_no_ans 238 | best_score = cur_score 239 | best_thresh = 0.0 240 | qid_list = sorted(na_probs, key=lambda k: na_probs[k]) 241 | for i, qid in enumerate(qid_list): 242 | if qid not in scores: continue 243 | if qid_to_has_ans[qid]: 244 | diff = scores[qid] 245 | else: 246 | if preds[qid]: 247 | diff = -1 248 | else: 249 | diff = 0 250 | cur_score += diff 251 | if cur_score > best_score: 252 | best_score = cur_score 253 | best_thresh = na_probs[qid] 254 | 255 | has_ans_score, has_ans_cnt = 0, 0 256 | for qid in qid_list: 257 | if not qid_to_has_ans[qid]: continue 258 | has_ans_cnt += 1 259 | 260 | if qid not in scores: continue 261 | has_ans_score += scores[qid] 262 | 263 | return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt 264 | 265 | def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): 266 | best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) 267 | best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) 268 | main_eval['best_exact'] = best_exact 269 | main_eval['best_exact_thresh'] = exact_thresh 270 | main_eval['best_f1'] = best_f1 271 | main_eval['best_f1_thresh'] = f1_thresh 272 | 273 | def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): 274 | best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) 275 | best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) 276 | main_eval['best_exact'] = best_exact 277 | main_eval['best_exact_thresh'] = exact_thresh 278 | main_eval['best_f1'] = best_f1 279 | main_eval['best_f1_thresh'] = f1_thresh 280 | main_eval['has_ans_exact'] = has_ans_exact 281 | main_eval['has_ans_f1'] = has_ans_f1 
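# Illustrative sketch (not part of the original evaluation script): the evaluator can also be
# driven programmatically rather than via the CLI. The file names below are placeholders and
# must point at a SQuAD v2.0 data file, a predictions file, and a no-answer-probability file.
#
#     from fast_bert.utils_squad_evaluate import EVAL_OPTS, main
#
#     opts = EVAL_OPTS(data_file="dev-v2.0.json", pred_file="predictions.json",
#                      na_prob_file="na_prob.json", na_prob_thresh=1.0)
#     results = main(opts)  # OrderedDict with "exact", "f1", "total" and, when na_prob_file is given, best-threshold keys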
282 | 283 | def main(OPTS): 284 | with open(OPTS.data_file) as f: 285 | dataset_json = json.load(f) 286 | dataset = dataset_json['data'] 287 | with open(OPTS.pred_file) as f: 288 | preds = json.load(f) 289 | if OPTS.na_prob_file: 290 | with open(OPTS.na_prob_file) as f: 291 | na_probs = json.load(f) 292 | else: 293 | na_probs = {k: 0.0 for k in preds} 294 | qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False 295 | has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] 296 | no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] 297 | exact_raw, f1_raw = get_raw_scores(dataset, preds) 298 | exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, 299 | OPTS.na_prob_thresh) 300 | f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, 301 | OPTS.na_prob_thresh) 302 | out_eval = make_eval_dict(exact_thresh, f1_thresh) 303 | if has_ans_qids: 304 | has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids) 305 | merge_eval(out_eval, has_ans_eval, 'HasAns') 306 | if no_ans_qids: 307 | no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) 308 | merge_eval(out_eval, no_ans_eval, 'NoAns') 309 | if OPTS.na_prob_file: 310 | find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans) 311 | if OPTS.na_prob_file and OPTS.out_image_dir: 312 | run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, 313 | qid_to_has_ans, OPTS.out_image_dir) 314 | histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns') 315 | histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns') 316 | if OPTS.out_file: 317 | with open(OPTS.out_file, 'w') as f: 318 | json.dump(out_eval, f) 319 | else: 320 | print(json.dumps(out_eval, indent=2)) 321 | return out_eval 322 | 323 | if __name__ == '__main__': 324 | OPTS = parse_args() 325 | if OPTS.out_image_dir: 326 | import matplotlib 327 | matplotlib.use('Agg') 328 | import matplotlib.pyplot as plt 329 | main(OPTS) -------------------------------------------------------------------------------- /images/lr_finder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utterworks/fast-bert/cff2f913c0c01a85d8c998afb3de6c33fa8bf07a/images/lr_finder.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | pytorch-lamb 3 | tensorboardX 4 | fastprogress 5 | scikit-learn 6 | seqeval 7 | transformers==4.22.* 8 | pandas 9 | python-box 10 | more-itertools 11 | onnx 12 | onnxruntime 13 | onnxruntime-tools 14 | -------------------------------------------------------------------------------- /sample_data/imdb_movie_reviews/label/labels.csv: -------------------------------------------------------------------------------- 1 | 0 2 | 1 -------------------------------------------------------------------------------- /sample_data/multi_label_toxic_comments/label/labels.csv: -------------------------------------------------------------------------------- 1 | toxic 2 | severe_toxic 3 | obscene 4 | threat 5 | insult 6 | identity_hate -------------------------------------------------------------------------------- /sample_notebooks/toxic_comments_sagemaker.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 
| "outputs": [], 8 | "source": [ 9 | "import sagemaker\n", 10 | "from pathlib import Path\n", 11 | "from sagemaker.predictor import json_serializer\n", 12 | "import json" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "role = sagemaker.get_execution_role()\n", 22 | "session = sagemaker.Session()" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Setup Path " 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# location for train.csv, val.csv and labels.csv\n", 39 | "DATA_PATH = Path(\"../data/\") \n", 40 | "\n", 41 | "# Location for storing training_config.json\n", 42 | "CONFIG_PATH = DATA_PATH/'config'\n", 43 | "CONFIG_PATH.mkdir(exist_ok=True)\n", 44 | "\n", 45 | "# S3 bucket name\n", 46 | "bucket = 'sagemaker-deep-learning'\n", 47 | "\n", 48 | "# Prefix for S3 bucket for input and output\n", 49 | "prefix = 'toxic_comments/input'\n", 50 | "prefix_output = 'toxic_comments/output'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Hyperparameters & Training Config" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "hyperparameters = {\n", 67 | " \"epochs\": 10,\n", 68 | " \"lr\": 8e-5,\n", 69 | " \"max_seq_length\": 512,\n", 70 | " \"train_batch_size\": 16,\n", 71 | " \"lr_schedule\": \"warmup_cosine\",\n", 72 | " \"warmup_steps\": 1000,\n", 73 | " \"optimizer_type\": \"adamw\"\n", 74 | "}" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "training_config = {\n", 84 | " \"run_text\": \"toxic comments\",\n", 85 | " \"finetuned_model\": None,\n", 86 | " \"do_lower_case\": \"True\",\n", 87 | " \"train_file\": \"train.csv\",\n", 88 | " \"val_file\": \"val.csv\",\n", 89 | " \"label_file\": \"labels.csv\",\n", 90 | " \"text_col\": \"comment_text\",\n", 91 | " \"label_col\": '[\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]',\n", 92 | " \"multi_label\": \"True\",\n", 93 | " \"grad_accumulation_steps\": \"1\",\n", 94 | " \"fp16_opt_level\": \"O1\",\n", 95 | " \"fp16\": \"True\",\n", 96 | " \"model_type\": \"roberta\",\n", 97 | " \"model_name\": \"roberta-base\",\n", 98 | " \"logging_steps\": \"300\"\n", 99 | "}\n", 100 | "\n", 101 | "with open(CONFIG_PATH/'training_config.json', 'w') as f:\n", 102 | " json.dump(training_config, f)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Upload Data" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# This is a helper feature to upload data\n", 119 | "# from your local machine to S3 bucket.\n", 120 | "\n", 121 | "s3_input = session.upload_data(DATA_PATH, bucket=bucket , key_prefix=prefix)\n", 122 | "\n", 123 | "session.upload_data(str(DATA_PATH/'labels.csv'), bucket=bucket , key_prefix=prefix)\n", 124 | "session.upload_data(str(DATA_PATH/'train.csv'), bucket=bucket , key_prefix=prefix)\n", 125 | "session.upload_data(str(DATA_PATH/'val.csv'), bucket=bucket , key_prefix=prefix)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Create an Estimator and 
start training" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "account = session.boto_session.client('sts').get_caller_identity()['Account']\n", 142 | "region = session.boto_session.region_name\n", 143 | "\n", 144 | "image = \"{}.dkr.ecr.{}.amazonaws.com/sagemaker-bert:1.0-gpu-py36\".format(account, region)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "output_path = \"s3://{}/{}\".format(bucket, prefix_output)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "estimator = sagemaker.estimator.Estimator(image, \n", 163 | " role,\n", 164 | " train_instance_count=1, \n", 165 | " train_instance_type='ml.p3.8xlarge', \n", 166 | " output_path=output_path, \n", 167 | " base_job_name='toxic-comments',\n", 168 | " hyperparameters=hyperparameters,\n", 169 | " sagemaker_session=session\n", 170 | " )" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "estimator.fit(s3_input)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## Deploy the model to hosting service" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "predictor = estimator.deploy(1, \n", 196 | " 'ml.m5.large', \n", 197 | " endpoint_name='bert-toxic-comments', \n", 198 | " update_endpoint=True, \n", 199 | " serializer=json_serializer)" 200 | ] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Python 3", 206 | "language": "python", 207 | "name": "python3" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.6.5" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 2 224 | } 225 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from io import open 3 | from setuptools import setup, find_packages 4 | import subprocess 5 | 6 | 7 | with open("requirements.txt") as f: 8 | install_requires = f.read().strip().split("\n") 9 | 10 | # get version from VERSION.txt 11 | with open("VERSION.txt") as f: 12 | version = f.read().strip() 13 | 14 | setup( 15 | name="fast_bert", 16 | # get version from VERSION file 17 | version=version, 18 | description="AI Library using BERT", 19 | author="Kaushal Trivedi", 20 | author_email="kaushaltrivedi@me.com", 21 | license="Apache2", 22 | url="https://github.com/kaushaltrivedi/fast-bert", 23 | long_description=open("README.md", "r", encoding="utf-8").read(), 24 | long_description_content_type="text/markdown", 25 | keywords="BERT NLP deep learning google", 26 | packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 27 | install_requires=install_requires, 28 | classifiers=[ 29 | "Intended Audience :: Science/Research", 30 | "License :: OSI Approved :: Apache Software License", 31 | "Programming Language :: Python :: 3", 32 | 
"Topic :: Scientific/Engineering :: Artificial Intelligence", 33 | ], 34 | zip_safe=False, 35 | ) 36 | -------------------------------------------------------------------------------- /tag_release.sh: -------------------------------------------------------------------------------- 1 | 2 | # get tag name from VERSION file 3 | TAG_NAME=v$(cat VERSION.txt) 4 | push_message="${1:-update}" 5 | git add . && git commit -m "$push_message" && git tag $TAG_NAME -m "tag $TAG_NAME" && git push origin $TAG_NAME 6 | git push origin main 7 | -------------------------------------------------------------------------------- /test/summarisation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from fast_bert.data_abs import BertAbsDataBunch\n", 10 | "from fast_bert.learner_abs import BertAbsLearner\n", 11 | "from box import Box\n", 12 | "import logging\n", 13 | "import torch\n", 14 | "from pathlib import Path\n", 15 | "from transformers import BertTokenizer" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from tokenizers import (ByteLevelBPETokenizer,\n", 25 | " BPETokenizer,\n", 26 | " SentencePieceBPETokenizer,\n", 27 | " BertWordPieceTokenizer)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "PATH = Path(\"../../summarisation/\")\n", 37 | "DATA_PATH = PATH/'data'\n", 38 | "MODEL_PATH = PATH/'model'" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "logger = logging.getLogger()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "args = Box({\n", 57 | " \"max_seq_length\": 512,\n", 58 | " \"batch_size\": 8,\n", 59 | " \"learning_rate\": 5e-3,\n", 60 | " \"num_train_epochs\": 6,\n", 61 | " \"fp16\": True,\n", 62 | " \"model_name\": 'bertabs-finetuned-cnndm',\n", 63 | " \"model_type\": 'bert'\n", 64 | "})" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "device = torch.device('cuda') if torch.cuda.device_count() else torch.device('cpu')\n", 81 | "if torch.cuda.device_count() > 1:\n", 82 | " args.multi_gpu = True\n", 83 | "else:\n", 84 | " args.multi_gpu = False" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "tokenizer = BertWordPieceTokenizer(str(MODEL_PATH/'vocab.txt'), lowercase=True)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "databunch = BertAbsDataBunch(data_dir=DATA_PATH, tokenizer=tokenizer, device=device)\n", 103 | "databunch_old = BertAbsDataBunch(data_dir=DATA_PATH, tokenizer=args.model_name, device=device)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "databunch_with_data = BertAbsDataBunch(data_dir=DATA_PATH, tokenizer=args.model_name, 
device=device)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "databunch_with_data_new_tokeniser = BertAbsDataBunch(data_dir=DATA_PATH, tokenizer=tokenizer, device=device)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "learner = BertAbsLearner.from_pretrained_model(databunch, MODEL_PATH, device, logger=logger)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "texts = databunch_with_data.test_dl.dataset[0][1]\n", 140 | "texts" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "%%timeit\n", 150 | "learner.predict_batch(texts)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "learner_old = BertAbsLearner.from_pretrained_model(databunch_old, MODEL_PATH, device, logger=logger)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "%%timeit\n", 169 | "learner_old.predict_batch(texts)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 3", 190 | "language": "python", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 3 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython3", 203 | "version": "3.7.4" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 4 208 | } 209 | --------------------------------------------------------------------------------