├── .flake8 ├── .github └── workflows │ └── publish-to-pypi.yml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── VERSION.txt ├── container ├── Dockerfile ├── batch.Dockerfile ├── bert │ ├── download_pretrained_models.py │ ├── nginx.conf │ ├── predictor.py │ ├── serve │ ├── train │ └── wsgi.py ├── bert_batch │ ├── nginx.conf │ ├── predictor.py │ ├── train │ └── wsgi.py ├── build_and_push.sh ├── build_and_push_batch.sh └── pytorch_build_and_push.sh ├── container_lm ├── Dockerfile ├── bert │ ├── download_pretrained_models.py │ ├── nginx.conf │ ├── train │ └── wsgi.py └── build_and_push.sh ├── container_ner ├── Dockerfile ├── bert │ ├── download_pretrained_models.py │ ├── nginx.conf │ ├── predictor.py │ ├── serve │ ├── train │ └── wsgi.py └── build_and_push.sh ├── container_t5 ├── Dockerfile ├── build_and_push.sh ├── requirements.txt └── t5 │ ├── download_pretrained_models.py │ ├── nginx.conf │ ├── predictor.py │ ├── serve │ ├── train │ └── wsgi.py ├── deploy_pip.sh ├── fast_bert ├── __init__.py ├── bert_layers.py ├── data.py ├── data_abs.py ├── data_cls.py ├── data_lm.py ├── data_ner.py ├── data_qa.py ├── learner_abs.py ├── learner_cls copy.py ├── learner_cls.py ├── learner_lm.py ├── learner_ner.py ├── learner_qa.py ├── learner_util.py ├── metrics.py ├── modeling.py ├── onnx_helper.py ├── optimization.py ├── prediction.py ├── prediction_ner.py ├── summarisation │ ├── __init__.py │ ├── configuration_bertabs.py │ └── modeling_bertabs.py ├── utils │ ├── __init__.py │ └── spellcheck.py └── utils_squad_evaluate.py ├── images └── lr_finder.png ├── requirements.txt ├── sample_data ├── imdb_movie_reviews │ ├── data │ │ ├── train_sample.csv │ │ └── val_sample.csv │ └── label │ │ └── labels.csv └── multi_label_toxic_comments │ ├── data │ ├── train_sample.csv │ └── val_sample.csv │ └── label │ └── labels.csv ├── sample_notebooks ├── gpu_util.ipynb ├── new-toxic-multilabel.ipynb ├── new-toxic-predict.ipynb └── toxic_comments_sagemaker.ipynb ├── setup.py ├── tag_release.sh └── test ├── multi_class.ipynb ├── summarisation.ipynb └── tokenizer_vocab └── bert-base-uncased-vocab.txt /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503, F403, F401 3 | max-line-length = 79 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distribution 📦 to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v[0-9].[0-9]+.[0-9]+*" # This ensures the action only runs on version tags 7 | 8 | jobs: 9 | build-n-publish: 10 | name: Build and publish to PyPI 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: '3.10' # Specify the Python version 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | 26 | - name: Build package 27 | run: | 28 | python setup.py sdist bdist_wheel 29 | 30 | - name: Publish package to PyPI 31 | uses: pypa/gh-action-pypi-publish@master 32 | with: 33 | user: __token__ 34 | password: ${{ secrets.PYPI_API_TOKEN }} 35 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # Pycharm project settings 101 | .idea 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | .output 110 | cache/* 111 | cached* 112 | 113 | # OS related 114 | .DS_Store -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "black", 3 | "python.pythonPath": "/home/ubuntu/anaconda3/bin/python", 4 | "python.linting.pylintEnabled": false, 5 | "python.linting.flake8Enabled": true, 6 | "python.linting.enabled": true 7 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /VERSION.txt: -------------------------------------------------------------------------------- 1 | 2.0.26 -------------------------------------------------------------------------------- /container/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | ARG ARCH=gpu 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | ARG py_version=3 5 | 6 | # Validate that arguments are specified 7 | RUN test $py_version || exit 1 8 | 9 | RUN echo $py_version 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | ca-certificates \ 17 | libjpeg-dev \ 18 | nginx \ 19 | jq \ 20 | libsm6 \ 21 | libxext6 \ 22 | libxrender-dev \ 23 | nginx \ 24 | libpng-dev && \ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 28 | chmod +x ~/miniconda.sh && \ 29 | ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh && \ 31 | /opt/conda/bin/conda install -y python=3.7 numpy pyyaml scipy ipython mkl mkl-include ninja cython && \ 32 | /opt/conda/bin/conda install -y -c pytorch magma-cuda100 && \ 33 | /opt/conda/bin/conda clean -ya 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | RUN pip install --upgrade pip 37 | 38 | RUN python --version 39 | RUN pip --version 40 | 41 | # #RUN df -a 42 | 43 | RUN pip install --trusted-host pypi.python.org -v --log /tmp/pip.log torch torchvision 44 | 45 | 46 | # Python won’t try to write .pyc or .pyo files on the import of source modules 47 | # Force stdin, stdout and stderr to be totally unbuffered. Good for logging 48 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 49 | 50 | RUN nvcc --version 51 | RUN which nvcc 52 | 53 | RUN pip --no-cache-dir install \ 54 | flask \ 55 | pathlib \ 56 | gevent \ 57 | gunicorn \ 58 | scipy \ 59 | scikit-learn \ 60 | pandas \ 61 | fastprogress \ 62 | python-box \ 63 | tensorboardX 64 | 65 | 66 | # RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 67 | 68 | # RUN pip --no-cache-dir install fast-bert 69 | RUN pip install fast-bert==1.9.15 70 | 71 | RUN pip install cryptography --upgrade && \ 72 | pip install urllib3 --upgrade 73 | 74 | ENV PATH="/opt/ml/code:${PATH}" 75 | COPY /bert /opt/ml/code 76 | 77 | WORKDIR /opt/ml/code 78 | 79 | RUN cd $WORKDIR 80 | 81 | RUN python download_pretrained_models.py --location_dir ./pretrained_models/ --models bert-base-uncased roberta-base distilbert-base-uncased distilroberta-base 82 | 83 | RUN rm -rf /opt/ml/input/data/training/cache/ 84 | -------------------------------------------------------------------------------- /container/batch.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | ARG ARCH=gpu 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | ARG py_version=3 5 | 6 | # Validate that arguments are specified 7 | RUN test $py_version || exit 1 8 | 9 | RUN echo $py_version 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | ca-certificates \ 17 | libjpeg-dev \ 18 | nginx \ 19 | jq \ 20 | libsm6 \ 21 | libxext6 \ 22 | libxrender-dev \ 23 | nginx \ 24 | libpng-dev && \ 25 | rm -rf 
/var/lib/apt/lists/* 26 | 27 | RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 28 | chmod +x ~/miniconda.sh && \ 29 | ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh && \ 31 | /opt/conda/bin/conda install -y python=3.7 numpy pyyaml scipy ipython mkl mkl-include ninja cython && \ 32 | /opt/conda/bin/conda install -y -c pytorch magma-cuda100 && \ 33 | /opt/conda/bin/conda clean -ya 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | RUN pip install --upgrade pip 37 | 38 | RUN python --version 39 | RUN pip --version 40 | 41 | # #RUN df -a 42 | 43 | RUN pip install --trusted-host pypi.python.org -v --log /tmp/pip.log torch torchvision 44 | 45 | 46 | # Python won’t try to write .pyc or .pyo files on the import of source modules 47 | # Force stdin, stdout and stderr to be totally unbuffered. Good for logging 48 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 49 | 50 | RUN nvcc --version 51 | RUN which nvcc 52 | 53 | RUN pip --no-cache-dir install \ 54 | flask \ 55 | pathlib \ 56 | gevent \ 57 | gunicorn \ 58 | scipy \ 59 | scikit-learn \ 60 | pandas \ 61 | fastprogress \ 62 | python-box \ 63 | tensorboardX 64 | 65 | 66 | # RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 67 | 68 | # RUN pip --no-cache-dir install fast-bert 69 | RUN pip install fast-bert==1.9.15 70 | 71 | RUN pip install cryptography --upgrade && \ 72 | pip install urllib3 --upgrade 73 | 74 | ENV PATH="/opt/ml/code:${PATH}" 75 | COPY /bert_batch /opt/ml/code 76 | 77 | WORKDIR /opt/ml/code 78 | 79 | RUN cd $WORKDIR -------------------------------------------------------------------------------- /container/bert/download_pretrained_models.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from tqdm import tqdm 4 | import requests 5 | import urllib3 6 | from transformers import AutoModel, AutoTokenizer 7 | 8 | 9 | def download_pretrained_files(model_name, location): 10 | try: 11 | model_path = model_name.replace("/", ":") 12 | model = AutoModel.from_pretrained(model_name) 13 | model.save_pretrained(location / model_path) 14 | tokenizer = AutoTokenizer.from_pretrained(model_name) 15 | tokenizer.save_pretrained(location / model_path) 16 | except Exception as e: 17 | print(e) 18 | print("error downloading model {}".format(model_name)) 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | 24 | parser.add_argument( 25 | "--location_dir", 26 | default=None, 27 | type=str, 28 | required=True, 29 | help="The location where pretrained model needs to be stored", 30 | ) 31 | 32 | parser.add_argument( 33 | "--models", 34 | default=None, 35 | type=str, 36 | required=True, 37 | nargs="*", 38 | help="download the pretrained models", 39 | ) 40 | 41 | args = parser.parse_args() 42 | print(args) 43 | Path(args.location_dir).mkdir(exist_ok=True) 44 | 45 | # [download_pretrained_files(k, location=Path(args.location_dir)) 46 | # for k, v in BERT_PRETRAINED_MODEL_ARCHIVE_MAP.items()] 47 | [ 48 | download_pretrained_files(item, location=Path(args.location_dir)) 49 | for item in args.models 50 | ] 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /container/bert/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 4; 2 | daemon off; 
# Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | 27 | location ~ ^/(ping|execution-parameters|invocations) { 28 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 29 | proxy_set_header Host $http_host; 30 | proxy_redirect off; 31 | proxy_pass http://gunicorn; 32 | } 33 | 34 | location / { 35 | return 404 "{}"; 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /container/bert/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import json 4 | import pickle 5 | import sys 6 | import signal 7 | import traceback 8 | import re 9 | import flask 10 | import pandas as pd 11 | import torch 12 | from collections import OrderedDict 13 | 14 | from fast_bert.prediction import BertClassificationPredictor 15 | 16 | from fast_bert.utils.spellcheck import BingSpellCheck 17 | from pathlib import Path 18 | 19 | import warnings 20 | 21 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 22 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 23 | 24 | prefix = "/opt/ml/" 25 | 26 | # PATH = Path(os.path.join(prefix, "model")) 27 | PATH = os.path.join(prefix, "model") 28 | 29 | MODEL_PATH = os.path.join(PATH, "pytorch_model.bin") 30 | 31 | # request_text = None 32 | 33 | 34 | class ScoringService(object): 35 | model = None # Where we keep the model when it's loaded 36 | 37 | @classmethod 38 | def get_predictor_model(cls): 39 | 40 | # print(cls.searching_all_files(PATH)) 41 | # Get model predictor 42 | if cls.model is None: 43 | with open(os.path.join(PATH, "model_config.json")) as f: 44 | model_config = json.load(f) 45 | 46 | predictor = BertClassificationPredictor( 47 | os.path.join(PATH, "model_out"), 48 | label_path=PATH, 49 | multi_label=bool(model_config["multi_label"]), 50 | model_type=model_config["model_type"], 51 | do_lower_case=bool(model_config["do_lower_case"]), 52 | ) 53 | cls.model = predictor 54 | 55 | return cls.model 56 | 57 | @classmethod 58 | def predict(cls, text, bing_key=None): 59 | """For the input, do the predictions and return them. 60 | Args: 61 | input (a pandas dataframe): The data on which to do the predictions. There will be 62 | one prediction per row in the dataframe""" 63 | predictor_model = cls.get_predictor_model() 64 | if bing_key: 65 | spellChecker = BingSpellCheck(bing_key) 66 | text = spellChecker.spell_check(text) 67 | prediction = predictor_model.predict(text) 68 | 69 | return prediction 70 | 71 | @classmethod 72 | def predict_batch(cls, texts): 73 | """For the input, do the predictions and return them. 74 | Args: 75 | input (a pandas dataframe): The data on which to do the predictions. 
There will be 76 | one prediction per row in the dataframe""" 77 | predictor_model = cls.get_predictor_model() 78 | output_labels_count = int( 79 | os.environ.get( 80 | "OUTPUT_LABELS_COUNT", len(predictor_model.learner.data.labels) 81 | ) 82 | ) 83 | 84 | print("output_labels_count", output_labels_count) 85 | 86 | predictions = predictor_model.predict_batch(texts) 87 | return cls.process_batch_results( 88 | texts, predictions, labels_count=output_labels_count 89 | ) 90 | 91 | @classmethod 92 | def searching_all_files(cls, directory: Path): 93 | file_list = [] # A list for storing files existing in directories 94 | 95 | for x in directory.iterdir(): 96 | if x.is_file(): 97 | file_list.append(str(x)) 98 | else: 99 | file_list.append(cls.searching_all_files(x)) 100 | 101 | return file_list 102 | 103 | @classmethod 104 | def process_batch_results(cls, texts, results, labels_count=None): 105 | processed_results = [] 106 | for i, result in enumerate(results): 107 | processed = OrderedDict() 108 | processed["text"] = texts[i] 109 | result = result[:labels_count] if labels_count else result 110 | for index, label in enumerate(result): 111 | processed["label_{}".format(index + 1)] = label[0] 112 | processed["confidence_{}".format(index + 1)] = label[1] 113 | processed_results.append(processed) 114 | 115 | return processed_results 116 | 117 | 118 | # The flask app for serving predictions 119 | app = flask.Flask(__name__) 120 | 121 | 122 | @app.route("/ping", methods=["GET"]) 123 | def ping(): 124 | """Determine if the container is working and healthy. In this sample container, we declare 125 | it healthy if we can load the model successfully.""" 126 | health = ( 127 | ScoringService.get_predictor_model() is not None 128 | ) # You can insert a health check here 129 | 130 | status = 200 if health else 404 131 | return flask.Response(response="\n", status=status, mimetype="application/json") 132 | 133 | 134 | @app.route("/execution-parameters", methods=["GET"]) 135 | def get_execution_parameters(): 136 | params = { 137 | "MaxConcurrentTransforms": 3, 138 | "BatchStrategy": "MULTI_RECORD", 139 | "MaxPayloadInMB": 6, 140 | } 141 | return flask.Response( 142 | response=json.dumps(params), status="200", mimetype="application/json" 143 | ) 144 | 145 | 146 | @app.route("/invocations", methods=["POST"]) 147 | def transformation(): 148 | """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert 149 | it to a pandas data frame for internal use and then convert the predictions back to CSV (which really 150 | just means one prediction per line, since there's a single column. 
151 | """ 152 | data = None 153 | text = None 154 | 155 | if flask.request.content_type == "application/json": 156 | print("calling json launched") 157 | data = flask.request.get_json(silent=True) 158 | 159 | text = data["text"] 160 | try: 161 | bing_key = data["bing_key"] 162 | except Exception: 163 | bing_key = None 164 | 165 | # Do the prediction 166 | predictions = ScoringService.predict(text, bing_key) 167 | result = json.dumps(predictions[:10]) 168 | return flask.Response(response=result, status=200, mimetype="application/json") 169 | 170 | elif flask.request.content_type == "text/csv": 171 | data = flask.request.data.decode("utf-8") 172 | df = pd.read_csv(io.StringIO(data), header="infer") 173 | predictions = ScoringService.predict_batch(list(df["text"].values)) 174 | 175 | out = io.StringIO() 176 | pd.DataFrame(predictions).to_csv(out, index=False) 177 | result = out.getvalue() 178 | return flask.Response(response=result, status=200, mimetype="text/csv") 179 | 180 | elif flask.request.content_type == "text/plain": 181 | data = flask.request.data.decode("utf-8") 182 | s = io.StringIO(data) 183 | # convert txt file into list of texts 184 | texts = [] 185 | for line in s: 186 | texts.append(line) 187 | predictions = ScoringService.predict_batch(texts) 188 | out = io.StringIO() 189 | pd.DataFrame(predictions).to_csv(out, index=False) 190 | result = out.getvalue() 191 | return flask.Response(response=result, status=200, mimetype="text/csv") 192 | 193 | else: 194 | return flask.Response( 195 | response="This predictor only supports JSON, txt or CSV data", 196 | status=415, 197 | mimetype="text/plain", 198 | ) 199 | -------------------------------------------------------------------------------- /container/bert/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 
6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | from __future__ import print_function 17 | import multiprocessing 18 | import os 19 | import signal 20 | import subprocess 21 | import sys 22 | 23 | cpu_count = multiprocessing.cpu_count() 24 | 25 | model_server_timeout = os.environ.get("MODEL_SERVER_TIMEOUT", 18000) 26 | model_server_workers = int(os.environ.get("MODEL_SERVER_WORKERS", 4)) 27 | 28 | 29 | def sigterm_handler(nginx_pid, gunicorn_pid): 30 | try: 31 | os.kill(nginx_pid, signal.SIGQUIT) 32 | except OSError: 33 | pass 34 | try: 35 | os.kill(gunicorn_pid, signal.SIGTERM) 36 | except OSError: 37 | pass 38 | 39 | sys.exit(0) 40 | 41 | 42 | def start_server(): 43 | print("Starting the inference server with {} workers.".format(model_server_workers)) 44 | 45 | # link the log streams to stdout/err so they will be logged to the container logs 46 | subprocess.check_call(["ln", "-sf", "/dev/stdout", "/var/log/nginx/access.log"]) 47 | subprocess.check_call(["ln", "-sf", "/dev/stderr", "/var/log/nginx/error.log"]) 48 | 49 | nginx = subprocess.Popen(["nginx", "-c", "/opt/ml/code/nginx.conf"]) 50 | gunicorn = subprocess.Popen( 51 | [ 52 | "gunicorn", 53 | "--timeout", 54 | str(model_server_timeout), 55 | "-k", 56 | "gevent", 57 | "-b", 58 | "unix:/tmp/gunicorn.sock", 59 | "-w", 60 | str(model_server_workers), 61 | "wsgi:app", 62 | ] 63 | ) 64 | 65 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 66 | 67 | # If either subprocess exits, so do we. 68 | pids = set([nginx.pid, gunicorn.pid]) 69 | while True: 70 | pid, _ = os.wait() 71 | if pid in pids: 72 | break 73 | 74 | sigterm_handler(nginx.pid, gunicorn.pid) 75 | print("Inference server exiting") 76 | 77 | 78 | # The main routine just invokes the start function. 
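#
# A minimal usage sketch (assumption: the values shown are illustrative, not defaults
# shipped with any particular image): the two environment variables read near the top
# of this file can be set in the container environment before this script runs, e.g.
#
#   MODEL_SERVER_WORKERS=2 MODEL_SERVER_TIMEOUT=300 ./serve
#
# Left unset, the code above falls back to 4 gunicorn workers and an 18000-second
# request timeout.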
79 | 80 | if __name__ == "__main__": 81 | start_server() 82 | -------------------------------------------------------------------------------- /container/bert/train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import json 5 | import pickle 6 | import sys 7 | import traceback 8 | import pandas as pd 9 | import numpy as np 10 | import random 11 | import datetime 12 | from pathlib import Path 13 | import logging 14 | import torch 15 | import shutil 16 | from transformers import AutoTokenizer 17 | 18 | from fast_bert.data_cls import BertDataBunch 19 | from fast_bert.learner_cls import BertLearner 20 | from fast_bert.metrics import ( 21 | accuracy, 22 | accuracy_multilabel, 23 | accuracy_thresh, 24 | fbeta, 25 | roc_auc, 26 | ) 27 | 28 | run_start_time = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | channel_name = "training" 31 | 32 | prefix = "/opt/ml/" 33 | input_path = prefix + "input/data" # opt/ml/input/data 34 | code_path = prefix + "code" # opt/ml/code 35 | pretrained_model_path = ( 36 | code_path + "/pretrained_models" 37 | ) # opt/ml/code/pretrained_models 38 | 39 | finetuned_path = input_path + "/{}/finetuned".format( 40 | channel_name 41 | ) # opt/ml/input/data/training/finetuned 42 | 43 | output_path = os.path.join(prefix, "output") # opt/ml/output 44 | model_path = os.path.join(prefix, "model") # opt/ml/model 45 | 46 | training_config_path = os.path.join( 47 | input_path, "{}/config".format(channel_name) 48 | ) # opt/ml/input/data/training/config 49 | 50 | hyperparam_path = os.path.join( 51 | prefix, "input/config/hyperparameters.json" 52 | ) # opt/ml/input/config/hyperparameters.json 53 | config_path = os.path.join( 54 | training_config_path, "training_config.json" 55 | ) # opt/ml/input/data/training/config/training_config.json 56 | 57 | 58 | # This algorithm has a single channel of input data called 'training'. Since we run in 59 | # File mode, the input files are copied to the directory specified here. 60 | 61 | training_path = os.path.join(input_path, channel_name) # opt/ml/input/data/training 62 | 63 | 64 | def searching_all_files(directory: Path): 65 | file_list = [] # A list for storing files existing in directories 66 | 67 | for x in directory.iterdir(): 68 | if x.is_file(): 69 | file_list.append(str(x)) 70 | else: 71 | file_list.append(searching_all_files(x)) 72 | 73 | return file_list 74 | 75 | 76 | # The function to execute the training. 
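# A sketch of the two JSON files that train() below expects (assumption: the values
# are placeholders for illustration only; the keys mirror the lookups made inside
# train()):
#
#   opt/ml/input/data/training/config/training_config.json
#     {"model_name": "bert-base-uncased", "model_type": "bert",
#      "multi_label": "False", "fp16": "False", "fp16_opt_level": "O1",
#      "text_col": "text", "label_col": "label",
#      "train_file": "train.csv", "val_file": "val.csv", "label_file": "labels.csv",
#      "grad_accumulation_steps": "1", "logging_steps": "50"}
#
#   opt/ml/input/config/hyperparameters.json
#     {"epochs": "4", "lr": "5e-5", "train_batch_size": "16",
#      "max_seq_length": "256", "warmup_steps": "500",
#      "lr_schedule": "warmup_cosine", "optimizer_type": "adamw"}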
77 | def train(): 78 | 79 | print("Starting the training.") 80 | 81 | DATA_PATH = Path(training_path) 82 | LABEL_PATH = Path(training_path) 83 | 84 | try: 85 | print(config_path) 86 | with open(config_path, "r") as f: 87 | training_config = json.load(f) 88 | print(training_config) 89 | 90 | with open(hyperparam_path, "r") as tc: 91 | hyperparameters = json.load(tc) 92 | print(hyperparameters) 93 | 94 | # convert string bools to booleans 95 | training_config["multi_label"] = training_config["multi_label"] == "True" 96 | training_config["fp16"] = training_config["fp16"] == "True" 97 | training_config["text_col"] = training_config.get("text_col", "text") 98 | training_config["label_col"] = training_config.get("label_col", "label") 99 | training_config["train_file"] = training_config.get("train_file", "train.csv") 100 | training_config["val_file"] = training_config.get("val_file", "val.csv") 101 | training_config["label_file"] = training_config.get("label_file", "labels.csv") 102 | training_config["random_state"] = training_config.get("random_state", None) 103 | 104 | if training_config["random_state"] is not None: 105 | print("setting random state {}".format(training_config["random_state"])) 106 | random_seed(int(training_config["random_state"])) 107 | 108 | # Logger 109 | # logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, training_config["run_text"])) 110 | logging.basicConfig( 111 | level=logging.INFO, 112 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 113 | datefmt="%m/%d/%Y %H:%M:%S", 114 | handlers=[ 115 | # logging.FileHandler(logfile), 116 | logging.StreamHandler(sys.stdout) 117 | ], 118 | ) 119 | 120 | logger = logging.getLogger() 121 | 122 | # Define pretrained model path 123 | PRETRAINED_PATH = Path(pretrained_model_path) / training_config[ 124 | "model_name" 125 | ].replace("/", ":") 126 | if PRETRAINED_PATH.is_dir(): 127 | logger.info("model path used {}".format(PRETRAINED_PATH)) 128 | model_name_path = str(PRETRAINED_PATH) 129 | else: 130 | model_name_path = training_config["model_name"] 131 | logger.info( 132 | "model {} is not preloaded. Will try to download.".format( 133 | model_name_path 134 | ) 135 | ) 136 | 137 | finetuned_model_name = training_config.get("finetuned_model", None) 138 | if finetuned_model_name is not None: 139 | finetuned_model = os.path.join(finetuned_path, finetuned_model_name) 140 | logger.info("finetuned model loaded from {}".format(finetuned_model)) 141 | else: 142 | logger.info( 143 | "finetuned model not available - loading standard pretrained model" 144 | ) 145 | finetuned_model = None 146 | 147 | # use auto-tokenizer 148 | tokenizer = AutoTokenizer.from_pretrained(model_name_path, use_fast=True) 149 | 150 | device = torch.device("cuda") 151 | if torch.cuda.device_count() > 1: 152 | multi_gpu = True 153 | else: 154 | multi_gpu = False 155 | 156 | logger.info("Number of GPUs: {}".format(torch.cuda.device_count())) 157 | 158 | if training_config["multi_label"] is True: 159 | label_col = json.loads(training_config["label_col"]) 160 | else: 161 | label_col = training_config["label_col"] 162 | 163 | logger.info("label columns: {}".format(label_col)) 164 | test_data = None 165 | test_df = None 166 | if training_config.get("test_file", None): 167 | try: 168 | test_df = pd.read_csv(DATA_PATH / training_config["test_file"]) 169 | except Exception: 170 | test_df = pd.read_csv( 171 | DATA_PATH / training_config["test_file"], encoding="latin1" 172 | ) 173 | test_data = list(test_df["text"]) 174 | logger.info("Test file available. 
Test count {}".format(len(test_df))) 175 | 176 | # Create databunch 177 | databunch = BertDataBunch( 178 | DATA_PATH, 179 | LABEL_PATH, 180 | tokenizer, 181 | train_file=training_config["train_file"], 182 | val_file=training_config["val_file"], 183 | label_file=training_config["label_file"], 184 | text_col=training_config["text_col"], 185 | test_data=test_data, 186 | label_col=label_col, 187 | batch_size_per_gpu=int(hyperparameters["train_batch_size"]), 188 | max_seq_length=int(hyperparameters["max_seq_length"]), 189 | multi_gpu=multi_gpu, 190 | multi_label=training_config["multi_label"], 191 | model_type=training_config["model_type"], 192 | logger=logger, 193 | no_cache=True, 194 | ) 195 | 196 | metrics = [] 197 | if training_config["multi_label"] is False: 198 | metrics.append({"name": "accuracy", "function": accuracy}) 199 | else: 200 | metrics.append({"name": "accuracy_thresh", "function": accuracy_thresh}) 201 | metrics.append({"name": "roc_auc", "function": roc_auc}) 202 | metrics.append({"name": "fbeta", "function": fbeta}) 203 | 204 | logger.info("databunch labels: {}".format(len(databunch.labels))) 205 | 206 | # Initialise the learner 207 | learner = BertLearner.from_pretrained_model( 208 | databunch, 209 | model_name_path, 210 | metrics=metrics, 211 | device=device, 212 | logger=logger, 213 | output_dir=Path(model_path), 214 | finetuned_wgts_path=finetuned_model, 215 | is_fp16=training_config["fp16"], 216 | fp16_opt_level=training_config["fp16_opt_level"], 217 | warmup_steps=int(hyperparameters["warmup_steps"]), 218 | grad_accumulation_steps=int(training_config["grad_accumulation_steps"]), 219 | multi_gpu=multi_gpu, 220 | multi_label=training_config["multi_label"], 221 | logging_steps=int(training_config["logging_steps"]), 222 | ) 223 | 224 | learner.fit( 225 | int(hyperparameters["epochs"]), 226 | float(hyperparameters["lr"]), 227 | schedule_type=hyperparameters["lr_schedule"], 228 | optimizer_type=hyperparameters["optimizer_type"], 229 | ) 230 | 231 | results = learner.validate(return_preds=True) 232 | logger.info("y_pred: {}".format(json.dumps(results["y_preds"].tolist()))) 233 | logger.info("y_true: {}".format(json.dumps(results["y_true"].tolist()))) 234 | logger.info("labels: {}".format(json.dumps(databunch.labels))) 235 | 236 | if test_data is not None: 237 | predictions = learner.predict_batch() 238 | results = [] 239 | for index, row in test_df.iterrows(): 240 | preds = predictions[index][:3] 241 | result = {"text": row.get("text"), "ground_truth": row.get("label")} 242 | for i, pred in enumerate(preds): 243 | result["label_{}".format(i + 1)] = pred[0] 244 | result["confidence_{}".format(i + 1)] = pred[1] 245 | 246 | results.append(result) 247 | 248 | # save test results with model outcome 249 | pd.DataFrame(results).to_csv( 250 | os.path.join(model_path, "test_result.csv"), index=None 251 | ) 252 | 253 | pd.DataFrame(results).to_csv( 254 | os.path.join(output_path, "test_result.csv"), index=None 255 | ) 256 | 257 | # save model and tokenizer artefacts 258 | learner.save_model() 259 | 260 | # save model config file 261 | with open(os.path.join(model_path, "model_config.json"), "w") as f: 262 | json.dump(training_config, f) 263 | 264 | # save label file 265 | with open(os.path.join(model_path, "labels.csv"), "w") as f: 266 | f.write("\n".join(databunch.labels)) 267 | 268 | # save label_metadata csv file from LABEL_PATH to model_path 269 | shutil.copyfile( 270 | os.path.join(LABEL_PATH, "labels_metadata.json"), 271 | os.path.join(model_path, "labels_metadata.json"), 272 | ) 
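        # Note: the copy above assumes labels_metadata.json exists alongside the label
        # file in LABEL_PATH; if it is absent, the exception handler below records the
        # failure and the job exits with a non-zero code.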
273 | 274 | except Exception as e: 275 | # Write out an error file. This will be returned as the failureReason in the 276 | # DescribeTrainingJob result. 277 | trc = traceback.format_exc() 278 | with open(os.path.join(output_path, "failure"), "w") as s: 279 | s.write("Exception during training: " + str(e) + "\n" + trc) 280 | # Printing this causes the exception to be in the training job logs, as well. 281 | print("Exception during training: " + str(e) + "\n" + trc, file=sys.stderr) 282 | # A non-zero exit code causes the training job to be marked as Failed. 283 | sys.exit(255) 284 | 285 | 286 | def random_seed(seed_value): 287 | random.seed(seed_value) # Python 288 | np.random.seed(seed_value) # cpu vars 289 | 290 | torch.manual_seed(seed_value) # cpu vars 291 | 292 | if torch.cuda.is_available(): 293 | torch.cuda.manual_seed(seed_value) 294 | torch.cuda.manual_seed_all(seed_value) # gpu vars 295 | torch.backends.cudnn.deterministic = True # needed 296 | torch.backends.cudnn.benchmark = False 297 | 298 | 299 | if __name__ == "__main__": 300 | train() 301 | 302 | # A zero exit code causes the job to be marked a Succeeded. 303 | sys.exit(0) 304 | -------------------------------------------------------------------------------- /container/bert/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 6 | 7 | app = myapp.app -------------------------------------------------------------------------------- /container/bert_batch/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 4; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | 27 | location ~ ^/(ping|execution-parameters|invocations) { 28 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 29 | proxy_set_header Host $http_host; 30 | proxy_redirect off; 31 | proxy_pass http://gunicorn; 32 | } 33 | 34 | location / { 35 | return 404 "{}"; 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /container/bert_batch/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import json 4 | import pickle 5 | import sys 6 | import signal 7 | import traceback 8 | import re 9 | import flask 10 | import pandas as pd 11 | import torch 12 | from collections import OrderedDict 13 | 14 | from fast_bert.prediction import BertClassificationPredictor 15 | 16 | from fast_bert.utils.spellcheck import BingSpellCheck 17 | from pathlib import Path 18 | 19 | import warnings 20 | 21 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 22 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 23 | 24 | prefix = "/opt/ml/" 25 | 26 | # PATH = Path(os.path.join(prefix, "model")) 27 | PATH = os.path.join(prefix, "model") 28 | 29 | MODEL_PATH = os.path.join(PATH, "pytorch_model.bin") 30 | 31 | # 
request_text = None 32 | 33 | 34 | class ScoringService(object): 35 | model = None # Where we keep the model when it's loaded 36 | 37 | @classmethod 38 | def get_predictor_model(cls): 39 | 40 | # print(cls.searching_all_files(PATH)) 41 | # Get model predictor 42 | if cls.model is None: 43 | with open(os.path.join(PATH, "model_config.json")) as f: 44 | model_config = json.load(f) 45 | 46 | predictor = BertClassificationPredictor( 47 | os.path.join(PATH, "model_out"), 48 | label_path=PATH, 49 | multi_label=bool(model_config["multi_label"]), 50 | model_type=model_config["model_type"], 51 | do_lower_case=bool(model_config["do_lower_case"]), 52 | ) 53 | cls.model = predictor 54 | 55 | return cls.model 56 | 57 | @classmethod 58 | def predict(cls, text, bing_key=None): 59 | """For the input, do the predictions and return them. 60 | Args: 61 | input (a pandas dataframe): The data on which to do the predictions. There will be 62 | one prediction per row in the dataframe""" 63 | predictor_model = cls.get_predictor_model() 64 | if bing_key: 65 | spellChecker = BingSpellCheck(bing_key) 66 | text = spellChecker.spell_check(text) 67 | prediction = predictor_model.predict(text) 68 | 69 | return prediction 70 | 71 | @classmethod 72 | def predict_batch(cls, texts): 73 | """For the input, do the predictions and return them. 74 | Args: 75 | input (a pandas dataframe): The data on which to do the predictions. There will be 76 | one prediction per row in the dataframe""" 77 | predictor_model = cls.get_predictor_model() 78 | output_labels_count = int( 79 | os.environ.get( 80 | "OUTPUT_LABELS_COUNT", len(predictor_model.learner.data.labels) 81 | ) 82 | ) 83 | 84 | print("output_labels_count", output_labels_count) 85 | 86 | predictions = predictor_model.predict_batch(texts) 87 | return cls.process_batch_results( 88 | texts, predictions, labels_count=output_labels_count 89 | ) 90 | 91 | @classmethod 92 | def searching_all_files(cls, directory: Path): 93 | file_list = [] # A list for storing files existing in directories 94 | 95 | for x in directory.iterdir(): 96 | if x.is_file(): 97 | file_list.append(str(x)) 98 | else: 99 | file_list.append(cls.searching_all_files(x)) 100 | 101 | return file_list 102 | 103 | @classmethod 104 | def process_batch_results(cls, texts, results, labels_count=None): 105 | processed_results = [] 106 | for i, result in enumerate(results): 107 | processed = OrderedDict() 108 | processed["text"] = texts[i] 109 | result = result[:labels_count] if labels_count else result 110 | for index, label in enumerate(result): 111 | processed["label_{}".format(index + 1)] = label[0] 112 | processed["confidence_{}".format(index + 1)] = label[1] 113 | processed_results.append(processed) 114 | 115 | return processed_results 116 | 117 | 118 | # The flask app for serving predictions 119 | app = flask.Flask(__name__) 120 | 121 | 122 | @app.route("/ping", methods=["GET"]) 123 | def ping(): 124 | """Determine if the container is working and healthy. 
In this sample container, we declare 125 | it healthy if we can load the model successfully.""" 126 | health = ( 127 | ScoringService.get_predictor_model() is not None 128 | ) # You can insert a health check here 129 | 130 | status = 200 if health else 404 131 | return flask.Response(response="\n", status=status, mimetype="application/json") 132 | 133 | 134 | @app.route("/execution-parameters", methods=["GET"]) 135 | def get_execution_parameters(): 136 | params = { 137 | "MaxConcurrentTransforms": 3, 138 | "BatchStrategy": "MULTI_RECORD", 139 | "MaxPayloadInMB": 6, 140 | } 141 | return flask.Response( 142 | response=json.dumps(params), status="200", mimetype="application/json" 143 | ) 144 | 145 | 146 | @app.route("/invocations", methods=["POST"]) 147 | def transformation(): 148 | """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert 149 | it to a pandas data frame for internal use and then convert the predictions back to CSV (which really 150 | just means one prediction per line, since there's a single column. 151 | """ 152 | data = None 153 | text = None 154 | 155 | if flask.request.content_type == "application/json": 156 | print("calling json launched") 157 | data = flask.request.get_json(silent=True) 158 | 159 | text = data["text"] 160 | try: 161 | bing_key = data["bing_key"] 162 | except Exception: 163 | bing_key = None 164 | 165 | # Do the prediction 166 | predictions = ScoringService.predict(text, bing_key) 167 | result = json.dumps(predictions[:10]) 168 | return flask.Response(response=result, status=200, mimetype="application/json") 169 | 170 | elif flask.request.content_type == "text/csv": 171 | data = flask.request.data.decode("utf-8") 172 | df = pd.read_csv(io.StringIO(data), header="infer") 173 | predictions = ScoringService.predict_batch(list(df["text"].values)) 174 | 175 | out = io.StringIO() 176 | pd.DataFrame(predictions).to_csv(out, index=False) 177 | result = out.getvalue() 178 | return flask.Response(response=result, status=200, mimetype="text/csv") 179 | 180 | elif flask.request.content_type == "text/plain": 181 | data = flask.request.data.decode("utf-8") 182 | s = io.StringIO(data) 183 | # convert txt file into list of texts 184 | texts = [] 185 | for line in s: 186 | texts.append(line) 187 | predictions = ScoringService.predict_batch(texts) 188 | out = io.StringIO() 189 | pd.DataFrame(predictions).to_csv(out, index=False) 190 | result = out.getvalue() 191 | return flask.Response(response=result, status=200, mimetype="text/csv") 192 | 193 | else: 194 | return flask.Response( 195 | response="This predictor only supports JSON, txt or CSV data", 196 | status=415, 197 | mimetype="text/plain", 198 | ) 199 | -------------------------------------------------------------------------------- /container/bert_batch/train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import json 5 | import logging 6 | import sys 7 | import traceback 8 | import pandas as pd 9 | import numpy as np 10 | import random 11 | import datetime 12 | from pathlib import Path 13 | import torch 14 | import shutil 15 | import tarfile 16 | from collections import OrderedDict 17 | from transformers import AutoTokenizer 18 | 19 | from fast_bert.data_cls import BertDataBunch 20 | from fast_bert.learner_cls import BertLearner 21 | from fast_bert.metrics import ( 22 | accuracy, 23 | accuracy_multilabel, 24 | accuracy_thresh, 25 | fbeta, 26 | roc_auc, 27 | ) 28 | 29 | run_start_time = 
datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S") 30 | # Logger 31 | # logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, training_config["run_text"])) 32 | logging.basicConfig( 33 | level=logging.INFO, 34 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 35 | datefmt="%m/%d/%Y %H:%M:%S", 36 | handlers=[ 37 | # logging.FileHandler(logfile), 38 | logging.StreamHandler(sys.stdout) 39 | ], 40 | ) 41 | 42 | logger = logging.getLogger() 43 | 44 | channel_name = "training" 45 | 46 | prefix = "/opt/ml/" 47 | input_path = prefix + "input/data" # opt/ml/input/data 48 | output_path = os.path.join(prefix, "model") 49 | 50 | finetuned_path = input_path + "/{}/finetuned".format( 51 | channel_name 52 | ) # opt/ml/input/data/training/finetuned 53 | 54 | training_config_path = os.path.join( 55 | input_path, "{}/config".format(channel_name) 56 | ) # opt/ml/input/data/training/config 57 | 58 | hyperparam_path = os.path.join( 59 | prefix, "input/config/hyperparameters.json" 60 | ) # opt/ml/input/config/hyperparameters.json 61 | config_path = os.path.join( 62 | training_config_path, "training_config.json" 63 | ) # opt/ml/input/data/training/config/training_config.json 64 | 65 | 66 | # This algorithm has a single channel of input data called 'training'. Since we run in 67 | # File mode, the input files are copied to the directory specified here. 68 | 69 | training_path = os.path.join(input_path, channel_name) # opt/ml/input/data/training 70 | 71 | 72 | # The function to execute the training. 73 | def train(): 74 | logger.info("Starting batch inference...") 75 | 76 | DATA_PATH = Path(training_path) 77 | MODEL_PATH = DATA_PATH / "model" 78 | ARTIFACTS_PATH = MODEL_PATH / "model_out" 79 | 80 | # untar model.tar.gz to model directory 81 | with tarfile.open(DATA_PATH / "model.tar.gz", "r:gz") as tar: 82 | tar.extractall(MODEL_PATH) 83 | tar.close() 84 | 85 | try: 86 | with open(config_path, "r") as f: 87 | training_config = json.load(f) 88 | logger.info(training_config) 89 | 90 | with open(hyperparam_path, "r") as tc: 91 | hyperparameters = json.load(tc) 92 | logger.info(hyperparameters) 93 | 94 | # convert string bools to booleans 95 | training_config["multi_label"] = training_config["multi_label"] == "True" 96 | training_config["fp16"] = training_config["fp16"] == "True" 97 | training_config["text_col"] = training_config.get("text_col", "text") 98 | training_config["label_col"] = training_config.get("label_col", "label") 99 | training_config["train_file"] = training_config.get("train_file", "train.csv") 100 | training_config["val_file"] = training_config.get("val_file", "val.csv") 101 | training_config["label_file"] = training_config.get("label_file", "labels.csv") 102 | training_config["random_state"] = training_config.get("random_state", None) 103 | training_config["labels_count"] = int(training_config.get("labels_count", 10)) 104 | if training_config["random_state"] is not None: 105 | print("setting random state {}".format(training_config["random_state"])) 106 | random_seed(int(training_config["random_state"])) 107 | 108 | # use auto-tokenizer 109 | tokenizer = AutoTokenizer.from_pretrained(str(ARTIFACTS_PATH), use_fast=True) 110 | 111 | device = torch.device("cuda") 112 | if torch.cuda.device_count() > 1: 113 | multi_gpu = True 114 | else: 115 | multi_gpu = False 116 | 117 | logger.info("Number of GPUs: {}".format(torch.cuda.device_count())) 118 | 119 | # Create databunch 120 | databunch = BertDataBunch( 121 | MODEL_PATH, 122 | MODEL_PATH, 123 | tokenizer, 124 | 
train_file=None, 125 | val_file=None, 126 | batch_size_per_gpu=int(hyperparameters["train_batch_size"]), 127 | max_seq_length=int(hyperparameters["max_seq_length"]), 128 | multi_gpu=multi_gpu, 129 | multi_label=training_config["multi_label"], 130 | model_type=training_config["model_type"], 131 | logger=logger, 132 | no_cache=True, 133 | ) 134 | 135 | # Initialise the learner 136 | learner = BertLearner.from_pretrained_model( 137 | databunch, 138 | str(ARTIFACTS_PATH), 139 | metrics=[], 140 | device=device, 141 | logger=logger, 142 | output_dir=None, 143 | is_fp16=False, 144 | multi_gpu=multi_gpu, 145 | multi_label=training_config["multi_label"], 146 | logging_steps=0, 147 | ) 148 | 149 | df = pd.read_csv(str(DATA_PATH / "data.csv"), header=None) 150 | df = df.iloc[:, 0:1] 151 | # if first row is header, remove it 152 | if df.iloc[0, 0] == "text": 153 | df = df.iloc[1:] 154 | df.columns = ["text"] 155 | df.dropna(subset=["text"], inplace=True) 156 | 157 | texts = list(df["text"].values) 158 | 159 | predictions = learner.predict_batch(texts) 160 | 161 | processed_predictions = process_batch_results( 162 | texts, results=predictions, labels_count=training_config["labels_count"] 163 | ) 164 | 165 | # save test results with model outcome 166 | pd.DataFrame(processed_predictions).to_csv( 167 | os.path.join(output_path, "out.csv"), index=None 168 | ) 169 | 170 | except Exception as e: 171 | # Write out an error file. This will be returned as the failureReason in the 172 | # DescribeTrainingJob result. 173 | trc = traceback.format_exc() 174 | with open(os.path.join(output_path, "failure"), "w") as s: 175 | s.write("Exception during batch inference: " + str(e) + "\n" + trc) 176 | # Logging this causes the exception to appear in the training job logs, as well. 177 | logger.error( 178 | "Exception during batch inference: " + str(e) + "\n" + trc 179 | ) 180 | # A non-zero exit code causes the training job to be marked as Failed. 181 | sys.exit(255) 182 | 183 | 184 | def process_batch_results(texts, results, labels_count=None): 185 | processed_results = [] 186 | for i, result in enumerate(results): 187 | processed = OrderedDict() 188 | processed["text"] = texts[i] 189 | result = result[:labels_count] if labels_count else result 190 | for index, label in enumerate(result): 191 | processed["label_{}".format(index + 1)] = label[0] 192 | processed["confidence_{}".format(index + 1)] = label[1] 193 | processed_results.append(processed) 194 | 195 | return processed_results 196 | 197 | 198 | def random_seed(seed_value): 199 | random.seed(seed_value) # Python 200 | np.random.seed(seed_value) # cpu vars 201 | 202 | torch.manual_seed(seed_value) # cpu vars 203 | 204 | if torch.cuda.is_available(): 205 | torch.cuda.manual_seed(seed_value) 206 | torch.cuda.manual_seed_all(seed_value) # gpu vars 207 | torch.backends.cudnn.deterministic = True # needed 208 | torch.backends.cudnn.benchmark = False 209 | 210 | 211 | if __name__ == "__main__": 212 | train() 213 | 214 | # A zero exit code causes the job to be marked as Succeeded. 215 | sys.exit(0) 216 | -------------------------------------------------------------------------------- /container/bert_batch/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file.
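As an aside on the batch container above: process_batch_results flattens each text's ranked (label, confidence) pairs into numbered columns before out.csv is written. A small illustrative run with made-up predictions (not real model output), reusing the same logic:

from collections import OrderedDict
import pandas as pd

def process_batch_results(texts, results, labels_count=None):
    # Same logic as the helper in the batch train script above: one row per input
    # text, with the top labels_count (label, confidence) pairs flattened into columns.
    processed_results = []
    for i, result in enumerate(results):
        processed = OrderedDict()
        processed["text"] = texts[i]
        result = result[:labels_count] if labels_count else result
        for index, label in enumerate(result):
            processed["label_{}".format(index + 1)] = label[0]
            processed["confidence_{}".format(index + 1)] = label[1]
        processed_results.append(processed)
    return processed_results

texts = ["great film", "terrible plot"]
# predict_batch returns, per text, (label, confidence) pairs ordered by confidence;
# the values below are invented purely to show the output shape.
fake_predictions = [
    [("positive", 0.97), ("negative", 0.03)],
    [("negative", 0.91), ("positive", 0.09)],
]
rows = process_batch_results(texts, fake_predictions, labels_count=2)
print(pd.DataFrame(rows))
# columns: text, label_1, confidence_1, label_2, confidence_2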
6 | 7 | app = myapp.app -------------------------------------------------------------------------------- /container/build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-fast-bert" 9 | 10 | TAG="$1" 11 | 12 | # parameters 13 | FASTAI_VERSION="1.0" 14 | PY_VERSION="py36" 15 | 16 | # Get the account number associated with the current IAM credentials 17 | account=$(aws sts get-caller-identity --query Account --output text) 18 | 19 | if [ $? -ne 0 ] 20 | then 21 | exit 255 22 | fi 23 | 24 | chmod +x bert/train 25 | chmod +x bert/serve 26 | 27 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 28 | region=$(aws configure get region) 29 | region=${region:-us-west-2} 30 | 31 | # If the repository doesn't exist in ECR, create it. 32 | 33 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 34 | 35 | if [ $? -ne 0 ] 36 | then 37 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 38 | fi 39 | 40 | # Get the login command from ECR and execute it directly 41 | $(aws ecr get-login --region ${region} --no-include-email) 42 | # aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 579360261297.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert 43 | 44 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 45 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 46 | 47 | # loop for each architecture (cpu & gpu) 48 | for arch in gpu 49 | do 50 | echo "Building image with arch=${arch}, region=${region}" 51 | 52 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}" 53 | docker build -t ${IMAGE}:${TAG} --build-arg ARCH="$arch" -f "Dockerfile" . 54 | docker tag ${IMAGE}:${TAG} ${FULLNAME} 55 | docker push ${FULLNAME} 56 | done 57 | -------------------------------------------------------------------------------- /container/build_and_push_batch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-fast-bert" 9 | 10 | TAG="$1" 11 | 12 | # parameters 13 | FASTAI_VERSION="1.0" 14 | PY_VERSION="py36" 15 | 16 | # Get the account number associated with the current IAM credentials 17 | account=$(aws sts get-caller-identity --query Account --output text) 18 | 19 | if [ $? -ne 0 ] 20 | then 21 | exit 255 22 | fi 23 | 24 | chmod +x bert_batch/train 25 | 26 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 27 | region=$(aws configure get region) 28 | region=${region:-us-west-2} 29 | 30 | # If the repository doesn't exist in ECR, create it. 31 | 32 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 33 | 34 | if [ $? 
-ne 0 ] 35 | then 36 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 37 | fi 38 | 39 | # Get the login command from ECR and execute it directly 40 | $(aws ecr get-login --region ${region} --no-include-email) 41 | # aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 579360261297.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert 42 | 43 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 44 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 45 | 46 | # loop for each architecture (cpu & gpu) 47 | for arch in gpu 48 | do 49 | echo "Building image with arch=${arch}, region=${region}" 50 | 51 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}-batch" 52 | docker build -t ${IMAGE}:${TAG}-batch --build-arg ARCH="$arch" -f "batch.Dockerfile" . 53 | docker tag ${IMAGE}:${TAG}-batch ${FULLNAME} 54 | docker push ${FULLNAME} 55 | done 56 | -------------------------------------------------------------------------------- /container/pytorch_build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-sagemaker-fast-bert" 9 | 10 | # parameters 11 | FASTAI_VERSION="1.0" 12 | PY_VERSION="py36" 13 | 14 | # Get the account number associated with the current IAM credentials 15 | account=$(aws sts get-caller-identity --query Account --output text) 16 | 17 | if [ $? -ne 0 ] 18 | then 19 | exit 255 20 | fi 21 | 22 | chmod +x bert/train.py 23 | chmod +x bert/serve.py 24 | 25 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 26 | region=$(aws configure get region) 27 | region=${region:-eu-west-1} 28 | 29 | # If the repository doesn't exist in ECR, create it. 30 | 31 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 32 | 33 | if [ $? -ne 0 ] 34 | then 35 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 36 | fi 37 | 38 | # Get the login command from ECR and execute it directly 39 | $(aws ecr get-login --region ${region} --no-include-email) 40 | 41 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 42 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 43 | 44 | # loop for each architecture (cpu & gpu) 45 | 46 | echo "Building image with arch=gpu, region=${region}" 47 | TAG="pytorch-gpu-${PY_VERSION}" 48 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}" 49 | docker build -t ${IMAGE}:${TAG} --no-cache --build-arg ARCH="gpu" -f "Dockerfile_pytorch_nvidia" . 
50 | docker tag ${IMAGE}:${TAG} ${FULLNAME} 51 | docker push ${FULLNAME} 52 | -------------------------------------------------------------------------------- /container_lm/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | ARG ARCH=gpu 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | ARG py_version=3 5 | 6 | # Validate that arguments are specified 7 | RUN test $py_version || exit 1 8 | 9 | RUN echo $py_version 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | ca-certificates \ 17 | libjpeg-dev \ 18 | nginx \ 19 | jq \ 20 | libsm6 \ 21 | libxext6 \ 22 | libxrender-dev \ 23 | nginx \ 24 | libpng-dev && \ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 28 | chmod +x ~/miniconda.sh && \ 29 | ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh && \ 31 | /opt/conda/bin/conda install -y python=3.7 numpy pyyaml scipy ipython mkl mkl-include ninja cython && \ 32 | /opt/conda/bin/conda install -y -c pytorch magma-cuda100 && \ 33 | /opt/conda/bin/conda clean -ya 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | RUN pip install --upgrade pip 37 | 38 | RUN python --version 39 | RUN pip --version 40 | 41 | # #RUN df -a 42 | 43 | RUN pip install --trusted-host pypi.python.org -v --log /tmp/pip.log torch torchvision 44 | 45 | 46 | # Python won’t try to write .pyc or .pyo files on the import of source modules 47 | # Force stdin, stdout and stderr to be totally unbuffered. Good for logging 48 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 49 | 50 | RUN nvcc --version 51 | RUN which nvcc 52 | 53 | RUN pip --no-cache-dir install \ 54 | flask \ 55 | pathlib \ 56 | gunicorn \ 57 | scipy \ 58 | scikit-learn \ 59 | pandas \ 60 | fastprogress \ 61 | python-box \ 62 | tensorboardX 63 | 64 | # RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 65 | # RUN pip --no-cache-dir install fast-bert 66 | RUN pip install fast-bert==1.9.9 67 | # RUN pip install transformers==3.2.0 68 | 69 | RUN pip install cryptography --upgrade && \ 70 | pip install urllib3 --upgrade 71 | 72 | ENV PATH="/opt/ml/code:${PATH}" 73 | COPY /bert /opt/ml/code 74 | 75 | WORKDIR /opt/ml/code 76 | 77 | RUN cd $WORKDIR 78 | 79 | -------------------------------------------------------------------------------- /container_lm/bert/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | 27 | location ~ ^/(ping|invocations) { 28 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 29 | proxy_set_header Host $http_host; 30 | proxy_redirect off; 31 | proxy_pass http://gunicorn; 32 | } 33 | 34 | location / { 35 | return 404 "{}"; 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- 
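The nginx config above only exposes /ping and /invocations and proxies both to the gunicorn-hosted Flask predictor on port 8080; these are also the two routes SageMaker calls when the image is deployed. A minimal smoke-test client, assuming one of the serving containers in this repo (for example the classification or NER image) is already running locally with port 8080 published and that the requests package is available:

import requests

BASE_URL = "http://localhost:8080"  # assumes the container was started with -p 8080:8080

# Health check: the predictors return 200 once the model has loaded successfully
ping = requests.get(BASE_URL + "/ping", timeout=10)
print("ping:", ping.status_code)

# JSON invocation: the predictors in this repo read a "text" field (plus an optional "bing_key")
payload = {"text": "This movie was surprisingly good."}
response = requests.post(BASE_URL + "/invocations", json=payload, timeout=60)
print("invocations:", response.status_code)
print(response.json())  # e.g. a list of (label, confidence) pairs for the classification image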
/container_lm/bert/train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import json 5 | import pickle 6 | import sys 7 | import traceback 8 | import pandas as pd 9 | import datetime 10 | from pathlib import Path 11 | 12 | import logging 13 | import math 14 | from dataclasses import dataclass, field 15 | from typing import Optional 16 | 17 | import torch 18 | 19 | from transformers import ( 20 | CONFIG_MAPPING, 21 | MODEL_WITH_LM_HEAD_MAPPING, 22 | AutoConfig, 23 | AutoModelWithLMHead, 24 | AutoTokenizer, 25 | DataCollatorForLanguageModeling, 26 | HfArgumentParser, 27 | LineByLineTextDataset, 28 | PreTrainedTokenizer, 29 | TextDataset, 30 | Trainer, 31 | TrainingArguments, 32 | set_seed, 33 | ) 34 | 35 | run_start_time = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S") 36 | 37 | channel_name = "training" 38 | 39 | prefix = "/opt/ml/" 40 | input_path = prefix + "input/data" # opt/ml/input/data 41 | code_path = prefix + "code" # opt/ml/code 42 | 43 | output_path = os.path.join(prefix, "output") # opt/ml/output 44 | model_path = os.path.join(prefix, "model") # opt/ml/model 45 | 46 | training_config_path = os.path.join( 47 | input_path, "{}/config".format(channel_name) 48 | ) # opt/ml/input/data/training/config 49 | 50 | hyperparam_path = os.path.join( 51 | prefix, "input/config/hyperparameters.json" 52 | ) # opt/ml/input/config/hyperparameters.json 53 | config_path = os.path.join( 54 | training_config_path, "training_config.json" 55 | ) # opt/ml/input/data/training/config/training_config.json 56 | 57 | 58 | training_path = os.path.join(input_path, channel_name) # opt/ml/input/data/training 59 | 60 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 61 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 62 | 63 | 64 | @dataclass 65 | class ModelArguments: 66 | """ 67 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 68 | """ 69 | 70 | model_name_or_path: Optional[str] = field( 71 | default=None, 72 | metadata={ 73 | "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch." 74 | }, 75 | ) 76 | model_type: Optional[str] = field( 77 | default=None, 78 | metadata={ 79 | "help": "If training from scratch, pass a model type from the list: " 80 | + ", ".join(MODEL_TYPES) 81 | }, 82 | ) 83 | config_name: Optional[str] = field( 84 | default=None, 85 | metadata={ 86 | "help": "Pretrained config name or path if not the same as model_name" 87 | }, 88 | ) 89 | tokenizer_name: Optional[str] = field( 90 | default=None, 91 | metadata={ 92 | "help": "Pretrained tokenizer name or path if not the same as model_name" 93 | }, 94 | ) 95 | cache_dir: Optional[str] = field( 96 | default=None, 97 | metadata={ 98 | "help": "Where do you want to store the pretrained models downloaded from s3" 99 | }, 100 | ) 101 | 102 | 103 | @dataclass 104 | class DataTrainingArguments: 105 | """ 106 | Arguments pertaining to what data we are going to input our model for training and eval. 107 | """ 108 | 109 | train_data_file: Optional[str] = field( 110 | default=None, metadata={"help": "The input training data file (a text file)."} 111 | ) 112 | eval_data_file: Optional[str] = field( 113 | default=None, 114 | metadata={ 115 | "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." 
116 | }, 117 | ) 118 | line_by_line: bool = field( 119 | default=False, 120 | metadata={ 121 | "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." 122 | }, 123 | ) 124 | 125 | mlm: bool = field( 126 | default=False, 127 | metadata={ 128 | "help": "Train with masked-language modeling loss instead of language modeling." 129 | }, 130 | ) 131 | mlm_probability: float = field( 132 | default=0.15, 133 | metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}, 134 | ) 135 | plm_probability: float = field( 136 | default=1 / 6, 137 | metadata={ 138 | "help": "Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling." 139 | }, 140 | ) 141 | max_span_length: int = field( 142 | default=5, 143 | metadata={ 144 | "help": "Maximum length of a span of masked tokens for permutation language modeling." 145 | }, 146 | ) 147 | 148 | block_size: int = field( 149 | default=-1, 150 | metadata={ 151 | "help": "Optional input sequence length after tokenization." 152 | "The training dataset will be truncated in block of this size for training." 153 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 154 | }, 155 | ) 156 | overwrite_cache: bool = field( 157 | default=False, 158 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 159 | ) 160 | 161 | 162 | def get_dataset( 163 | args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False 164 | ): 165 | file_path = args.eval_data_file if evaluate else args.train_data_file 166 | if args.line_by_line: 167 | return LineByLineTextDataset( 168 | tokenizer=tokenizer, file_path=file_path, block_size=args.block_size 169 | ) 170 | else: 171 | return TextDataset( 172 | tokenizer=tokenizer, 173 | file_path=file_path, 174 | block_size=args.block_size, 175 | overwrite_cache=args.overwrite_cache, 176 | ) 177 | 178 | 179 | # The function to execute the training. 
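For reference, the train() function defined just below reads two JSON files that SageMaker places inside the container: opt/ml/input/data/training/config/training_config.json and opt/ml/input/config/hyperparameters.json. A hedged sketch of a minimal pair covering the keys this script actually consumes; the values are illustrative only, and booleans are passed as the strings "True"/"False" because the script compares against "True":

import json

# Illustrative values, not recommendations; keys mirror what train() reads below.
training_config = {
    "model_name": "bert-base-uncased",
    "train_file": "train.txt",
    "val_file": "val.txt",
    "line_by_line": "True",
    "mlm": "True",
    "mlm_probability": "0.15",
    "block_size": "128",
    "fp16": "False",
    "fp16_opt_level": "O1",
    "use_fast_tokenizer": "True",
    "random_state": "42",
    "grad_accumulation_steps": "1",
    "logging_steps": "50",
}

hyperparameters = {
    "train_batch_size": "8",
    "epochs": "1",
    "lr": "5e-5",
    "warmup_steps": "100",
}

# Inside the container these live under /opt/ml/input/...; written locally here for illustration.
with open("training_config.json", "w") as f:
    json.dump(training_config, f, indent=2)
with open("hyperparameters.json", "w") as f:
    json.dump(hyperparameters, f, indent=2)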
180 | def train(): 181 | 182 | print("Starting the training.") 183 | 184 | DATA_PATH = Path(training_path) 185 | 186 | try: 187 | print(config_path) 188 | with open(config_path, "r") as f: 189 | training_config = json.load(f) 190 | print(training_config) 191 | 192 | with open(hyperparam_path, "r") as tc: 193 | hyperparameters = json.load(tc) 194 | print(hyperparameters) 195 | 196 | # convert string bools to booleans 197 | training_config["train_file"] = training_config.get("train_file", "train.csv") 198 | training_config["val_file"] = training_config.get("val_file", "val.csv") 199 | training_config["fp16"] = training_config["fp16"] == "True" 200 | training_config["line_by_line"] = training_config["line_by_line"] == "True" 201 | training_config["use_fast_tokenizer"] = ( 202 | training_config.get("use_fast_tokenizer", "True") == "True" 203 | ) 204 | training_config["mlm"] = training_config["mlm"] == "True" 205 | training_config["mlm_probability"] = float( 206 | training_config.get("mlm_probability", 0.15) 207 | ) 208 | training_config["block_size"] = int(training_config.get("block_size", -1)) 209 | 210 | training_config["random_state"] = ( 211 | int(training_config.get("random_state")) 212 | if training_config.get("random_state") 213 | else None 214 | ) 215 | 216 | training_config["train_size"] = float(training_config.get("train_size", 0.8)) 217 | 218 | data_args = DataTrainingArguments( 219 | train_data_file=str(DATA_PATH / training_config["train_file"]), 220 | eval_data_file=str(DATA_PATH / training_config["val_file"]), 221 | line_by_line=training_config["line_by_line"], 222 | mlm=training_config["mlm"], 223 | mlm_probability=training_config["mlm_probability"], 224 | block_size=training_config["block_size"], 225 | ) 226 | 227 | training_args = TrainingArguments( 228 | output_dir=model_path, 229 | overwrite_output_dir=True, 230 | do_train=True, 231 | do_eval=True, 232 | evaluate_during_training=True, 233 | per_device_train_batch_size=int(hyperparameters["train_batch_size"]), 234 | per_device_eval_batch_size=int(hyperparameters["train_batch_size"]) * 2, 235 | gradient_accumulation_steps=int(training_config["grad_accumulation_steps"]), 236 | warmup_steps=int(hyperparameters["warmup_steps"]), 237 | logging_steps=int(training_config["logging_steps"]), 238 | fp16=training_config["fp16"], 239 | fp16_opt_level=training_config["fp16_opt_level"], 240 | seed=training_config["random_state"], 241 | num_train_epochs=int(hyperparameters["epochs"]), 242 | learning_rate=float(hyperparameters["lr"]), 243 | save_steps=0, 244 | ) 245 | 246 | # Logger 247 | # logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, training_config["run_text"])) 248 | logging.basicConfig( 249 | level=logging.INFO, 250 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 251 | datefmt="%m/%d/%Y %H:%M:%S", 252 | handlers=[ 253 | # logging.FileHandler(logfile), 254 | logging.StreamHandler(sys.stdout) 255 | ], 256 | ) 257 | 258 | logger = logging.getLogger() 259 | 260 | set_seed(training_args.seed) 261 | 262 | # use auto-tokenizer 263 | tokenizer = AutoTokenizer.from_pretrained( 264 | training_config["model_name"], 265 | use_fast=training_config["use_fast_tokenizer"], 266 | ) 267 | 268 | config = AutoConfig.from_pretrained(training_config["model_name"]) 269 | 270 | model = AutoModelWithLMHead.from_pretrained( 271 | training_config["model_name"], config=config 272 | ) 273 | model.resize_token_embeddings(len(tokenizer)) 274 | 275 | if ( 276 | config.model_type in ["bert", "roberta", "distilbert", "camembert"] 277 | 
and not data_args.mlm 278 | ): 279 | raise ValueError( 280 | "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the" 281 | "--mlm flag (masked language modeling)." 282 | ) 283 | 284 | if data_args.block_size <= 0: 285 | data_args.block_size = tokenizer.max_len 286 | # Our input block size will be the max possible for the model 287 | else: 288 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 289 | 290 | # Get datasets 291 | 292 | train_dataset = get_dataset(data_args, tokenizer=tokenizer) 293 | eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) 294 | 295 | data_collator = DataCollatorForLanguageModeling( 296 | tokenizer=tokenizer, 297 | mlm=data_args.mlm, 298 | mlm_probability=data_args.mlm_probability, 299 | ) 300 | 301 | # Initialize our Trainer 302 | trainer = Trainer( 303 | model=model, 304 | args=training_args, 305 | data_collator=data_collator, 306 | train_dataset=train_dataset, 307 | eval_dataset=eval_dataset, 308 | prediction_loss_only=True, 309 | ) 310 | 311 | # Run pre-validation 312 | if training_args.do_eval: 313 | logger.info("*** Evaluate before training ***") 314 | logger.info(validate(trainer, logger)) 315 | 316 | trainer.train() 317 | 318 | trainer.save_model() 319 | # For convenience, we also re-save the tokenizer to the same directory, 320 | # so that you can share your model easily on huggingface.co/models =) 321 | if trainer.is_world_master(): 322 | tokenizer.save_pretrained(training_args.output_dir) 323 | 324 | # Run validation 325 | if training_args.do_eval: 326 | logger.info("*** Evaluate ***") 327 | # logger.info(validate(trainer, logger)) 328 | 329 | # save model config file 330 | with open(os.path.join(model_path, "model_config.json"), "w") as f: 331 | json.dump(training_config, f) 332 | 333 | except Exception as e: 334 | # Write out an error file. This will be returned as the failureReason in the 335 | # DescribeTrainingJob result. 336 | trc = traceback.format_exc() 337 | with open(os.path.join(output_path, "failure"), "w") as s: 338 | s.write("Exception during training: " + str(e) + "\n" + trc) 339 | # Printing this causes the exception to be in the training job logs, as well. 340 | print("Exception during training: " + str(e) + "\n" + trc, file=sys.stderr) 341 | # A non-zero exit code causes the training job to be marked as Failed. 342 | sys.exit(255) 343 | 344 | 345 | def validate(trainer: Trainer, logger): 346 | results = {} 347 | eval_output = trainer.evaluate() 348 | 349 | perplexity = math.exp(eval_output["eval_loss"]) 350 | result = {"perplexity": perplexity} 351 | 352 | results.update(result) 353 | 354 | return results 355 | 356 | 357 | if __name__ == "__main__": 358 | train() 359 | 360 | # A zero exit code causes the job to be marked a Succeeded. 361 | sys.exit(0) 362 | -------------------------------------------------------------------------------- /container_lm/bert/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 
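A side note on the validate() helper in the language-model train script above: it reports perplexity simply as the exponential of the evaluation loss, for example:

import math

eval_loss = 2.3  # hypothetical average cross-entropy loss from trainer.evaluate()
print(round(math.exp(eval_loss), 2))  # perplexity of roughly 9.97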
6 | 7 | app = myapp.app -------------------------------------------------------------------------------- /container_lm/build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-fast-bert-lm" 9 | 10 | TAG="$1" 11 | 12 | # parameters 13 | FASTAI_VERSION="1.0" 14 | PY_VERSION="py36" 15 | 16 | # Get the account number associated with the current IAM credentials 17 | account=$(aws sts get-caller-identity --query Account --output text) 18 | 19 | if [ $? -ne 0 ] 20 | then 21 | exit 255 22 | fi 23 | 24 | chmod +x bert/train 25 | 26 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 27 | region=$(aws configure get region) 28 | region=${region:-us-west-2} 29 | 30 | # If the repository doesn't exist in ECR, create it. 31 | 32 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 33 | 34 | if [ $? -ne 0 ] 35 | then 36 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 37 | fi 38 | 39 | # Get the login command from ECR and execute it directly 40 | $(aws ecr get-login --region ${region} --no-include-email) 41 | # aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 579360261297.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert 42 | 43 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 44 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 45 | 46 | # loop for each architecture (cpu & gpu) 47 | for arch in gpu 48 | do 49 | echo "Building image with arch=${arch}, region=${region}" 50 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}" 51 | docker build -t ${IMAGE}:${TAG} --build-arg ARCH="$arch" -f "Dockerfile" . 
52 | docker tag ${IMAGE}:${TAG} ${FULLNAME} 53 | docker push ${FULLNAME} 54 | done 55 | -------------------------------------------------------------------------------- /container_ner/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | ARG ARCH=gpu 3 | 4 | ARG py_version=3 5 | 6 | # Validate that arguments are specified 7 | RUN test $py_version || exit 1 8 | 9 | RUN echo $py_version 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | ca-certificates \ 17 | libjpeg-dev \ 18 | nginx \ 19 | jq \ 20 | libsm6 \ 21 | libxext6 \ 22 | libxrender-dev \ 23 | nginx \ 24 | libpng-dev && \ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 28 | chmod +x ~/miniconda.sh && \ 29 | ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh && \ 31 | /opt/conda/bin/conda install -y python=3.7 numpy pyyaml scipy ipython mkl mkl-include ninja cython && \ 32 | /opt/conda/bin/conda install -y -c pytorch magma-cuda100 && \ 33 | /opt/conda/bin/conda clean -ya 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | RUN pip install --upgrade pip 37 | 38 | RUN python --version 39 | RUN pip --version 40 | 41 | # #RUN df -a 42 | 43 | RUN pip install --trusted-host pypi.python.org -v --log /tmp/pip.log torch torchvision 44 | 45 | 46 | # Python won’t try to write .pyc or .pyo files on the import of source modules 47 | # Force stdin, stdout and stderr to be totally unbuffered. Good for logging 48 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 49 | 50 | RUN nvcc --version 51 | RUN which nvcc 52 | 53 | RUN pip --no-cache-dir install \ 54 | flask \ 55 | pathlib \ 56 | gunicorn \ 57 | gevent \ 58 | scipy \ 59 | scikit-learn \ 60 | pandas \ 61 | fastprogress \ 62 | python-box \ 63 | tensorboardX \ 64 | fastai 65 | 66 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 67 | 68 | # RUN pip --no-cache-dir install fast-bert 69 | RUN pip install fast-bert 70 | 71 | ENV PATH="/opt/ml/code:${PATH}" 72 | COPY /bert /opt/ml/code 73 | 74 | WORKDIR /opt/ml/code 75 | 76 | RUN cd $WORKDIR 77 | 78 | RUN python download_pretrained_models.py --location_dir ./pretrained_models/ --models bert-base-uncased roberta-base distilbert-base-uncased distilroberta-base 79 | 80 | RUN rm -rf /opt/ml/input/data/training/cache/ 81 | -------------------------------------------------------------------------------- /container_ner/bert/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | 27 | location ~ ^/(ping|invocations) { 28 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 29 | proxy_set_header Host $http_host; 30 | proxy_redirect off; 31 | proxy_pass http://gunicorn; 32 | } 33 | 34 | location / { 35 | return 404 "{}"; 36 | } 37 | 
} 38 | } -------------------------------------------------------------------------------- /container_ner/bert/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import sys 5 | import signal 6 | import traceback 7 | import re 8 | import flask 9 | 10 | import torch 11 | 12 | from fast_bert.prediction_ner import BertNERPredictor 13 | 14 | from fast_bert.utils.spellcheck import BingSpellCheck 15 | from pathlib import Path 16 | 17 | import warnings 18 | 19 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 20 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 21 | 22 | prefix = "/opt/ml/" 23 | 24 | # PATH = Path(os.path.join(prefix, "model")) 25 | PATH = os.path.join(prefix, "model") 26 | 27 | MODEL_PATH = os.path.join(PATH, "pytorch_model.bin") 28 | 29 | # request_text = None 30 | 31 | 32 | class ScoringService(object): 33 | model = None # Where we keep the model when it's loaded 34 | 35 | @classmethod 36 | def get_predictor_model(cls): 37 | 38 | # print(cls.searching_all_files(PATH)) 39 | # Get model predictor 40 | if cls.model is None: 41 | with open(os.path.join(PATH, "model_config.json")) as f: 42 | model_config = json.load(f) 43 | 44 | predictor = BertNERPredictor( 45 | os.path.join(PATH, "model_out"), 46 | label_path=PATH, 47 | model_type=model_config["model_type"], 48 | do_lower_case=model_config.get("do_lower_case", "True") == "True", 49 | use_fast_tokenizer=model_config.get("use_fast_tokenizer", "True") 50 | == "True", 51 | ) 52 | cls.model = predictor 53 | 54 | return cls.model 55 | 56 | @classmethod 57 | def predict(cls, text, bing_key=None): 58 | """For the input, do the predictions and return them. 59 | Args: 60 | input (a pandas dataframe): The data on which to do the predictions. There will be 61 | one prediction per row in the dataframe""" 62 | predictor_model = cls.get_predictor_model() 63 | if bing_key: 64 | spellChecker = BingSpellCheck(bing_key) 65 | text = spellChecker.spell_check(text) 66 | prediction = predictor_model.predict(text) 67 | 68 | return prediction 69 | 70 | @classmethod 71 | def searching_all_files(cls, directory: Path): 72 | file_list = [] # A list for storing files existing in directories 73 | 74 | for x in directory.iterdir(): 75 | if x.is_file(): 76 | file_list.append(str(x)) 77 | else: 78 | file_list.append(cls.searching_all_files(x)) 79 | 80 | return file_list 81 | 82 | 83 | # The flask app for serving predictions 84 | app = flask.Flask(__name__) 85 | 86 | 87 | @app.route("/ping", methods=["GET"]) 88 | def ping(): 89 | """Determine if the container is working and healthy. In this sample container, we declare 90 | it healthy if we can load the model successfully.""" 91 | health = ( 92 | ScoringService.get_predictor_model() is not None 93 | ) # You can insert a health check here 94 | 95 | status = 200 if health else 404 96 | return flask.Response(response="\n", status=status, mimetype="application/json") 97 | 98 | 99 | @app.route("/invocations", methods=["POST"]) 100 | def transformation(): 101 | """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert 102 | it to a pandas data frame for internal use and then convert the predictions back to CSV (which really 103 | just means one prediction per line, since there's a single column. 
104 | """ 105 | data = None 106 | text = None 107 | 108 | if flask.request.content_type == "application/json": 109 | print("calling json launched") 110 | data = flask.request.get_json(silent=True) 111 | 112 | text = data["text"] 113 | try: 114 | bing_key = data["bing_key"] 115 | except Exception: 116 | bing_key = None 117 | 118 | else: 119 | return flask.Response( 120 | response="This predictor only supports JSON data", 121 | status=415, 122 | mimetype="text/plain", 123 | ) 124 | 125 | print("Invoked with text: {}.".format(text.encode("utf-8"))) 126 | 127 | # Do the prediction 128 | predictions = ScoringService.predict(text, bing_key) 129 | 130 | result = json.dumps(predictions[:10]) 131 | 132 | return flask.Response(response=result, status=200, mimetype="application/json") 133 | -------------------------------------------------------------------------------- /container_ner/bert/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | from __future__ import print_function 17 | import multiprocessing 18 | import os 19 | import signal 20 | import subprocess 21 | import sys 22 | 23 | cpu_count = multiprocessing.cpu_count() 24 | 25 | model_server_timeout = os.environ.get('MODEL_SERVER_TIMEOUT', 60) 26 | model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', cpu_count)) 27 | 28 | def sigterm_handler(nginx_pid, gunicorn_pid): 29 | try: 30 | os.kill(nginx_pid, signal.SIGQUIT) 31 | except OSError: 32 | pass 33 | try: 34 | os.kill(gunicorn_pid, signal.SIGTERM) 35 | except OSError: 36 | pass 37 | 38 | sys.exit(0) 39 | 40 | def start_server(): 41 | print('Starting the inference server with {} workers.'.format(model_server_workers)) 42 | 43 | 44 | # link the log streams to stdout/err so they will be logged to the container logs 45 | subprocess.check_call(['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log']) 46 | subprocess.check_call(['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log']) 47 | 48 | nginx = subprocess.Popen(['nginx', '-c', '/opt/ml/code/nginx.conf']) 49 | gunicorn = subprocess.Popen(['gunicorn', 50 | '--timeout', str(model_server_timeout), 51 | '-k', 'gevent', 52 | '-b', 'unix:/tmp/gunicorn.sock', 53 | '-w', str(model_server_workers), 54 | 'wsgi:app']) 55 | 56 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 57 | 58 | # If either subprocess exits, so do we. 59 | pids = set([nginx.pid, gunicorn.pid]) 60 | while True: 61 | pid, _ = os.wait() 62 | if pid in pids: 63 | break 64 | 65 | sigterm_handler(nginx.pid, gunicorn.pid) 66 | print('Inference server exiting') 67 | 68 | # The main routine just invokes the start function. 
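The serve script above honours two environment variables, MODEL_SERVER_WORKERS (defaults to the CPU core count) and MODEL_SERVER_TIMEOUT (defaults to 60 seconds). A hedged sketch of launching a pushed image locally with both overridden; the image name and model path are placeholders, not values taken from this repo:

import subprocess

cmd = [
    "docker", "run", "--rm", "-p", "8080:8080",
    "-e", "MODEL_SERVER_WORKERS=2",     # overrides the CPU-core-count default in serve
    "-e", "MODEL_SERVER_TIMEOUT=120",   # gunicorn worker timeout in seconds
    "-v", "/path/to/model:/opt/ml/model",  # hypothetical local model artefacts
    "123456789012.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert-ner:latest",  # placeholder image
    "serve",  # the serve script is on PATH inside the image
]
subprocess.run(cmd, check=True)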
69 | 70 | if __name__ == '__main__': 71 | start_server() -------------------------------------------------------------------------------- /container_ner/bert/train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import json 5 | import pickle 6 | import sys 7 | import traceback 8 | import pandas as pd 9 | import datetime 10 | from pathlib import Path 11 | import logging 12 | 13 | import torch 14 | 15 | from transformers import AutoTokenizer 16 | 17 | 18 | from fast_bert.data_ner import BertNERDataBunch 19 | from fast_bert.learner_ner import BertNERLearner 20 | 21 | run_start_time = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S") 22 | 23 | channel_name = "training" 24 | 25 | prefix = "/opt/ml/" 26 | input_path = prefix + "input/data" # opt/ml/input/data 27 | code_path = prefix + "code" # opt/ml/code 28 | pretrained_model_path = ( 29 | code_path + "/pretrained_models" 30 | ) # opt/ml/code/pretrained_models 31 | 32 | finetuned_path = input_path + "/{}/finetuned".format( 33 | channel_name 34 | ) # opt/ml/input/data/training/finetuned 35 | 36 | output_path = os.path.join(prefix, "output") # opt/ml/output 37 | model_path = os.path.join(prefix, "model") # opt/ml/model 38 | 39 | training_config_path = os.path.join( 40 | input_path, "{}/config".format(channel_name) 41 | ) # opt/ml/input/data/training/config 42 | 43 | hyperparam_path = os.path.join( 44 | prefix, "input/config/hyperparameters.json" 45 | ) # opt/ml/input/config/hyperparameters.json 46 | config_path = os.path.join( 47 | training_config_path, "training_config.json" 48 | ) # opt/ml/input/data/training/config/training_config.json 49 | 50 | 51 | # This algorithm has a single channel of input data called 'training'. Since we run in 52 | # File mode, the input files are copied to the directory specified here. 53 | 54 | training_path = os.path.join(input_path, channel_name) # opt/ml/input/data/training 55 | 56 | 57 | def searching_all_files(directory: Path): 58 | file_list = [] # A list for storing files existing in directories 59 | 60 | for x in directory.iterdir(): 61 | if x.is_file(): 62 | file_list.append(str(x)) 63 | else: 64 | file_list.append(searching_all_files(x)) 65 | 66 | return file_list 67 | 68 | 69 | # The function to execute the training. 
70 | def train(): 71 | 72 | print("Starting the training.") 73 | 74 | DATA_PATH = Path(training_path) 75 | 76 | try: 77 | print(config_path) 78 | with open(config_path, "r") as f: 79 | training_config = json.load(f) 80 | print(training_config) 81 | 82 | with open(hyperparam_path, "r") as tc: 83 | hyperparameters = json.load(tc) 84 | print(hyperparameters) 85 | 86 | # convert string bools to booleans 87 | training_config["fp16"] = training_config["fp16"] == "True" 88 | training_config["use_fast_tokenizer"] = ( 89 | training_config.get("use_fast_tokenizer", "True") == "True" 90 | ) 91 | training_config["jsonl_file"] = training_config.get("jsonl_file", "data.jsonl") 92 | 93 | training_config["random_state"] = ( 94 | int(training_config.get("random_state")) 95 | if training_config.get("random_state") 96 | else None 97 | ) 98 | 99 | training_config["train_size"] = float(training_config.get("train_size", 0.8)) 100 | 101 | # Logger 102 | # logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, training_config["run_text"])) 103 | logging.basicConfig( 104 | level=logging.INFO, 105 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 106 | datefmt="%m/%d/%Y %H:%M:%S", 107 | handlers=[ 108 | # logging.FileHandler(logfile), 109 | logging.StreamHandler(sys.stdout) 110 | ], 111 | ) 112 | 113 | logger = logging.getLogger() 114 | 115 | # Define pretrained model path 116 | PRETRAINED_PATH = Path(pretrained_model_path) / training_config["model_name"] 117 | if PRETRAINED_PATH.is_dir(): 118 | logger.info("model path used {}".format(PRETRAINED_PATH)) 119 | model_name_path = str(PRETRAINED_PATH) 120 | else: 121 | model_name_path = training_config["model_name"] 122 | logger.info( 123 | "model {} is not preloaded. Will try to download.".format( 124 | model_name_path 125 | ) 126 | ) 127 | 128 | finetuned_model_name = training_config.get("finetuned_model", None) 129 | if finetuned_model_name is not None: 130 | finetuned_model = os.path.join(finetuned_path, finetuned_model_name) 131 | logger.info("finetuned model loaded from {}".format(finetuned_model)) 132 | else: 133 | logger.info( 134 | "finetuned model not available - loading standard pretrained model" 135 | ) 136 | finetuned_model = None 137 | 138 | # use auto-tokenizer 139 | tokenizer = AutoTokenizer.from_pretrained(model_name_path, use_fast=True) 140 | 141 | device = torch.device("cuda") 142 | if torch.cuda.device_count() > 1: 143 | multi_gpu = True 144 | else: 145 | multi_gpu = False 146 | 147 | logger.info("Number of GPUs: {}".format(torch.cuda.device_count())) 148 | 149 | # Create databunch 150 | databunch = BertNERDataBunch.from_jsonl( 151 | DATA_PATH, 152 | training_config["jsonl_file"], 153 | tokenizer, 154 | clear_cache=True, 155 | batch_size_per_gpu=int(hyperparameters["train_batch_size"]), 156 | max_seq_length=int(hyperparameters["max_seq_length"]), 157 | multi_gpu=multi_gpu, 158 | model_type=training_config["model_type"], 159 | logger=logger, 160 | use_fast_tokenizer=training_config["use_fast_tokenizer"], 161 | train_size=training_config["train_size"], 162 | random_state=training_config["random_state"], 163 | ) 164 | 165 | logger.info("databunch labels: {}".format(len(databunch.labels))) 166 | 167 | # Initialise the learner 168 | learner = BertNERLearner.from_pretrained_model( 169 | databunch, 170 | model_name_path, 171 | output_dir=Path(model_path), 172 | device=device, 173 | logger=logger, 174 | finetuned_wgts_path=finetuned_model, 175 | is_fp16=training_config["fp16"], 176 | fp16_opt_level=training_config["fp16_opt_level"], 177 
| warmup_steps=int(hyperparameters["warmup_steps"]), 178 | grad_accumulation_steps=int(training_config["grad_accumulation_steps"]), 179 | multi_gpu=multi_gpu, 180 | logging_steps=int(training_config["logging_steps"]), 181 | save_steps=int(training_config.get("save_steps", 0)), 182 | ) 183 | 184 | learner.fit(int(hyperparameters["epochs"]), float(hyperparameters["lr"])) 185 | 186 | # Run validation 187 | logger.info(learner.validate()) 188 | 189 | # save model and tokenizer artefacts 190 | learner.save_model() 191 | 192 | # save model config file 193 | with open(os.path.join(model_path, "model_config.json"), "w") as f: 194 | json.dump(training_config, f) 195 | 196 | # save label file 197 | with open(os.path.join(model_path, "labels.txt"), "w") as f: 198 | f.write("\n".join(databunch.labels)) 199 | 200 | except Exception as e: 201 | # Write out an error file. This will be returned as the failureReason in the 202 | # DescribeTrainingJob result. 203 | trc = traceback.format_exc() 204 | with open(os.path.join(output_path, "failure"), "w") as s: 205 | s.write("Exception during training: " + str(e) + "\n" + trc) 206 | # Printing this causes the exception to be in the training job logs, as well. 207 | print("Exception during training: " + str(e) + "\n" + trc, file=sys.stderr) 208 | # A non-zero exit code causes the training job to be marked as Failed. 209 | sys.exit(255) 210 | 211 | 212 | if __name__ == "__main__": 213 | train() 214 | 215 | # A zero exit code causes the job to be marked a Succeeded. 216 | sys.exit(0) 217 | -------------------------------------------------------------------------------- /container_ner/bert/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 6 | 7 | app = myapp.app -------------------------------------------------------------------------------- /container_ner/build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-fast-bert-ner" 9 | 10 | # parameters 11 | FASTAI_VERSION="1.0" 12 | PY_VERSION="py36" 13 | 14 | # Get the account number associated with the current IAM credentials 15 | account=$(aws sts get-caller-identity --query Account --output text) 16 | 17 | if [ $? -ne 0 ] 18 | then 19 | exit 255 20 | fi 21 | 22 | chmod +x bert/train 23 | chmod +x bert/serve 24 | 25 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 26 | region=$(aws configure get region) 27 | region=${region:-us-west-2} 28 | 29 | # If the repository doesn't exist in ECR, create it. 30 | 31 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 32 | 33 | if [ $? 
-ne 0 ] 34 | then 35 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 36 | fi 37 | 38 | # Get the login command from ECR and execute it directly 39 | $(aws ecr get-login --region ${region} --no-include-email) 40 | # aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 579360261297.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert 41 | 42 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 43 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 44 | 45 | # loop for each architecture (cpu & gpu) 46 | for arch in gpu 47 | do 48 | echo "Building image with arch=${arch}, region=${region}" 49 | TAG="${FASTAI_VERSION}-${arch}-${PY_VERSION}" 50 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}" 51 | docker build -t ${IMAGE}:${TAG} --no-cache --build-arg ARCH="$arch" -f "Dockerfile" . 52 | docker tag ${IMAGE}:${TAG} ${FULLNAME} 53 | docker push ${FULLNAME} 54 | done 55 | -------------------------------------------------------------------------------- /container_t5/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | ARG ARCH=gpu 3 | 4 | ARG py_version=3 5 | 6 | # Validate that arguments are specified 7 | RUN test $py_version || exit 1 8 | 9 | RUN echo $py_version 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | ca-certificates \ 17 | libjpeg-dev \ 18 | nginx \ 19 | jq \ 20 | libsm6 \ 21 | libxext6 \ 22 | libxrender-dev \ 23 | nginx \ 24 | libpng-dev && \ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 28 | chmod +x ~/miniconda.sh && \ 29 | ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh && \ 31 | /opt/conda/bin/conda install -y python=3.7 numpy pyyaml scipy ipython mkl mkl-include ninja cython typing && \ 32 | /opt/conda/bin/conda install -y -c pytorch magma-cuda100 && \ 33 | /opt/conda/bin/conda clean -ya 34 | ENV PATH /opt/conda/bin:$PATH 35 | 36 | RUN pip install --upgrade pip 37 | 38 | RUN python --version 39 | RUN pip --version 40 | 41 | # #RUN df -a 42 | RUN pip install --trusted-host pypi.python.org -v --log /tmp/pip.log torch torchvision 43 | 44 | 45 | # Python won’t try to write .pyc or .pyo files on the import of source modules 46 | # Force stdin, stdout and stderr to be totally unbuffered. Good for logging 47 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 48 | 49 | RUN nvcc --version 50 | RUN which nvcc 51 | 52 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 53 | 54 | COPY requirements.txt ./requirements.txt 55 | RUN pip install -r requirements.txt 56 | 57 | ENV PATH="/opt/ml/code:${PATH}" 58 | COPY /t5 /opt/ml/code 59 | 60 | WORKDIR /opt/ml/code 61 | -------------------------------------------------------------------------------- /container_t5/build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. 
This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | IMAGE="fluent-fast-bert-t5" 9 | 10 | # parameters 11 | FASTAI_VERSION="1.0" 12 | PY_VERSION="py36" 13 | 14 | # Get the account number associated with the current IAM credentials 15 | account=$(aws sts get-caller-identity --query Account --output text) 16 | 17 | if [ $? -ne 0 ] 18 | then 19 | exit 255 20 | fi 21 | 22 | chmod +x t5/train 23 | chmod +x t5/serve 24 | 25 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 26 | region=$(aws configure get region) 27 | region=${region:-eu-west-1} 28 | 29 | # If the repository doesn't exist in ECR, create it. 30 | 31 | aws ecr describe-repositories --repository-names "${IMAGE}" > /dev/null 2>&1 32 | 33 | if [ $? -ne 0 ] 34 | then 35 | aws ecr create-repository --repository-name "${IMAGE}" > /dev/null 36 | fi 37 | 38 | # Get the login command from ECR and execute it directly 39 | $(aws ecr get-login --region ${region} --no-include-email) 40 | # aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 579360261297.dkr.ecr.eu-west-1.amazonaws.com/fluent-fast-bert 41 | 42 | # Get the login command from ECR in order to pull down the SageMaker PyTorch image 43 | $(aws ecr get-login --registry-ids 520713654638 --region ${region} --no-include-email) 44 | 45 | # loop for each architecture (cpu & gpu) 46 | for arch in gpu 47 | do 48 | echo "Building image with arch=${arch}, region=${region}" 49 | TAG="${FASTAI_VERSION}-${arch}-${PY_VERSION}" 50 | FULLNAME="${account}.dkr.ecr.${region}.amazonaws.com/${IMAGE}:${TAG}" 51 | docker build -t ${IMAGE}:${TAG} --build-arg ARCH="$arch" . 52 | docker tag ${IMAGE}:${TAG} ${FULLNAME} 53 | docker push ${FULLNAME} 54 | done 55 | -------------------------------------------------------------------------------- /container_t5/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | pathlib 3 | gunicorn 4 | gevent 5 | scipy 6 | scikit-learn 7 | pandas 8 | fastprogress 9 | python-box 10 | tensorboardX 11 | transformers==2.11.0 12 | pytorch_lightning -------------------------------------------------------------------------------- /container_t5/t5/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | 27 | location ~ ^/(ping|invocations) { 28 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 29 | proxy_set_header Host $http_host; 30 | proxy_redirect off; 31 | proxy_pass http://gunicorn; 32 | } 33 | 34 | location / { 35 | return 404 "{}"; 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /container_t5/t5/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import sys 5 | import signal 6 | import traceback 7 | import re 8 | import flask 9 | 10 | import torch 11 | 12 | from 
fast_bert.prediction import BertClassificationPredictor 13 | 14 | from fast_bert.utils.spellcheck import BingSpellCheck 15 | from pathlib import Path 16 | 17 | import warnings 18 | 19 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 20 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 21 | 22 | prefix = "/opt/ml/" 23 | 24 | # PATH = Path(os.path.join(prefix, "model")) 25 | PATH = os.path.join(prefix, "model") 26 | 27 | MODEL_PATH = os.path.join(PATH, "pytorch_model.bin") 28 | 29 | # request_text = None 30 | 31 | 32 | class ScoringService(object): 33 | model = None # Where we keep the model when it's loaded 34 | 35 | @classmethod 36 | def get_predictor_model(cls): 37 | 38 | # print(cls.searching_all_files(PATH)) 39 | # Get model predictor 40 | if cls.model is None: 41 | with open(os.path.join(PATH, "model_config.json")) as f: 42 | model_config = json.load(f) 43 | 44 | predictor = BertClassificationPredictor( 45 | os.path.join(PATH, "model_out"), 46 | label_path=PATH, 47 | multi_label=bool(model_config["multi_label"]), 48 | model_type=model_config["model_type"], 49 | do_lower_case=bool(model_config["do_lower_case"]), 50 | ) 51 | cls.model = predictor 52 | 53 | return cls.model 54 | 55 | @classmethod 56 | def predict(cls, text, bing_key=None): 57 | """For the input, do the predictions and return them. 58 | Args: 59 | input (a pandas dataframe): The data on which to do the predictions. There will be 60 | one prediction per row in the dataframe""" 61 | predictor_model = cls.get_predictor_model() 62 | if bing_key: 63 | spellChecker = BingSpellCheck(bing_key) 64 | text = spellChecker.spell_check(text) 65 | prediction = predictor_model.predict(text) 66 | 67 | return prediction 68 | 69 | @classmethod 70 | def searching_all_files(cls, directory: Path): 71 | file_list = [] # A list for storing files existing in directories 72 | 73 | for x in directory.iterdir(): 74 | if x.is_file(): 75 | file_list.append(str(x)) 76 | else: 77 | file_list.append(cls.searching_all_files(x)) 78 | 79 | return file_list 80 | 81 | 82 | # The flask app for serving predictions 83 | app = flask.Flask(__name__) 84 | 85 | 86 | @app.route("/ping", methods=["GET"]) 87 | def ping(): 88 | """Determine if the container is working and healthy. In this sample container, we declare 89 | it healthy if we can load the model successfully.""" 90 | health = ( 91 | ScoringService.get_predictor_model() is not None 92 | ) # You can insert a health check here 93 | 94 | status = 200 if health else 404 95 | return flask.Response(response="\n", status=status, mimetype="application/json") 96 | 97 | 98 | # @app.route("/execution-parameters", method=["GET"]) 99 | # def get_execution_parameters(): 100 | # params = { 101 | # "MaxConcurrentTransforms": 3, 102 | # "BatchStrategy": "MULTI_RECORD", 103 | # "MaxPayloadInMB": 6, 104 | # } 105 | # return flask.Response( 106 | # response=json.dumps(params), status="200", mimetype="application/json" 107 | # ) 108 | 109 | 110 | @app.route("/invocations", methods=["POST"]) 111 | def transformation(): 112 | """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert 113 | it to a pandas data frame for internal use and then convert the predictions back to CSV (which really 114 | just means one prediction per line, since there's a single column. 
115 | """ 116 | data = None 117 | text = None 118 | 119 | if flask.request.content_type == "application/json": 120 | print("calling json launched") 121 | data = flask.request.get_json(silent=True) 122 | 123 | text = data["text"] 124 | try: 125 | bing_key = data["bing_key"] 126 | except Exception: 127 | bing_key = None 128 | 129 | else: 130 | return flask.Response( 131 | response="This predictor only supports JSON data", 132 | status=415, 133 | mimetype="text/plain", 134 | ) 135 | 136 | print("Invoked with text: {}.".format(text.encode("utf-8"))) 137 | 138 | # Do the prediction 139 | predictions = ScoringService.predict(text, bing_key) 140 | 141 | result = json.dumps(predictions[:10]) 142 | 143 | return flask.Response(response=result, status=200, mimetype="application/json") 144 | -------------------------------------------------------------------------------- /container_t5/t5/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | from __future__ import print_function 17 | import multiprocessing 18 | import os 19 | import signal 20 | import subprocess 21 | import sys 22 | 23 | cpu_count = multiprocessing.cpu_count() 24 | 25 | model_server_timeout = os.environ.get('MODEL_SERVER_TIMEOUT', 60) 26 | model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', cpu_count)) 27 | 28 | def sigterm_handler(nginx_pid, gunicorn_pid): 29 | try: 30 | os.kill(nginx_pid, signal.SIGQUIT) 31 | except OSError: 32 | pass 33 | try: 34 | os.kill(gunicorn_pid, signal.SIGTERM) 35 | except OSError: 36 | pass 37 | 38 | sys.exit(0) 39 | 40 | def start_server(): 41 | print('Starting the inference server with {} workers.'.format(model_server_workers)) 42 | 43 | 44 | # link the log streams to stdout/err so they will be logged to the container logs 45 | subprocess.check_call(['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log']) 46 | subprocess.check_call(['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log']) 47 | 48 | nginx = subprocess.Popen(['nginx', '-c', '/opt/ml/code/nginx.conf']) 49 | gunicorn = subprocess.Popen(['gunicorn', 50 | '--timeout', str(model_server_timeout), 51 | '-k', 'gevent', 52 | '-b', 'unix:/tmp/gunicorn.sock', 53 | '-w', str(model_server_workers), 54 | 'wsgi:app']) 55 | 56 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 57 | 58 | # If either subprocess exits, so do we. 59 | pids = set([nginx.pid, gunicorn.pid]) 60 | while True: 61 | pid, _ = os.wait() 62 | if pid in pids: 63 | break 64 | 65 | sigterm_handler(nginx.pid, gunicorn.pid) 66 | print('Inference server exiting') 67 | 68 | # The main routine just invokes the start function. 
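# For illustration (hypothetical invocation, assuming the usual SageMaker
# convention of running the image with the "serve" argument), the two
# environment variables documented above can be overridden at run time:
#
#   docker run -e MODEL_SERVER_WORKERS=2 -e MODEL_SERVER_TIMEOUT=120 \
#       fluent-fast-bert-t5:1.0-gpu-py36 serve
#
# With no overrides, gunicorn starts one worker per CPU core and times requests
# out after 60 seconds, as read from os.environ above.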
69 | 70 | if __name__ == '__main__': 71 | start_server() -------------------------------------------------------------------------------- /container_t5/t5/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 6 | 7 | app = myapp.app -------------------------------------------------------------------------------- /deploy_pip.sh: -------------------------------------------------------------------------------- 1 | rm -rf dist 2 | python3 setup.py sdist bdist_wheel 3 | twine upload dist/* -------------------------------------------------------------------------------- /fast_bert/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import BertForMultiLabelSequenceClassification 2 | 3 | # from .data import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features 4 | from .data_cls import ( 5 | BertDataBunch, 6 | InputExample, 7 | InputFeatures, 8 | MultiLabelTextProcessor, 9 | convert_examples_to_features, 10 | ) 11 | 12 | 13 | from .learner_cls import BertLearner 14 | 15 | 16 | # from .prediction import BertClassificationPredictor 17 | from .utils.spellcheck import BingSpellCheck 18 | 19 | 20 | 21 | from .onnx_helper import * 22 | -------------------------------------------------------------------------------- /fast_bert/bert_layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class BertLayerNorm(nn.Module): 6 | def __init__(self, hidden_size, eps=1e-12): 7 | """Construct a layernorm module in the TF style (epsilon inside the square root). 8 | """ 9 | super(BertLayerNorm, self).__init__() 10 | self.weight = nn.Parameter(torch.ones(hidden_size)) 11 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 12 | self.variance_epsilon = eps 13 | 14 | def forward(self, x): 15 | u = x.mean(-1, keepdim=True) 16 | s = (x - u).pow(2).mean(-1, keepdim=True) 17 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 18 | return self.weight * x + self.bias 19 | -------------------------------------------------------------------------------- /fast_bert/data_abs.py: -------------------------------------------------------------------------------- 1 | import re 2 | import html 3 | import logging 4 | import pandas as pd 5 | import os 6 | import random 7 | import torch 8 | from pathlib import Path 9 | import pickle 10 | import shutil 11 | from collections import deque, namedtuple 12 | from torch.utils.data import Dataset, DataLoader, SequentialSampler 13 | from tokenizers import BertWordPieceTokenizer 14 | from transformers import BertTokenizer 15 | 16 | Batch = namedtuple( 17 | "Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"] 18 | ) 19 | 20 | 21 | class SummarizationDataset(Dataset): 22 | """ Abstracts the dataset used to train seq2seq models. 23 | The class will process the documents that are located in the specified 24 | folder. The preprocessing will work on any document that is reasonably 25 | formatted. On the CNN/DailyMail dataset it will extract both the story 26 | and the summary. 27 | CNN/Daily News: 28 | The CNN/Daily News raw datasets are downloaded from [1]. 
The stories are 29 | stored in different files; the summary appears at the end of the story as 30 | sentences that are prefixed by the special `@highlight` line. To process 31 | the data, untar both datasets in the same folder, and pass the path to this 32 | folder as the "data_dir argument. The formatting code was inspired by [2]. 33 | [1] https://cs.nyu.edu/~kcho/ 34 | [2] https://github.com/abisee/cnn-dailymail/ 35 | """ 36 | 37 | def __init__(self, path="", prefix="train"): 38 | """ We initialize the class by listing all the documents to summarize. 39 | Files are not read in memory due to the size of some datasets (like CNN/DailyMail). 40 | """ 41 | assert os.path.isdir(path) 42 | 43 | self.documents = [] 44 | filenames_list = os.listdir(path) 45 | for filename in filenames_list: 46 | if "summary" in filename: 47 | continue 48 | path_to_text = os.path.join(path, filename) 49 | if not os.path.isfile(path_to_text): 50 | continue 51 | self.documents.append(path_to_text) 52 | 53 | def __len__(self): 54 | """ Returns the number of documents. """ 55 | return len(self.documents) 56 | 57 | def __getitem__(self, idx): 58 | document_path = self.documents[idx] 59 | document_name = document_path.split("/")[-1] 60 | with open(document_path, encoding="utf-8") as source: 61 | raw_doc = source.read() 62 | doc_lines = process_document(raw_doc) 63 | return document_name, doc_lines, [] 64 | 65 | 66 | class SummarizationInMemoryDataset(Dataset): 67 | """ Abstracts the dataset used to train seq2seq models. 68 | The class will process the documents that are located in the specified 69 | folder. The preprocessing will work on any document that is reasonably 70 | formatted. On the CNN/DailyMail dataset it will extract both the story 71 | and the summary. 72 | CNN/Daily News: 73 | The CNN/Daily News raw datasets are downloaded from [1]. The stories are 74 | stored in different files; the summary appears at the end of the story as 75 | sentences that are prefixed by the special `@highlight` line. To process 76 | the data, untar both datasets in the same folder, and pass the path to this 77 | folder as the "data_dir argument. The formatting code was inspired by [2]. 78 | [1] https://cs.nyu.edu/~kcho/ 79 | [2] https://github.com/abisee/cnn-dailymail/ 80 | """ 81 | 82 | def __init__(self, texts=[]): 83 | """ We initialize the class by listing all the documents to summarize. 84 | Files are not read in memory due to the size of some datasets (like CNN/DailyMail). 85 | """ 86 | self.documents = texts 87 | 88 | def __len__(self): 89 | """ Returns the number of documents. """ 90 | return len(self.documents) 91 | 92 | def __getitem__(self, idx): 93 | raw_doc = self.documents[idx] 94 | doc_lines = process_document(raw_doc) 95 | 96 | return None, doc_lines, [] 97 | 98 | 99 | def process_document(raw_doc): 100 | """ Extract the story and summary from a story file. 101 | Attributes: 102 | raw_story (str): content of the story file as an utf-8 encoded string. 103 | Raises: 104 | IndexError: If the stoy is empty or contains no highlights. 
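    Example (illustrative):
        process_document("The cat sat\n\nIt purred\n@highlight\nA cat story")
        returns ["The cat sat.", "It purred."]: everything up to the first
        `@highlight` line is kept, with missing end-of-sentence periods added.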
105 | """ 106 | nonempty_lines = list( 107 | filter(lambda x: len(x) != 0, [line.strip() for line in raw_doc.split("\n")]) 108 | ) 109 | 110 | # for some unknown reason some lines miss a period, add it 111 | nonempty_lines = [_add_missing_period(line) for line in nonempty_lines] 112 | 113 | # gather article lines 114 | doc_lines = [] 115 | lines = deque(nonempty_lines) 116 | while True: 117 | try: 118 | element = lines.popleft() 119 | if element.startswith("@highlight"): 120 | break 121 | doc_lines.append(element) 122 | except IndexError: 123 | # if "@highlight" is absent from the file we pop 124 | # all elements until there is None, raising an exception. 125 | return doc_lines 126 | 127 | return doc_lines 128 | 129 | 130 | def _add_missing_period(line): 131 | END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"] 132 | if line.startswith("@highlight"): 133 | return line 134 | if line[-1] in END_TOKENS: 135 | return line 136 | return line + "." 137 | 138 | 139 | # Abstractive databunch 140 | class BertAbsDataBunch(object): 141 | def __init__( 142 | self, 143 | tokenizer, 144 | device, 145 | data_dir=None, 146 | test_data=None, 147 | batch_size_per_gpu=16, 148 | max_seq_length=512, 149 | multi_gpu=True, 150 | multi_label=False, 151 | model_type="bert", 152 | logger=None, 153 | clear_cache=False, 154 | no_cache=False, 155 | ): 156 | 157 | # just in case someone passes string instead of Path 158 | if isinstance(data_dir, str): 159 | data_dir = Path(data_dir) 160 | 161 | if isinstance(tokenizer, str): 162 | # instantiate the new tokeniser object using the tokeniser name 163 | tokenizer = BertTokenizer.from_pretrained( 164 | "bert-base-uncased", do_lower_case=True 165 | ) 166 | self.tokenizer = tokenizer 167 | 168 | if type(self.tokenizer) == BertWordPieceTokenizer: 169 | 170 | self.tokenizer.cls_token_id = self.tokenizer.token_to_id("[CLS]") 171 | self.tokenizer.pad_token_id = self.tokenizer.token_to_id("[PAD]") 172 | 173 | self.max_seq_length = max_seq_length 174 | self.batch_size_per_gpu = batch_size_per_gpu 175 | self.device = device 176 | if data_dir: 177 | self.data_dir = data_dir 178 | self.cache_dir = data_dir / "lm_cache" 179 | # Create folder if it doesn't exist 180 | self.cache_dir.mkdir(exist_ok=True) 181 | self.no_cache = no_cache 182 | if clear_cache: 183 | shutil.rmtree(self.cache_dir, ignore_errors=True) 184 | else: 185 | self.no_cache = True 186 | self.data_dir = None 187 | 188 | self.model_type = model_type 189 | if logger is None: 190 | logger = logging.getLogger() 191 | self.logger = logger 192 | self.n_gpu = 1 193 | if multi_gpu: 194 | self.n_gpu = torch.cuda.device_count() 195 | 196 | # get dataset 197 | if self.data_dir: 198 | dataset = SummarizationDataset(self.data_dir) 199 | elif test_data: 200 | dataset = SummarizationInMemoryDataset(test_data) 201 | else: 202 | dataset = None 203 | 204 | if dataset: 205 | sampler = SequentialSampler(dataset) 206 | 207 | collate_fn = lambda data: collate( 208 | data, self.tokenizer, block_size=self.max_seq_length, device=self.device 209 | ) 210 | 211 | self.test_dl = DataLoader( 212 | dataset, 213 | sampler=sampler, 214 | batch_size=self.batch_size_per_gpu, 215 | collate_fn=collate_fn, 216 | ) 217 | else: 218 | self.test_dl = None 219 | 220 | def get_dl_from_texts(self, texts): 221 | 222 | dataset = SummarizationInMemoryDataset(texts) 223 | 224 | sampler = SequentialSampler(dataset) 225 | 226 | collate_fn = lambda data: collate( 227 | data, self.tokenizer, block_size=self.max_seq_length, device=self.device 228 | 
) 229 | return DataLoader( 230 | dataset, 231 | sampler=sampler, 232 | batch_size=self.batch_size_per_gpu, 233 | collate_fn=collate_fn, 234 | ) 235 | 236 | 237 | def collate(data, tokenizer, block_size, device): 238 | """ Collate formats the data passed to the data loader. 239 | In particular we tokenize the data batch after batch to avoid keeping them 240 | all in memory. We output the data as a namedtuple to fit the original BertAbs's 241 | API. 242 | """ 243 | data = [x for x in data if not len(x[1]) == 0] # remove empty_files 244 | names = [name for name, _, _ in data] 245 | summaries = [" ".join(summary_list) for _, _, summary_list in data] 246 | 247 | if type(tokenizer) == BertWordPieceTokenizer: 248 | encoded_text = [ 249 | encode_for_summarization_new_tokenizer(story, summary, tokenizer) 250 | for _, story, summary in data 251 | ] 252 | else: 253 | encoded_text = [ 254 | encode_for_summarization(story, summary, tokenizer) 255 | for _, story, summary in data 256 | ] 257 | encoded_stories = torch.tensor( 258 | [ 259 | fit_to_block_size(story, block_size, tokenizer.pad_token_id) 260 | for story, _ in encoded_text 261 | ] 262 | ) 263 | encoder_token_type_ids = compute_token_type_ids( 264 | encoded_stories, tokenizer.cls_token_id 265 | ) 266 | encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id) 267 | 268 | batch = Batch( 269 | document_names=names, 270 | batch_size=len(encoded_stories), 271 | src=encoded_stories.to(device), 272 | segs=encoder_token_type_ids.to(device), 273 | mask_src=encoder_mask.to(device), 274 | tgt_str=summaries, 275 | ) 276 | 277 | return batch 278 | 279 | 280 | def encode_for_summarization(story_lines, summary_lines, tokenizer): 281 | """ Encode the story and summary lines, and join them 282 | as specified in [1] by using `[SEP] [CLS]` tokens to separate 283 | sentences. 284 | """ 285 | story_lines_token_ids = [tokenizer.encode(line) for line in story_lines] 286 | story_token_ids = [ 287 | token for sentence in story_lines_token_ids for token in sentence 288 | ] 289 | summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines] 290 | summary_token_ids = [ 291 | token for sentence in summary_lines_token_ids for token in sentence 292 | ] 293 | 294 | return story_token_ids, summary_token_ids 295 | 296 | 297 | def encode_for_summarization_new_tokenizer(story_lines, summary_lines, tokenizer): 298 | """ Encode the story and summary lines, and join them 299 | as specified in [1] by using `[SEP] [CLS]` tokens to separate 300 | sentences. 301 | """ 302 | story_lines_token_ids = [tokenizer.encode(line).ids for line in story_lines] 303 | story_token_ids = [ 304 | token for sentence in story_lines_token_ids for token in sentence 305 | ] 306 | summary_lines_token_ids = [tokenizer.encode(line).ids for line in summary_lines] 307 | summary_token_ids = [ 308 | token for sentence in summary_lines_token_ids for token in sentence 309 | ] 310 | 311 | return story_token_ids, summary_token_ids 312 | 313 | 314 | def fit_to_block_size(sequence, block_size, pad_token_id): 315 | """ Adapt the source and target sequences' lengths to the block size. 316 | If the sequence is shorter we append padding token to the right of the sequence. 317 | """ 318 | if len(sequence) > block_size: 319 | return sequence[:block_size] 320 | else: 321 | sequence.extend([pad_token_id] * (block_size - len(sequence))) 322 | return sequence 323 | 324 | 325 | def build_mask(sequence, pad_token_id): 326 | """ Builds the mask. 
The attention mechanism will only attend to positions 327 | with value 1. """ 328 | mask = torch.ones_like(sequence) 329 | idx_pad_tokens = sequence == pad_token_id 330 | mask[idx_pad_tokens] = 0 331 | return mask 332 | 333 | 334 | def compute_token_type_ids(batch, separator_token_id): 335 | """ Segment embeddings as described in [1] 336 | The values {0,1} were found in the repository [2]. 337 | Attributes: 338 | batch: torch.Tensor, size [batch_size, block_size] 339 | Batch of input. 340 | separator_token_id: int 341 | The value of the token that separates the segments. 342 | [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders." 343 | arXiv preprint arXiv:1908.08345 (2019). 344 | [2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217) 345 | """ 346 | batch_embeddings = [] 347 | for sequence in batch: 348 | sentence_num = -1 349 | embeddings = [] 350 | for s in sequence: 351 | if s == separator_token_id: 352 | sentence_num += 1 353 | embeddings.append(sentence_num % 2) 354 | batch_embeddings.append(embeddings) 355 | return torch.tensor(batch_embeddings) 356 | -------------------------------------------------------------------------------- /fast_bert/data_lm.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | import re 3 | import html 4 | import logging 5 | import pandas as pd 6 | import os 7 | import random 8 | import torch 9 | from pathlib import Path 10 | import pickle 11 | import shutil 12 | import itertools 13 | import more_itertools 14 | 15 | 16 | from torch.utils.data import ( 17 | TensorDataset, 18 | DataLoader, 19 | RandomSampler, 20 | SequentialSampler, 21 | Dataset, 22 | ) 23 | from torch.utils.data.distributed import DistributedSampler 24 | 25 | from tqdm import tqdm, trange 26 | from fastprogress.fastprogress import master_bar, progress_bar 27 | 28 | from transformers import ( 29 | WEIGHTS_NAME, 30 | BertConfig, 31 | BertForSequenceClassification, 32 | BertTokenizer, 33 | XLMConfig, 34 | XLMForSequenceClassification, 35 | XLMTokenizer, 36 | XLNetConfig, 37 | XLNetForSequenceClassification, 38 | XLNetTokenizer, 39 | RobertaConfig, 40 | RobertaForSequenceClassification, 41 | RobertaTokenizer, 42 | DistilBertConfig, 43 | DistilBertForSequenceClassification, 44 | DistilBertTokenizer, 45 | CamembertConfig, 46 | CamembertForSequenceClassification, 47 | CamembertTokenizer, 48 | ElectraConfig, 49 | ElectraForSequenceClassification, 50 | ElectraTokenizer, 51 | ) 52 | 53 | MODEL_CLASSES = { 54 | "bert": (BertConfig, BertForSequenceClassification, BertTokenizer), 55 | "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 56 | "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer), 57 | "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), 58 | "distilbert": ( 59 | DistilBertConfig, 60 | DistilBertForSequenceClassification, 61 | DistilBertTokenizer, 62 | ), 63 | "camembert-base": ( 64 | CamembertConfig, 65 | CamembertForSequenceClassification, 66 | CamembertTokenizer, 67 | ), 68 | "electra": (ElectraConfig, ElectraForSequenceClassification, ElectraTokenizer), 69 | } 70 | 71 | # Create text corpus suitable for language model training 72 | 73 | 74 | def create_corpus(text_list, target_path, logger=None): 75 | 76 | with open(target_path, "w") as f: 77 | # Split sentences for each document 78 | logger.info("Formatting corpus for {}".format(target_path)) 79 | for text in progress_bar(text_list): 
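            # Note: the helpers below (defined further down in this module) fix
            # common HTML escape artefacts, collapse repeated newlines and
            # repeated spaces, and pad "/", "#" and newlines with spaces so they
            # survive tokenisation; each cleaned document is then written out as
            # a single line of the corpus file.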
80 | 81 | text = fix_html(text) 82 | text = replace_multi_newline(text) 83 | text = spec_add_spaces(text) 84 | text = rm_useless_spaces(text) 85 | text = text.strip() 86 | 87 | f.write(text + "\n") 88 | 89 | 90 | # text_lines = [re.sub(r"\n(\s)*","",str(sent)) for i, sent in enumerate(nlp(str(text)).sents)] 91 | # text_lines = [text_line for text_line in text_lines if re.search(r'[a-zA-Z]', text_line)] 92 | 93 | # f.write('\n'.join(text_lines)) 94 | # f.write("\n \n") 95 | 96 | 97 | def spec_add_spaces(t: str) -> str: 98 | "Add spaces around / and # in `t`. \n" 99 | return re.sub(r"([/#\n])", r" \1 ", t) 100 | 101 | 102 | def rm_useless_spaces(t: str) -> str: 103 | "Remove multiple spaces in `t`." 104 | return re.sub(" {2,}", " ", t) 105 | 106 | 107 | def replace_multi_newline(t: str) -> str: 108 | return re.sub(r"(\n(\s)*){2,}", "\n", t) 109 | 110 | 111 | def fix_html(x: str) -> str: 112 | "List of replacements from html strings in `x`." 113 | re1 = re.compile(r" +") 114 | x = ( 115 | x.replace("#39;", "'") 116 | .replace("amp;", "&") 117 | .replace("#146;", "'") 118 | .replace("nbsp;", " ") 119 | .replace("#36;", "$") 120 | .replace("\\n", "\n") 121 | .replace("quot;", "'") 122 | .replace("
", "\n") 123 | .replace('\\"', '"') 124 | .replace(" @.@ ", ".") 125 | .replace(" @-@ ", "-") 126 | .replace(" @,@ ", ",") 127 | .replace("\\", " \\ ") 128 | ) 129 | return re1.sub(" ", html.unescape(x)) 130 | 131 | 132 | class TextDataset(Dataset): 133 | def __init__(self, tokenizer, file_path, cache_path, logger, block_size=512): 134 | assert os.path.isfile(file_path) 135 | 136 | if os.path.exists(cache_path): 137 | logger.info("Loading features from cached file %s", cache_path) 138 | with open(cache_path, "rb") as handle: 139 | self.examples = pickle.load(handle) 140 | else: 141 | logger.info("Creating features from dataset file %s", file_path) 142 | 143 | self.examples = [] 144 | text = (line.strip() for line in open(file_path, encoding="utf-8")) 145 | text = progress_bar(list(text)) 146 | text = map( 147 | lambda x: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)), text 148 | ) 149 | text = itertools.chain.from_iterable(text) 150 | text = more_itertools.chunked(text, block_size) 151 | self.examples = list(text)[:-1] 152 | # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) 153 | # If your dataset is small, first you should loook for a bigger one :-) and second you 154 | # can change this behavior by adding (model specific) padding. 155 | 156 | logger.info("Saving features into cached file %s", cache_path) 157 | with open(cache_path, "wb") as handle: 158 | pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) 159 | 160 | def __len__(self): 161 | return len(self.examples) 162 | 163 | def __getitem__(self, item): 164 | return torch.tensor(self.examples[item]) 165 | 166 | 167 | # DataBunch object for language models 168 | class BertLMDataBunch(object): 169 | @staticmethod 170 | def from_raw_corpus( 171 | data_dir, 172 | text_list, 173 | tokenizer, 174 | batch_size_per_gpu=32, 175 | max_seq_length=512, 176 | multi_gpu=True, 177 | test_size=0.1, 178 | model_type="bert", 179 | logger=None, 180 | clear_cache=False, 181 | no_cache=False, 182 | ): 183 | 184 | train_file = "lm_train.txt" 185 | val_file = "lm_val.txt" 186 | 187 | train_list, val_list = train_test_split( 188 | text_list, test_size=test_size, shuffle=True 189 | ) 190 | # Create train corpus 191 | create_corpus(train_list, str(data_dir / train_file), logger=logger) 192 | 193 | # Create val corpus 194 | create_corpus(val_list, str(data_dir / val_file), logger=logger) 195 | 196 | return BertLMDataBunch( 197 | data_dir, 198 | tokenizer, 199 | train_file=train_file, 200 | val_file=val_file, 201 | batch_size_per_gpu=batch_size_per_gpu, 202 | max_seq_length=max_seq_length, 203 | multi_gpu=multi_gpu, 204 | model_type=model_type, 205 | logger=logger, 206 | clear_cache=clear_cache, 207 | no_cache=no_cache, 208 | ) 209 | 210 | def __init__( 211 | self, 212 | data_dir, 213 | tokenizer, 214 | train_file="lm_train.txt", 215 | val_file="lm_val.txt", 216 | batch_size_per_gpu=32, 217 | max_seq_length=512, 218 | multi_gpu=True, 219 | model_type="bert", 220 | logger=None, 221 | clear_cache=False, 222 | no_cache=False, 223 | ): 224 | 225 | # just in case someone passes string instead of Path 226 | if isinstance(data_dir, str): 227 | data_dir = Path(data_dir) 228 | 229 | # Instantiate correct tokenizer if the tokenizer name is passed instead of object 230 | if isinstance(tokenizer, str): 231 | _, _, tokenizer_class = MODEL_CLASSES[model_type] 232 | # instantiate the new tokeniser object using the tokeniser name 233 | tokenizer = tokenizer_class.from_pretrained( 234 | tokenizer, 
do_lower_case=("uncased" in tokenizer) 235 | ) 236 | 237 | # Bug workaround for RoBERTa 238 | if model_type == "roberta": 239 | tokenizer.max_len_single_sentence = tokenizer.max_len - 2 240 | 241 | self.tokenizer = tokenizer 242 | self.max_seq_length = max_seq_length 243 | self.batch_size_per_gpu = batch_size_per_gpu 244 | self.train_dl = None 245 | self.val_dl = None 246 | self.data_dir = data_dir 247 | self.cache_dir = data_dir / "lm_cache" 248 | self.no_cache = no_cache 249 | self.model_type = model_type 250 | if logger is None: 251 | logger = logging.getLogger() 252 | self.logger = logger 253 | self.n_gpu = 1 254 | if multi_gpu: 255 | self.n_gpu = torch.cuda.device_count() 256 | 257 | if clear_cache: 258 | shutil.rmtree(self.cache_dir, ignore_errors=True) 259 | 260 | # Create folder if it doesn't exist 261 | self.cache_dir.mkdir(exist_ok=True) 262 | 263 | if train_file: 264 | # Train DataLoader 265 | # train_examples = None 266 | cached_features_file = os.path.join( 267 | self.cache_dir, 268 | "cached_{}_{}_{}".format( 269 | self.model_type, "train", str(self.max_seq_length) 270 | ), 271 | ) 272 | 273 | train_filepath = str(self.data_dir / train_file) 274 | train_dataset = TextDataset( 275 | self.tokenizer, 276 | train_filepath, 277 | cached_features_file, 278 | self.logger, 279 | block_size=self.tokenizer.max_len_single_sentence, 280 | ) 281 | 282 | self.train_batch_size = self.batch_size_per_gpu * max(1, self.n_gpu) 283 | 284 | train_sampler = RandomSampler(train_dataset) 285 | self.train_dl = DataLoader( 286 | train_dataset, sampler=train_sampler, batch_size=self.train_batch_size 287 | ) 288 | 289 | if val_file: 290 | # Val DataLoader 291 | # val_examples = None 292 | cached_features_file = os.path.join( 293 | self.cache_dir, 294 | "cached_{}_{}_{}".format( 295 | self.model_type, "dev", str(self.max_seq_length) 296 | ), 297 | ) 298 | 299 | val_filepath = str(self.data_dir / val_file) 300 | val_dataset = TextDataset( 301 | self.tokenizer, 302 | val_filepath, 303 | cached_features_file, 304 | self.logger, 305 | block_size=self.tokenizer.max_len_single_sentence, 306 | ) 307 | 308 | self.val_batch_size = self.batch_size_per_gpu * 2 * max(1, self.n_gpu) 309 | 310 | val_sampler = RandomSampler(val_dataset) 311 | self.val_dl = DataLoader( 312 | val_dataset, sampler=val_sampler, batch_size=self.val_batch_size 313 | ) 314 | 315 | # Mask tokens 316 | 317 | def mask_tokens(self, inputs, mlm_probability=0.15): 318 | """Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.""" 319 | labels = inputs.clone() 320 | # We sample a few tokens in each sequence for masked-LM training (with probability mlm_probability defaults to 0.15 in Bert/RoBERTa) 321 | 322 | masked_indices = torch.bernoulli( 323 | torch.full(labels.shape, mlm_probability) 324 | ).bool() 325 | # do not mask special tokens 326 | masked_indices[:, 0] = False 327 | masked_indices[:, -1] = False 328 | 329 | labels[~masked_indices] = -100 # We only compute loss on masked tokens 330 | 331 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 332 | indices_replaced = ( 333 | torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices 334 | ) 335 | inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids( 336 | self.tokenizer.mask_token 337 | ) 338 | 339 | # 10% of the time, we replace masked input tokens with random word 340 | indices_random = ( 341 | torch.bernoulli(torch.full(labels.shape, 0.5)).bool() 342 | & masked_indices 343 | & 
~indices_replaced 344 | ) 345 | random_words = torch.randint( 346 | len(self.tokenizer), labels.shape, dtype=torch.long 347 | ) 348 | inputs[indices_random] = random_words[indices_random] 349 | 350 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 351 | return inputs, labels 352 | 353 | def save(self, filename="databunch.pkl"): 354 | tmp_path = self.data_dir / "tmp" 355 | tmp_path.mkdir(exist_ok=True) 356 | with open(str(tmp_path / filename), "wb") as f: 357 | pickle.dump(self, f) 358 | -------------------------------------------------------------------------------- /fast_bert/learner_abs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .data_abs import BertAbsDataBunch 3 | from .learner_util import Learner 4 | from torch import nn 5 | from typing import List 6 | import torch 7 | from box import Box 8 | from tokenizers import BertWordPieceTokenizer 9 | 10 | from .summarisation import BertAbs, build_predictor 11 | from .summarisation import BertAbsConfig 12 | from fastprogress.fastprogress import master_bar, progress_bar 13 | import numpy as np 14 | import pandas as pd 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | MODEL_CLASSES = {"bert": (BertAbsConfig, BertAbs)} 21 | 22 | 23 | class BertAbsLearner(Learner): 24 | @staticmethod 25 | def from_pretrained_model( 26 | databunch, 27 | pretrained_path, 28 | device, 29 | logger, 30 | metrics=None, 31 | finetuned_wgts_path=None, 32 | multi_gpu=True, 33 | is_fp16=True, 34 | loss_scale=0, 35 | warmup_steps=0, 36 | fp16_opt_level="O1", 37 | grad_accumulation_steps=1, 38 | max_grad_norm=1.0, 39 | adam_epsilon=1e-8, 40 | logging_steps=100, 41 | alpha=0.95, 42 | beam_size=5, 43 | min_length=50, 44 | max_length=200, 45 | block_trigram=True, 46 | ): 47 | 48 | model_state_dict = None 49 | 50 | model_type = databunch.model_type 51 | 52 | config_class, model_class = MODEL_CLASSES[model_type] 53 | 54 | if torch.cuda.is_available(): 55 | map_location = lambda storage, loc: storage.cuda() 56 | else: 57 | map_location = 'cpu' 58 | 59 | if finetuned_wgts_path: 60 | model_state_dict = torch.load(finetuned_wgts_path, map_location=map_location) 61 | else: 62 | model_state_dict = None 63 | 64 | model = model_class.from_pretrained( 65 | str(pretrained_path), state_dict=model_state_dict 66 | ) 67 | 68 | model.to(device) 69 | 70 | return BertAbsLearner( 71 | databunch, 72 | model, 73 | str(pretrained_path), 74 | device, 75 | logger, 76 | metrics, 77 | multi_gpu, 78 | is_fp16, 79 | loss_scale, 80 | warmup_steps, 81 | fp16_opt_level, 82 | grad_accumulation_steps, 83 | max_grad_norm, 84 | adam_epsilon, 85 | logging_steps, 86 | alpha, 87 | beam_size, 88 | min_length, 89 | max_length, 90 | block_trigram, 91 | ) 92 | 93 | def __init__( 94 | self, 95 | data: BertAbsDataBunch, 96 | model: nn.Module, 97 | pretrained_model_path, 98 | device, 99 | logger, 100 | metrics=None, 101 | multi_gpu=True, 102 | is_fp16=True, 103 | loss_scale=0, 104 | warmup_steps=0, 105 | fp16_opt_level="O1", 106 | grad_accumulation_steps=1, 107 | max_grad_norm=1.0, 108 | adam_epsilon=1e-8, 109 | logging_steps=100, 110 | alpha=0.95, 111 | beam_size=5, 112 | min_length=50, 113 | max_length=200, 114 | block_trigram=True, 115 | ): 116 | 117 | super(BertAbsLearner, self).__init__( 118 | data, 119 | model, 120 | pretrained_model_path, 121 | None, 122 | device, 123 | logger, 124 | multi_gpu, 125 | is_fp16, 126 | warmup_steps, 127 | fp16_opt_level, 128 | grad_accumulation_steps, 129 | max_grad_norm, 130 | 
adam_epsilon, 131 | logging_steps, 132 | ) 133 | 134 | # Classification specific attributes 135 | self.metrics = metrics 136 | 137 | # Summarisation specific features 138 | if type(self.data.tokenizer) == BertWordPieceTokenizer: 139 | symbols = { 140 | "BOS": self.data.tokenizer.token_to_id("[unused0]"), 141 | "EOS": self.data.tokenizer.token_to_id("[unused1]"), 142 | "PAD": self.data.tokenizer.token_to_id("[PAD]"), 143 | } 144 | else: 145 | symbols = { 146 | "BOS": self.data.tokenizer.vocab["[unused0]"], 147 | "EOS": self.data.tokenizer.vocab["[unused1]"], 148 | "PAD": self.data.tokenizer.vocab["[PAD]"], 149 | } 150 | 151 | self.predictor_args = Box( 152 | { 153 | "alpha": alpha, 154 | "beam_size": beam_size, 155 | "min_length": min_length, 156 | "max_length": max_length, 157 | "block_trigram": block_trigram, 158 | } 159 | ) 160 | 161 | # predictor object 162 | self.predictor = build_predictor( 163 | self.predictor_args, self.data.tokenizer, symbols, self.model 164 | ) 165 | 166 | ### Train the model ### 167 | def fit( 168 | self, 169 | epochs, 170 | lr, 171 | validate=True, 172 | schedule_type="warmup_cosine", 173 | optimizer_type="lamb", 174 | ): 175 | self.logger.info( 176 | "Irony...fit is not implmented yet. This is a pretrained-only inference model" 177 | ) 178 | 179 | ### Evaluate the model 180 | def validate(self): 181 | self.logger.info( 182 | "Irony...fit is not implmented yet. This is a pretrained-only inference model" 183 | ) 184 | 185 | ### Return Predictions ### 186 | def predict_batch(self, texts=None): 187 | 188 | if texts: 189 | dl = self.data.get_dl_from_texts(texts) 190 | else: 191 | dl = self.data.test_dl 192 | 193 | all_summaries = [] 194 | 195 | self.model.eval() 196 | for step, batch in enumerate(dl): 197 | # batch = tuple(t.to(self.device) for t in batch) 198 | 199 | batch_data = self.predictor.translate_batch(batch) 200 | translations = self.predictor.from_batch(batch_data) 201 | 202 | summaries = [format_summary(t) for t in translations] 203 | all_summaries.extend(summaries) 204 | 205 | return all_summaries 206 | 207 | 208 | def format_summary(translation): 209 | """ Transforms the output of the `from_batch` function 210 | into nicely formatted summaries. 211 | """ 212 | raw_summary, _, _ = translation 213 | summary = ( 214 | raw_summary.replace("[unused0]", "") 215 | .replace("[unused3]", "") 216 | .replace("[PAD]", "") 217 | .replace("[unused1]", "") 218 | .replace(r" +", " ") 219 | .replace(" [unused2] ", ". 
") 220 | .replace("[unused2]", "") 221 | .strip() 222 | ) 223 | 224 | return summary 225 | -------------------------------------------------------------------------------- /fast_bert/learner_lm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from packaging import version 4 | from pathlib import Path 5 | import numpy as np 6 | 7 | from fastprogress.fastprogress import master_bar, progress_bar 8 | from tensorboardX import SummaryWriter 9 | 10 | from .learner_util import Learner 11 | 12 | from .data_lm import BertLMDataBunch 13 | 14 | from transformers import ( 15 | WEIGHTS_NAME, 16 | BertConfig, 17 | BertForMaskedLM, 18 | RobertaConfig, 19 | RobertaForMaskedLM, 20 | DistilBertConfig, 21 | DistilBertForMaskedLM, 22 | CamembertConfig, 23 | CamembertForMaskedLM, 24 | ElectraConfig, 25 | ElectraForMaskedLM, 26 | ) 27 | 28 | from torch.optim.lr_scheduler import _LRScheduler, Optimizer 29 | 30 | MODEL_CLASSES = { 31 | "bert": (BertConfig, BertForMaskedLM), 32 | "roberta": (RobertaConfig, RobertaForMaskedLM), 33 | "distilbert": (DistilBertConfig, DistilBertForMaskedLM), 34 | "camembert-base": (CamembertConfig, CamembertForMaskedLM), 35 | "electra": (ElectraConfig, ElectraForMaskedLM), 36 | } 37 | 38 | if version.parse(torch.__version__) >= version.parse("1.6"): 39 | IS_AMP_AVAILABLE = True 40 | from torch.cuda.amp import autocast 41 | else: 42 | IS_AMP_AVAILABLE = False 43 | 44 | 45 | class BertLMLearner(Learner): 46 | @staticmethod 47 | def from_pretrained_model( 48 | dataBunch, 49 | pretrained_path, 50 | output_dir, 51 | metrics, 52 | device, 53 | logger, 54 | multi_gpu=True, 55 | is_fp16=True, 56 | warmup_steps=0, 57 | fp16_opt_level="O1", 58 | grad_accumulation_steps=1, 59 | max_grad_norm=1.0, 60 | adam_epsilon=1e-8, 61 | logging_steps=100, 62 | ): 63 | 64 | if is_fp16 and (IS_AMP_AVAILABLE is False): 65 | logger.debug("Apex not installed. 
switching off FP16 training") 66 | is_fp16 = False 67 | 68 | model_type = dataBunch.model_type 69 | 70 | config_class, model_class = MODEL_CLASSES[model_type] 71 | 72 | config = config_class.from_pretrained(pretrained_path) 73 | model = model_class.from_pretrained(pretrained_path, config=config) 74 | model.to(device) 75 | 76 | return BertLMLearner( 77 | dataBunch, 78 | model, 79 | pretrained_path, 80 | output_dir, 81 | metrics, 82 | device, 83 | logger, 84 | multi_gpu, 85 | is_fp16, 86 | warmup_steps, 87 | fp16_opt_level, 88 | grad_accumulation_steps, 89 | max_grad_norm, 90 | adam_epsilon, 91 | logging_steps, 92 | ) 93 | 94 | # Learner initialiser 95 | def __init__( 96 | self, 97 | data: BertLMDataBunch, 98 | model: torch.nn.Module, 99 | pretrained_model_path, 100 | output_dir, 101 | metrics, 102 | device, 103 | logger, 104 | multi_gpu=True, 105 | is_fp16=True, 106 | warmup_steps=0, 107 | fp16_opt_level="O1", 108 | grad_accumulation_steps=1, 109 | max_grad_norm=1.0, 110 | adam_epsilon=1e-8, 111 | logging_steps=100, 112 | ): 113 | 114 | if isinstance(output_dir, str): 115 | output_dir = Path(output_dir) 116 | 117 | self.data = data 118 | self.model = model 119 | self.pretrained_model_path = pretrained_model_path 120 | self.metrics = metrics 121 | self.multi_gpu = multi_gpu 122 | self.is_fp16 = is_fp16 123 | self.fp16_opt_level = fp16_opt_level 124 | self.adam_epsilon = adam_epsilon 125 | self.warmup_steps = warmup_steps 126 | self.grad_accumulation_steps = grad_accumulation_steps 127 | self.device = device 128 | self.logger = logger 129 | self.optimizer = None 130 | self.n_gpu = 0 131 | self.max_grad_norm = max_grad_norm 132 | self.logging_steps = logging_steps 133 | self.max_steps = -1 134 | self.weight_decay = 0.0 135 | self.model_type = data.model_type 136 | 137 | self.output_dir = output_dir 138 | 139 | self.scaler = torch.cuda.amp.GradScaler() if is_fp16 is True else None 140 | 141 | if self.multi_gpu: 142 | self.n_gpu = torch.cuda.device_count() 143 | 144 | ### Train the model ### 145 | def fit( 146 | self, 147 | epochs, 148 | lr, 149 | validate=True, 150 | schedule_type="warmup_cosine", 151 | optimizer_type="lamb", 152 | ): 153 | 154 | tensorboard_dir = self.output_dir / "tensorboard" 155 | tensorboard_dir.mkdir(exist_ok=True) 156 | 157 | # Train the model 158 | tb_writer = SummaryWriter(tensorboard_dir) 159 | 160 | train_dataloader = self.data.train_dl 161 | if self.max_steps > 0: 162 | t_total = self.max_steps 163 | self.epochs = ( 164 | self.max_steps // len(train_dataloader) // self.grad_accumulation_steps 165 | + 1 166 | ) 167 | else: 168 | t_total = len(train_dataloader) // self.grad_accumulation_steps * epochs 169 | 170 | # Prepare optimiser and schedule 171 | optimizer = self.get_optimizer(lr, optimizer_type=optimizer_type) 172 | 173 | # get the base model if its already wrapped around DataParallel 174 | if hasattr(self.model, "module"): 175 | self.model = self.model.module 176 | 177 | # Get scheduler 178 | scheduler = self.get_scheduler( 179 | optimizer, t_total=t_total, schedule_type=schedule_type 180 | ) 181 | 182 | # Parallelize the model architecture 183 | if self.multi_gpu is True: 184 | self.model = torch.nn.DataParallel(self.model) 185 | 186 | # Start Training 187 | self.logger.info("***** Running training *****") 188 | self.logger.info(" Num examples = %d", len(train_dataloader.dataset)) 189 | self.logger.info(" Num Epochs = %d", epochs) 190 | self.logger.info( 191 | " Total train batch size (w. 
parallel, distributed & accumulation) = %d", 192 | self.data.train_batch_size * self.grad_accumulation_steps, 193 | ) 194 | self.logger.info( 195 | " Gradient Accumulation steps = %d", self.grad_accumulation_steps 196 | ) 197 | self.logger.info(" Total optimization steps = %d", t_total) 198 | 199 | global_step = 0 200 | epoch_step = 0 201 | tr_loss, logging_loss, epoch_loss = 0.0, 0.0, 0.0 202 | self.model.zero_grad() 203 | pbar = master_bar(range(epochs)) 204 | 205 | for epoch in pbar: 206 | epoch_step = 0 207 | epoch_loss = 0.0 208 | for step, batch in enumerate(progress_bar(train_dataloader, parent=pbar)): 209 | inputs, labels = self.data.mask_tokens(batch) 210 | cpu_device = torch.device("cpu") 211 | loss = self.training_step(batch) 212 | 213 | tr_loss += loss.item() 214 | epoch_loss += loss.item() 215 | 216 | batch.to(cpu_device) 217 | inputs.to(cpu_device) 218 | labels.to(cpu_device) 219 | torch.cuda.empty_cache() 220 | 221 | if (step + 1) % self.grad_accumulation_steps == 0: 222 | # gradient clipping 223 | torch.nn.utils.clip_grad_norm_( 224 | self.model.parameters(), self.max_grad_norm 225 | ) 226 | 227 | if self.is_fp16: 228 | # AMP: gradients need unscaling 229 | self.scaler.unscale_(optimizer) 230 | 231 | if self.is_fp16: 232 | self.scaler.step(optimizer) 233 | self.scaler.update() 234 | else: 235 | optimizer.step() 236 | scheduler.step() 237 | 238 | self.model.zero_grad() 239 | global_step += 1 240 | epoch_step += 1 241 | 242 | if self.logging_steps > 0 and global_step % self.logging_steps == 0: 243 | if validate: 244 | # evaluate model 245 | results = self.validate() 246 | for key, value in results.items(): 247 | tb_writer.add_scalar( 248 | "eval_{}".format(key), value, global_step 249 | ) 250 | self.logger.info( 251 | "eval_{} after step {}: {}: ".format( 252 | key, global_step, value 253 | ) 254 | ) 255 | 256 | # Log metrics 257 | self.logger.info( 258 | "lr after step {}: {}".format( 259 | global_step, scheduler.get_lr()[0] 260 | ) 261 | ) 262 | self.logger.info( 263 | "train_loss after step {}: {}".format( 264 | global_step, 265 | (tr_loss - logging_loss) / self.logging_steps, 266 | ) 267 | ) 268 | tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) 269 | tb_writer.add_scalar( 270 | "loss", 271 | (tr_loss - logging_loss) / self.logging_steps, 272 | global_step, 273 | ) 274 | 275 | logging_loss = tr_loss 276 | 277 | # Evaluate the model after every epoch 278 | if validate: 279 | results = self.validate() 280 | for key, value in results.items(): 281 | self.logger.info( 282 | "eval_{} after epoch {}: {}: ".format(key, (epoch + 1), value) 283 | ) 284 | 285 | # Log metrics 286 | self.logger.info( 287 | "lr after epoch {}: {}".format((epoch + 1), scheduler.get_lr()[0]) 288 | ) 289 | self.logger.info( 290 | "train_loss after epoch {}: {}".format( 291 | (epoch + 1), epoch_loss / epoch_step 292 | ) 293 | ) 294 | self.logger.info("\n") 295 | 296 | tb_writer.close() 297 | return global_step, tr_loss / global_step 298 | 299 | ### Training step 300 | def training_step(self, batch): 301 | inputs, labels = self.data.mask_tokens(batch) 302 | 303 | inputs = inputs.to(self.device) 304 | labels = labels.to(self.device) 305 | 306 | self.model.train() 307 | 308 | if self.is_fp16: 309 | with autocast(): 310 | outputs = self.model(inputs, masked_lm_labels=labels) 311 | else: 312 | outputs = self.model(inputs, masked_lm_labels=labels) 313 | 314 | loss = outputs[0] 315 | 316 | if self.n_gpu > 1: 317 | loss = loss.mean() 318 | if self.grad_accumulation_steps > 1: 319 | loss = loss / 
self.grad_accumulation_steps 320 | 321 | if self.is_fp16: 322 | self.scaler.scale(loss).backward() 323 | else: 324 | loss.backward() 325 | 326 | return loss 327 | 328 | ### Evaluate the model 329 | def validate(self): 330 | self.logger.info("Running evaluation") 331 | 332 | self.logger.info("Num examples = %d", len(self.data.val_dl.dataset)) 333 | self.logger.info("Validation Batch size = %d", self.data.val_batch_size) 334 | 335 | eval_loss = 0 336 | nb_eval_steps = 0 337 | 338 | validation_scores = {metric["name"]: 0.0 for metric in self.metrics} 339 | 340 | for step, batch in enumerate(progress_bar(self.data.val_dl)): 341 | self.model.eval() 342 | batch = batch.to(self.device) 343 | 344 | with torch.no_grad(): 345 | outputs = self.model(batch, masked_lm_labels=batch) 346 | tmp_eval_loss = outputs[0] 347 | eval_loss += tmp_eval_loss.mean().item() 348 | 349 | cpu_device = torch.device("cpu") 350 | batch.to(cpu_device) 351 | torch.cuda.empty_cache() 352 | 353 | nb_eval_steps += 1 354 | 355 | eval_loss = eval_loss / nb_eval_steps 356 | perplexity = torch.exp(torch.tensor(eval_loss)) 357 | 358 | results = {"loss": eval_loss, "perplexity": float(perplexity)} 359 | 360 | results.update(validation_scores) 361 | 362 | return results 363 | 364 | def save_model(self, path=None): 365 | 366 | if not path: 367 | path = self.output_dir / "model_out" 368 | 369 | path.mkdir(exist_ok=True) 370 | 371 | torch.cuda.empty_cache() 372 | # Save a trained model 373 | model_to_save = ( 374 | self.model.module if hasattr(self.model, "module") else self.model 375 | ) # Only save the model it-self 376 | model_to_save.save_pretrained(path) 377 | 378 | # save the tokenizer 379 | self.data.tokenizer.save_pretrained(path) 380 | -------------------------------------------------------------------------------- /fast_bert/learner_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | import logging 4 | 5 | from transformers import ( 6 | AdamW, 7 | get_constant_schedule, 8 | get_constant_schedule_with_warmup, 9 | get_linear_schedule_with_warmup, 10 | get_cosine_schedule_with_warmup, 11 | get_cosine_with_hard_restarts_schedule_with_warmup, 12 | ) 13 | 14 | from pytorch_lamb import Lamb 15 | 16 | 17 | class Learner(object): 18 | def __init__( 19 | self, 20 | data, 21 | model, 22 | pretrained_model_path, 23 | output_dir, 24 | device, 25 | logger=logging.getLogger(__name__), 26 | multi_gpu=True, 27 | is_fp16=True, 28 | warmup_steps=0, 29 | fp16_opt_level="O1", 30 | grad_accumulation_steps=1, 31 | max_grad_norm=1.0, 32 | adam_epsilon=1e-8, 33 | logging_steps=100, 34 | ): 35 | 36 | if isinstance(output_dir, str): 37 | output_dir = Path(output_dir) 38 | 39 | self.data = data 40 | self.model = model 41 | self.pretrained_model_path = pretrained_model_path 42 | self.multi_gpu = multi_gpu 43 | self.is_fp16 = is_fp16 44 | self.fp16_opt_level = fp16_opt_level 45 | self.adam_epsilon = adam_epsilon 46 | self.warmup_steps = warmup_steps 47 | self.grad_accumulation_steps = grad_accumulation_steps 48 | self.device = device 49 | self.logger = logger 50 | self.layer_groups = None 51 | self.optimizer = None 52 | self.n_gpu = 0 53 | self.max_grad_norm = max_grad_norm 54 | self.logging_steps = logging_steps 55 | self.max_steps = -1 56 | self.weight_decay = 0.0 57 | self.model_type = data.model_type 58 | 59 | self.output_dir = output_dir 60 | 61 | if self.multi_gpu: 62 | self.n_gpu = torch.cuda.device_count() 63 | 64 | # Get the optimiser object 65 | def 
get_optimizer(self, lr, optimizer_type="lamb"): 66 | 67 | # Prepare optimiser and schedule 68 | no_decay = ["bias", "LayerNorm.weight"] 69 | 70 | optimizer_grouped_parameters = [ 71 | { 72 | "params": [ 73 | p 74 | for n, p in self.model.named_parameters() 75 | if not any(nd in n for nd in no_decay) 76 | ], 77 | "weight_decay": self.weight_decay, 78 | }, 79 | { 80 | "params": [ 81 | p 82 | for n, p in self.model.named_parameters() 83 | if any(nd in n for nd in no_decay) 84 | ], 85 | "weight_decay": 0.0, 86 | }, 87 | ] 88 | 89 | if optimizer_type == "lamb": 90 | optimizer = Lamb( 91 | optimizer_grouped_parameters, weight_decay=0.1, lr=lr, eps=1e-12 92 | ) 93 | elif optimizer_type == "adamw": 94 | optimizer = AdamW( 95 | optimizer_grouped_parameters, lr=lr, eps=self.adam_epsilon 96 | ) 97 | 98 | return optimizer 99 | 100 | # Get learning rate scheduler 101 | def get_scheduler(self, optimizer, t_total, schedule_type="warmup_cosine"): 102 | 103 | SCHEDULES = { 104 | None: get_constant_schedule, 105 | "none": get_constant_schedule, 106 | "warmup_cosine": get_cosine_schedule_with_warmup, 107 | "warmup_constant": get_constant_schedule_with_warmup, 108 | "warmup_linear": get_linear_schedule_with_warmup, 109 | "warmup_cosine_hard_restarts": get_cosine_with_hard_restarts_schedule_with_warmup, 110 | } 111 | 112 | if schedule_type is None or schedule_type == "none": 113 | return SCHEDULES[schedule_type](optimizer) 114 | 115 | elif schedule_type == "warmup_constant": 116 | return SCHEDULES[schedule_type]( 117 | optimizer, num_warmup_steps=self.warmup_steps 118 | ) 119 | 120 | else: 121 | return SCHEDULES[schedule_type]( 122 | optimizer, 123 | num_warmup_steps=self.warmup_steps, 124 | num_training_steps=t_total, 125 | ) 126 | 127 | def save_model(self, path=None): 128 | 129 | if not path: 130 | path = self.output_dir / "model_out" 131 | 132 | path.mkdir(exist_ok=True) 133 | 134 | # Convert path to str for save_pretrained calls 135 | path = str(path) 136 | 137 | torch.cuda.empty_cache() 138 | # Save a trained model 139 | model_to_save = ( 140 | self.model.module if hasattr(self.model, "module") else self.model 141 | ) # Only save the model it-self 142 | model_to_save.save_pretrained(path) 143 | 144 | # save the tokenizer 145 | self.data.tokenizer.save_pretrained(path) 146 | -------------------------------------------------------------------------------- /fast_bert/metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import ( 2 | roc_curve, 3 | auc, 4 | hamming_loss, 5 | accuracy_score, 6 | confusion_matrix as sklearn_confusion_matrix, 7 | ) 8 | import numpy as np 9 | from torch import Tensor 10 | 11 | import pdb 12 | import logging 13 | 14 | logger = logging.getLogger() 15 | 16 | CLASSIFICATION_THRESHOLD: float = 0.5 # Best keep it in [0.0, 1.0] range 17 | 18 | # def accuracy(out, labels): 19 | # outputs = np.argmax(out, axis=1) 20 | # return np.sum(outputs == labels) 21 | 22 | 23 | def accuracy(y_pred: Tensor, y_true: Tensor, **kwargs): 24 | y_pred = y_pred.cpu() 25 | outputs = np.argmax(y_pred, axis=1) 26 | return np.mean(outputs.numpy() == y_true.detach().cpu().numpy()) 27 | 28 | 29 | def accuracy_multilabel(y_pred: Tensor, y_true: Tensor, sigmoid: bool = True, **kwargs): 30 | if sigmoid: 31 | y_pred = y_pred.sigmoid() 32 | y_pred = y_pred.cpu() 33 | y_true = y_true.cpu() 34 | outputs = np.argmax(y_pred, axis=1) 35 | real_vals = np.argmax(y_true, axis=1) 36 | return np.mean(outputs.numpy() == real_vals.numpy()) 37 | 38 | 39 | def 
accuracy_thresh( 40 | y_pred: Tensor, 41 | y_true: Tensor, 42 | thresh: float = CLASSIFICATION_THRESHOLD, 43 | sigmoid: bool = True, 44 | **kwargs 45 | ): 46 | "Compute accuracy when `y_pred` and `y_true` are the same size." 47 | if sigmoid: 48 | y_pred = y_pred.sigmoid() 49 | return ((y_pred > thresh) == y_true.bool()).float().mean().item() 50 | 51 | 52 | # return np.mean(((y_pred>thresh)==y_true.byte()).float().cpu().numpy(), axis=1).sum() 53 | 54 | 55 | def fbeta( 56 | y_pred: Tensor, 57 | y_true: Tensor, 58 | thresh: float = 0.3, 59 | beta: float = 2, 60 | eps: float = 1e-9, 61 | sigmoid: bool = True, 62 | **kwargs 63 | ): 64 | "Computes the f_beta between `preds` and `targets`" 65 | beta2 = beta ** 2 66 | if sigmoid: 67 | y_pred = y_pred.sigmoid() 68 | y_pred = (y_pred > thresh).float() 69 | y_true = y_true.float() 70 | TP = (y_pred * y_true).sum(dim=1) 71 | prec = TP / (y_pred.sum(dim=1) + eps) 72 | rec = TP / (y_true.sum(dim=1) + eps) 73 | res = (prec * rec) / (prec * beta2 + rec + eps) * (1 + beta2) 74 | return res.mean().item() 75 | 76 | 77 | def roc_auc(y_pred: Tensor, y_true: Tensor, **kwargs): 78 | # ROC-AUC calcualation 79 | # Compute ROC curve and ROC area for each class 80 | fpr = dict() 81 | tpr = dict() 82 | roc_auc = dict() 83 | 84 | y_true = y_true.detach().cpu().numpy() 85 | y_pred = y_pred.detach().cpu().numpy() 86 | 87 | # Compute micro-average ROC curve and ROC area 88 | fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred.ravel()) 89 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 90 | 91 | return roc_auc["micro"] 92 | 93 | 94 | def Hamming_loss( 95 | y_pred: Tensor, 96 | y_true: Tensor, 97 | sigmoid: bool = True, 98 | thresh: float = CLASSIFICATION_THRESHOLD, 99 | sample_weight=None, 100 | **kwargs 101 | ): 102 | if sigmoid: 103 | y_pred = y_pred.sigmoid() 104 | y_pred = (y_pred > thresh).float() 105 | return hamming_loss(y_true, y_pred, sample_weight=sample_weight) 106 | 107 | 108 | def Exact_Match_Ratio( 109 | y_pred: Tensor, 110 | y_true: Tensor, 111 | sigmoid: bool = True, 112 | thresh: float = CLASSIFICATION_THRESHOLD, 113 | normalize: bool = True, 114 | sample_weight=None, 115 | **kwargs 116 | ): 117 | if sigmoid: 118 | y_pred = y_pred.sigmoid() 119 | y_pred = (y_pred > thresh).float() 120 | return accuracy_score( 121 | y_true, y_pred, normalize=normalize, sample_weight=sample_weight 122 | ) 123 | 124 | 125 | def F1( 126 | y_pred: Tensor, 127 | y_true: Tensor, 128 | threshold: float = CLASSIFICATION_THRESHOLD, 129 | **kwargs 130 | ): 131 | return fbeta(y_pred, y_true, thresh=threshold, beta=1) 132 | 133 | 134 | def confusion_matrix(y_pred: Tensor, y_true: Tensor, **kwargs): 135 | try: 136 | y_pred = np.argmax(y_pred.detach().cpu().numpy(), axis=1) 137 | return sklearn_confusion_matrix( 138 | y_true.detach().cpu().numpy(), y_pred, labels=kwargs.get("labels"), 139 | ) 140 | except Exception as e: 141 | logger.error(e) 142 | 143 | -------------------------------------------------------------------------------- /fast_bert/onnx_helper.py: -------------------------------------------------------------------------------- 1 | from onnxruntime import ( 2 | GraphOptimizationLevel, 3 | InferenceSession, 4 | SessionOptions, 5 | get_all_providers, 6 | ) 7 | 8 | import logging 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | logger = logging.getLogger() 13 | 14 | 15 | def create_model_for_provider(model_path: str, provider: str) -> InferenceSession: 16 | 17 | assert ( 18 | provider in get_all_providers() 19 | ), f"provider {provider} not 
found, {get_all_providers()}" 20 | 21 | # Few properties that might have an impact on performances (provided by MS) 22 | options = SessionOptions() 23 | options.intra_op_num_threads = 1 24 | options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL 25 | 26 | # Load the model as a graph and prepare the CPU backend 27 | session = InferenceSession(model_path, options, providers=[provider]) 28 | session.disable_fallback() 29 | 30 | return session 31 | 32 | 33 | def load_model(model_path: Path): 34 | try: 35 | quantized_model = create_model_for_provider( 36 | model_path.as_posix(), "CPUExecutionProvider" 37 | ) 38 | return quantized_model 39 | except Exception as e: 40 | logger.error(e) 41 | raise e 42 | -------------------------------------------------------------------------------- /fast_bert/prediction.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | 4 | from .onnx_helper import load_model 5 | 6 | 7 | from transformers import AutoTokenizer 8 | import numpy as np 9 | 10 | import warnings 11 | 12 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 13 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 14 | 15 | 16 | class BertClassificationPredictor(object): 17 | def __init__( 18 | self, 19 | model_path, 20 | label_path, 21 | multi_label=False, 22 | model_type="bert", 23 | use_fast_tokenizer=True, 24 | do_lower_case=True, 25 | device=None, 26 | ): 27 | if device is None: 28 | device = ( 29 | torch.device("cuda") 30 | if torch.cuda.is_available() 31 | else torch.device("cpu") 32 | ) 33 | 34 | self.model_path = model_path 35 | self.label_path = label_path 36 | self.multi_label = multi_label 37 | self.model_type = model_type 38 | self.do_lower_case = do_lower_case 39 | self.device = device 40 | 41 | # Use auto-tokenizer 42 | self.tokenizer = AutoTokenizer.from_pretrained( 43 | self.model_path, use_fast=use_fast_tokenizer 44 | ) 45 | 46 | self.learner = self.get_learner() 47 | 48 | def get_learner(self): 49 | from .learner_cls import BertLearner 50 | from .data_cls import BertDataBunch 51 | 52 | databunch = BertDataBunch( 53 | self.label_path, 54 | self.label_path, 55 | self.tokenizer, 56 | train_file=None, 57 | val_file=None, 58 | batch_size_per_gpu=32, 59 | max_seq_length=512, 60 | multi_gpu=False, 61 | multi_label=self.multi_label, 62 | model_type=self.model_type, 63 | no_cache=True, 64 | ) 65 | 66 | learner = BertLearner.from_pretrained_model( 67 | databunch, 68 | self.model_path, 69 | metrics=[], 70 | device=self.device, 71 | logger=None, 72 | output_dir=None, 73 | warmup_steps=0, 74 | multi_gpu=False, 75 | is_fp16=False, 76 | multi_label=self.multi_label, 77 | logging_steps=0, 78 | ) 79 | 80 | return learner 81 | 82 | def predict_batch(self, texts, verbose=False): 83 | return self.learner.predict_batch(texts, verbose=verbose) 84 | 85 | def predict(self, text, verbose=False): 86 | predictions = self.predict_batch([text], verbose=verbose)[0] 87 | return predictions 88 | 89 | 90 | class BertOnnxClassificationPredictor(object): 91 | def __init__( 92 | self, 93 | model_path, 94 | label_path, 95 | model_name="model.onnx", 96 | multi_label=False, 97 | model_type="bert", 98 | use_fast_tokenizer=True, 99 | do_lower_case=True, 100 | device=None, 101 | ): 102 | if device is None: 103 | device = ( 104 | torch.device("cuda") 105 | if torch.cuda.is_available() 106 | else torch.device("cpu") 107 | ) 108 | 109 | self.model_path = model_path 110 | self.label_path = 
label_path 111 | self.multi_label = multi_label 112 | self.model_type = model_type 113 | self.do_lower_case = do_lower_case 114 | self.device = device 115 | self.labels = [] 116 | 117 | # Use auto-tokenizer 118 | self.tokenizer = AutoTokenizer.from_pretrained( 119 | self.model_path, use_fast=use_fast_tokenizer 120 | ) 121 | 122 | with open(label_path / "labels.csv", "r") as f: 123 | self.labels = f.read().split("\n") 124 | 125 | self.model = load_model(Path(self.model_path) / model_name) 126 | 127 | def predict(self, text, verbose=False): 128 | # Inputs are provided through numpy array 129 | model_inputs = self.tokenizer(text, return_tensors="pt") 130 | inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()} 131 | outputs = self.model.run(None, inputs_onnx) 132 | softmax_preds = softmax(outputs[0]) 133 | preds = list(zip(self.labels, softmax_preds[0])) 134 | return sorted(preds, key=lambda x: x[1], reverse=True) 135 | 136 | 137 | 138 | def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: 139 | x_max = np.max(x, axis=axis, keepdims=True) 140 | tmp = np.exp(x - x_max) 141 | s = np.sum(tmp, axis=axis, keepdims=True) 142 | return tmp / s -------------------------------------------------------------------------------- /fast_bert/prediction_ner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | 4 | from .onnx_helper import load_model 5 | 6 | from .learner_ner import group_entities 7 | from .data_ner import get_labels 8 | 9 | from transformers import AutoTokenizer 10 | import numpy as np 11 | 12 | import warnings 13 | 14 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 15 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 16 | 17 | 18 | class BertNERPredictor(object): 19 | def __init__( 20 | self, 21 | model_path, 22 | label_path, 23 | model_type="bert", 24 | use_fast_tokenizer=True, 25 | do_lower_case=True, 26 | device=None, 27 | ): 28 | if device is None: 29 | device = ( 30 | torch.device("cuda") 31 | if torch.cuda.is_available() 32 | else torch.device("cpu") 33 | ) 34 | 35 | self.model_path = model_path 36 | self.label_path = label_path 37 | self.model_type = model_type 38 | self.do_lower_case = do_lower_case 39 | self.device = device 40 | 41 | # Use auto-tokenizer 42 | self.tokenizer = AutoTokenizer.from_pretrained( 43 | self.model_path, use_fast=use_fast_tokenizer 44 | ) 45 | 46 | self.learner = self.get_learner() 47 | 48 | def get_learner(self): 49 | from .data_ner import BertNERDataBunch 50 | from .learner_ner import BertNERLearner 51 | 52 | databunch = BertNERDataBunch( 53 | self.label_path, 54 | self.tokenizer, 55 | train_file=None, 56 | val_file=None, 57 | batch_size_per_gpu=32, 58 | max_seq_length=512, 59 | multi_gpu=False, 60 | model_type=self.model_type, 61 | no_cache=True, 62 | ) 63 | 64 | learner = BertNERLearner.from_pretrained_model( 65 | databunch, 66 | self.model_path, 67 | device=self.device, 68 | logger=None, 69 | output_dir=None, 70 | warmup_steps=0, 71 | multi_gpu=False, 72 | is_fp16=False, 73 | logging_steps=0, 74 | ) 75 | 76 | return learner 77 | 78 | def predict_batch(self, texts, group=True, exclude_entities=["O"]): 79 | predictions = [] 80 | 81 | for text in texts: 82 | pred = self.predict(text, group=group, exclude_entities=exclude_entities) 83 | if pred: 84 | predictions.append({"text": text, "results": pred}) 85 | 86 | def predict(self, text, group=True, exclude_entities=["O"]): 87 | predictions = 
self.learner.predict( 88 | text, group=group, exclude_entities=exclude_entities 89 | ) 90 | return predictions 91 | 92 | 93 | class BertOnnxNERPredictor(object): 94 | def __init__( 95 | self, 96 | model_path, 97 | label_path, 98 | model_name="model.onnx", 99 | model_type="bert", 100 | use_fast_tokenizer=True, 101 | do_lower_case=True, 102 | device=None, 103 | ): 104 | if device is None: 105 | device = ( 106 | torch.device("cuda") 107 | if torch.cuda.is_available() 108 | else torch.device("cpu") 109 | ) 110 | 111 | self.model_path = model_path 112 | self.label_path = label_path 113 | self.model_type = model_type 114 | self.do_lower_case = do_lower_case 115 | self.device = device 116 | self.labels = [] 117 | 118 | # Use auto-tokenizer 119 | self.tokenizer = AutoTokenizer.from_pretrained( 120 | self.model_path, use_fast=use_fast_tokenizer 121 | ) 122 | 123 | self.labels = get_labels(str(label_path / "labels.txt")) 124 | 125 | self.model = load_model(Path(self.model_path) / model_name) 126 | 127 | def predict(self, text, group=True, exclude_entities=["O"]): 128 | # Inputs are provided through numpy array 129 | tokens = self.tokenizer.tokenize( 130 | self.tokenizer.decode(self.tokenizer.encode(text)) 131 | ) 132 | 133 | model_inputs = self.tokenizer(text, return_tensors="pt") 134 | inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()} 135 | outputs = self.model.run(None, inputs_onnx)[0] 136 | outputs = softmax(outputs) 137 | 138 | predictions = np.argmax(outputs, axis=2) 139 | 140 | preds = [ 141 | (token, self.labels[prediction], output[prediction]) 142 | for token, output, prediction in zip(tokens, outputs[0], predictions[0]) 143 | ][1:-1] 144 | 145 | preds = [ 146 | { 147 | "index": index, 148 | "word": prediction[0], 149 | "entity": prediction[1], 150 | "score": prediction[2], 151 | } 152 | for index, prediction in enumerate(preds) 153 | ] 154 | 155 | if group is True: 156 | preds = group_entities(preds) 157 | 158 | out_preds = [] 159 | for pred in preds: 160 | if pred["entity"] not in exclude_entities: 161 | try: 162 | pred["entity"] = pred["entity"].split("-")[1] 163 | except Exception: 164 | pass 165 | 166 | out_preds.append(pred) 167 | 168 | return out_preds 169 | 170 | def predict_batch(self, texts, group=True, exclude_entities=["O"]): 171 | predictions = [] 172 | 173 | for text in texts: 174 | pred = self.predict(text, group=group, exclude_entities=exclude_entities) 175 | if pred: 176 | predictions.append({"text": text, "results": pred}) 177 | 178 | 179 | def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: 180 | x_max = np.max(x, axis=axis, keepdims=True) 181 | tmp = np.exp(x - x_max) 182 | s = np.sum(tmp, axis=axis, keepdims=True) 183 | return tmp / s 184 | -------------------------------------------------------------------------------- /fast_bert/summarisation/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_bertabs import * 2 | from .modeling_bertabs import * 3 | -------------------------------------------------------------------------------- /fast_bert/summarisation/configuration_bertabs.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ BertAbs configuration """ 17 | import json 18 | import logging 19 | import sys 20 | 21 | from transformers import PretrainedConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | BERTABS_FINETUNED_CONFIG_MAP = { 28 | "bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-config.json", 29 | } 30 | 31 | 32 | class BertAbsConfig(PretrainedConfig): 33 | r""" Class to store the configuration of the BertAbs model. 34 | 35 | Arguments: 36 | vocab_size: int 37 | Number of tokens in the vocabulary. 38 | max_pos: int 39 | The maximum sequence length that this model will be used with. 40 | enc_layers: int 41 | The number of hidden layers in the Transformer encoder. 42 | enc_hidden_size: int 43 | The size of the encoder's layers. 44 | enc_heads: int 45 | The number of attention heads for each attention layer in the encoder. 46 | enc_ff_size: int 47 | The size of the encoder's feed-forward layers. 48 | enc_dropout: float 49 | The dropout probability for all fully connected layers in the 50 | embeddings, layers, pooler and also the attention probabilities in 51 | the encoder. 52 | dec_layers: int 53 | The number of hidden layers in the decoder. 54 | dec_hidden_size: int 55 | The size of the decoder's layers. 56 | dec_heads: int 57 | The number of attention heads for each attention layer in the decoder. 58 | dec_ff_size: int 59 | The size of the decoder's feed-forward layers. 60 | dec_dropout: float 61 | The dropout probability for all fully connected layers in the 62 | embeddings, layers, pooler and also the attention probabilities in 63 | the decoder.
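Example (an illustrative sketch only; the values shown are simply the defaults from the constructor below):

    config = BertAbsConfig(vocab_size=30522, max_pos=512)
    assert config.enc_hidden_size == 512 and config.dec_hidden_size == 768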
64 | """ 65 | 66 | pretrained_config_archive_map = BERTABS_FINETUNED_CONFIG_MAP 67 | 68 | def __init__( 69 | self, 70 | vocab_size=30522, 71 | max_pos=512, 72 | enc_layers=6, 73 | enc_hidden_size=512, 74 | enc_heads=8, 75 | enc_ff_size=512, 76 | enc_dropout=0.2, 77 | dec_layers=6, 78 | dec_hidden_size=768, 79 | dec_heads=8, 80 | dec_ff_size=2048, 81 | dec_dropout=0.2, 82 | **kwargs, 83 | ): 84 | super(BertAbsConfig, self).__init__(**kwargs) 85 | 86 | self.vocab_size = vocab_size 87 | self.max_pos = max_pos 88 | 89 | self.enc_layers = enc_layers 90 | self.enc_hidden_size = enc_hidden_size 91 | self.enc_heads = enc_heads 92 | self.enc_ff_size = enc_ff_size 93 | self.enc_dropout = enc_dropout 94 | 95 | self.dec_layers = dec_layers 96 | self.dec_hidden_size = dec_hidden_size 97 | self.dec_heads = dec_heads 98 | self.dec_ff_size = dec_ff_size 99 | self.dec_dropout = dec_dropout 100 | -------------------------------------------------------------------------------- /fast_bert/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .spellcheck import BingSpellCheck 2 | -------------------------------------------------------------------------------- /fast_bert/utils/spellcheck.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | 4 | 5 | class BingSpellCheck(object): 6 | def __init__(self, key): 7 | self.api_key = key 8 | self.endpoint = "https://api.cognitive.microsoft.com/bing/v7.0/SpellCheck" 9 | 10 | def spell_check(self, text, mode='spell'): 11 | data = {'text': text} 12 | 13 | params = { 14 | 'mkt': 'en-us', 15 | 'mode': mode 16 | } 17 | 18 | headers = { 19 | 'Content-Type': 'application/x-www-form-urlencoded', 20 | 'Ocp-Apim-Subscription-Key': self.api_key, 21 | } 22 | response = requests.post( 23 | self.endpoint, headers=headers, params=params, data=data) 24 | 25 | corrected_spells = response.json() 26 | 27 | flaggedTokens = corrected_spells['flaggedTokens'] 28 | 29 | for flagged in flaggedTokens: 30 | text = text.replace( 31 | flagged['token'], flagged['suggestions'][0]['suggestion']) 32 | 33 | return text 34 | -------------------------------------------------------------------------------- /fast_bert/utils_squad_evaluate.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for SQuAD version 2.0. 2 | Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 3 | In addition to basic functionality, we also compute additional statistics and 4 | plot precision-recall curves if an additional na_prob.json file is provided. 5 | This file is expected to map question ID's to the model's predicted probability 6 | that a question is unanswerable. 
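Typical command-line usage (the file names below are placeholders, not files shipped with this repository):

    python utils_squad_evaluate.py data.json pred.json --na-prob-file na_prob.json --out-file eval.json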
7 | """ 8 | import argparse 9 | import collections 10 | import json 11 | import numpy as np 12 | import os 13 | import re 14 | import string 15 | import sys 16 | 17 | class EVAL_OPTS(): 18 | def __init__(self, data_file, pred_file, out_file="", 19 | na_prob_file="na_prob.json", na_prob_thresh=1.0, 20 | out_image_dir=None, verbose=False): 21 | self.data_file = data_file 22 | self.pred_file = pred_file 23 | self.out_file = out_file 24 | self.na_prob_file = na_prob_file 25 | self.na_prob_thresh = na_prob_thresh 26 | self.out_image_dir = out_image_dir 27 | self.verbose = verbose 28 | 29 | OPTS = None 30 | 31 | def parse_args(): 32 | parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.') 33 | parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.') 34 | parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.') 35 | parser.add_argument('--out-file', '-o', metavar='eval.json', 36 | help='Write accuracy metrics to file (default is stdout).') 37 | parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json', 38 | help='Model estimates of probability of no answer.') 39 | parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0, 40 | help='Predict "" if no-answer probability exceeds this (default = 1.0).') 41 | parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None, 42 | help='Save precision-recall curves to directory.') 43 | parser.add_argument('--verbose', '-v', action='store_true') 44 | if len(sys.argv) == 1: 45 | parser.print_help() 46 | sys.exit(1) 47 | return parser.parse_args() 48 | 49 | def make_qid_to_has_ans(dataset): 50 | qid_to_has_ans = {} 51 | for article in dataset: 52 | for p in article['paragraphs']: 53 | for qa in p['qas']: 54 | qid_to_has_ans[qa['id']] = bool(qa['answers']) 55 | return qid_to_has_ans 56 | 57 | def normalize_answer(s): 58 | """Lower text and remove punctuation, articles and extra whitespace.""" 59 | def remove_articles(text): 60 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) 61 | return re.sub(regex, ' ', text) 62 | def white_space_fix(text): 63 | return ' '.join(text.split()) 64 | def remove_punc(text): 65 | exclude = set(string.punctuation) 66 | return ''.join(ch for ch in text if ch not in exclude) 67 | def lower(text): 68 | return text.lower() 69 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 70 | 71 | def get_tokens(s): 72 | if not s: return [] 73 | return normalize_answer(s).split() 74 | 75 | def compute_exact(a_gold, a_pred): 76 | return int(normalize_answer(a_gold) == normalize_answer(a_pred)) 77 | 78 | def compute_f1(a_gold, a_pred): 79 | gold_toks = get_tokens(a_gold) 80 | pred_toks = get_tokens(a_pred) 81 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks) 82 | num_same = sum(common.values()) 83 | if len(gold_toks) == 0 or len(pred_toks) == 0: 84 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise 85 | return int(gold_toks == pred_toks) 86 | if num_same == 0: 87 | return 0 88 | precision = 1.0 * num_same / len(pred_toks) 89 | recall = 1.0 * num_same / len(gold_toks) 90 | f1 = (2 * precision * recall) / (precision + recall) 91 | return f1 92 | 93 | def get_raw_scores(dataset, preds): 94 | exact_scores = {} 95 | f1_scores = {} 96 | for article in dataset: 97 | for p in article['paragraphs']: 98 | for qa in p['qas']: 99 | qid = qa['id'] 100 | gold_answers = [a['text'] for a in qa['answers'] 101 | if normalize_answer(a['text'])] 102 | if not gold_answers: 103 | # For 
unanswerable questions, only correct answer is empty string 104 | gold_answers = [''] 105 | if qid not in preds: 106 | print('Missing prediction for %s' % qid) 107 | continue 108 | a_pred = preds[qid] 109 | # Take max over all gold answers 110 | exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) 111 | f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers) 112 | return exact_scores, f1_scores 113 | 114 | def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): 115 | new_scores = {} 116 | for qid, s in scores.items(): 117 | pred_na = na_probs[qid] > na_prob_thresh 118 | if pred_na: 119 | new_scores[qid] = float(not qid_to_has_ans[qid]) 120 | else: 121 | new_scores[qid] = s 122 | return new_scores 123 | 124 | def make_eval_dict(exact_scores, f1_scores, qid_list=None): 125 | if not qid_list: 126 | total = len(exact_scores) 127 | return collections.OrderedDict([ 128 | ('exact', 100.0 * sum(exact_scores.values()) / total), 129 | ('f1', 100.0 * sum(f1_scores.values()) / total), 130 | ('total', total), 131 | ]) 132 | else: 133 | total = len(qid_list) 134 | return collections.OrderedDict([ 135 | ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), 136 | ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), 137 | ('total', total), 138 | ]) 139 | 140 | def merge_eval(main_eval, new_eval, prefix): 141 | for k in new_eval: 142 | main_eval['%s_%s' % (prefix, k)] = new_eval[k] 143 | 144 | def plot_pr_curve(precisions, recalls, out_image, title): 145 | plt.step(recalls, precisions, color='b', alpha=0.2, where='post') 146 | plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b') 147 | plt.xlabel('Recall') 148 | plt.ylabel('Precision') 149 | plt.xlim([0.0, 1.05]) 150 | plt.ylim([0.0, 1.05]) 151 | plt.title(title) 152 | plt.savefig(out_image) 153 | plt.clf() 154 | 155 | def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans, 156 | out_image=None, title=None): 157 | qid_list = sorted(na_probs, key=lambda k: na_probs[k]) 158 | true_pos = 0.0 159 | cur_p = 1.0 160 | cur_r = 0.0 161 | precisions = [1.0] 162 | recalls = [0.0] 163 | avg_prec = 0.0 164 | for i, qid in enumerate(qid_list): 165 | if qid_to_has_ans[qid]: 166 | true_pos += scores[qid] 167 | cur_p = true_pos / float(i+1) 168 | cur_r = true_pos / float(num_true_pos) 169 | if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: 170 | # i.e., if we can put a threshold after this point 171 | avg_prec += cur_p * (cur_r - recalls[-1]) 172 | precisions.append(cur_p) 173 | recalls.append(cur_r) 174 | if out_image: 175 | plot_pr_curve(precisions, recalls, out_image, title) 176 | return {'ap': 100.0 * avg_prec} 177 | 178 | def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, 179 | qid_to_has_ans, out_image_dir): 180 | if out_image_dir and not os.path.exists(out_image_dir): 181 | os.makedirs(out_image_dir) 182 | num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) 183 | if num_true_pos == 0: 184 | return 185 | pr_exact = make_precision_recall_eval( 186 | exact_raw, na_probs, num_true_pos, qid_to_has_ans, 187 | out_image=os.path.join(out_image_dir, 'pr_exact.png'), 188 | title='Precision-Recall curve for Exact Match score') 189 | pr_f1 = make_precision_recall_eval( 190 | f1_raw, na_probs, num_true_pos, qid_to_has_ans, 191 | out_image=os.path.join(out_image_dir, 'pr_f1.png'), 192 | title='Precision-Recall curve for F1 score') 193 | oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} 194 | pr_oracle 
= make_precision_recall_eval( 195 | oracle_scores, na_probs, num_true_pos, qid_to_has_ans, 196 | out_image=os.path.join(out_image_dir, 'pr_oracle.png'), 197 | title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)') 198 | merge_eval(main_eval, pr_exact, 'pr_exact') 199 | merge_eval(main_eval, pr_f1, 'pr_f1') 200 | merge_eval(main_eval, pr_oracle, 'pr_oracle') 201 | 202 | def histogram_na_prob(na_probs, qid_list, image_dir, name): 203 | if not qid_list: 204 | return 205 | x = [na_probs[k] for k in qid_list] 206 | weights = np.ones_like(x) / float(len(x)) 207 | plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0)) 208 | plt.xlabel('Model probability of no-answer') 209 | plt.ylabel('Proportion of dataset') 210 | plt.title('Histogram of no-answer probability: %s' % name) 211 | plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name)) 212 | plt.clf() 213 | 214 | def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): 215 | num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) 216 | cur_score = num_no_ans 217 | best_score = cur_score 218 | best_thresh = 0.0 219 | qid_list = sorted(na_probs, key=lambda k: na_probs[k]) 220 | for i, qid in enumerate(qid_list): 221 | if qid not in scores: continue 222 | if qid_to_has_ans[qid]: 223 | diff = scores[qid] 224 | else: 225 | if preds[qid]: 226 | diff = -1 227 | else: 228 | diff = 0 229 | cur_score += diff 230 | if cur_score > best_score: 231 | best_score = cur_score 232 | best_thresh = na_probs[qid] 233 | return 100.0 * best_score / len(scores), best_thresh 234 | 235 | def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): 236 | num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) 237 | cur_score = num_no_ans 238 | best_score = cur_score 239 | best_thresh = 0.0 240 | qid_list = sorted(na_probs, key=lambda k: na_probs[k]) 241 | for i, qid in enumerate(qid_list): 242 | if qid not in scores: continue 243 | if qid_to_has_ans[qid]: 244 | diff = scores[qid] 245 | else: 246 | if preds[qid]: 247 | diff = -1 248 | else: 249 | diff = 0 250 | cur_score += diff 251 | if cur_score > best_score: 252 | best_score = cur_score 253 | best_thresh = na_probs[qid] 254 | 255 | has_ans_score, has_ans_cnt = 0, 0 256 | for qid in qid_list: 257 | if not qid_to_has_ans[qid]: continue 258 | has_ans_cnt += 1 259 | 260 | if qid not in scores: continue 261 | has_ans_score += scores[qid] 262 | 263 | return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt 264 | 265 | def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): 266 | best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) 267 | best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) 268 | main_eval['best_exact'] = best_exact 269 | main_eval['best_exact_thresh'] = exact_thresh 270 | main_eval['best_f1'] = best_f1 271 | main_eval['best_f1_thresh'] = f1_thresh 272 | 273 | def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): 274 | best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) 275 | best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) 276 | main_eval['best_exact'] = best_exact 277 | main_eval['best_exact_thresh'] = exact_thresh 278 | main_eval['best_f1'] = best_f1 279 | main_eval['best_f1_thresh'] = f1_thresh 280 | main_eval['has_ans_exact'] = has_ans_exact 281 | main_eval['has_ans_f1'] = has_ans_f1 
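# Illustrative sketch (not part of the original evaluation script): the evaluator can also be
# driven programmatically rather than via the CLI. The file names below are placeholders and
# must point at a SQuAD v2.0 data file, a predictions file, and a no-answer-probability file.
#
#     from fast_bert.utils_squad_evaluate import EVAL_OPTS, main
#
#     opts = EVAL_OPTS(data_file="dev-v2.0.json", pred_file="predictions.json",
#                      na_prob_file="na_prob.json", na_prob_thresh=1.0)
#     results = main(opts)  # OrderedDict with "exact", "f1", "total" and, when na_prob_file is given, best-threshold keys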
282 | 283 | def main(OPTS): 284 | with open(OPTS.data_file) as f: 285 | dataset_json = json.load(f) 286 | dataset = dataset_json['data'] 287 | with open(OPTS.pred_file) as f: 288 | preds = json.load(f) 289 | if OPTS.na_prob_file: 290 | with open(OPTS.na_prob_file) as f: 291 | na_probs = json.load(f) 292 | else: 293 | na_probs = {k: 0.0 for k in preds} 294 | qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False 295 | has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] 296 | no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] 297 | exact_raw, f1_raw = get_raw_scores(dataset, preds) 298 | exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, 299 | OPTS.na_prob_thresh) 300 | f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, 301 | OPTS.na_prob_thresh) 302 | out_eval = make_eval_dict(exact_thresh, f1_thresh) 303 | if has_ans_qids: 304 | has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids) 305 | merge_eval(out_eval, has_ans_eval, 'HasAns') 306 | if no_ans_qids: 307 | no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) 308 | merge_eval(out_eval, no_ans_eval, 'NoAns') 309 | if OPTS.na_prob_file: 310 | find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans) 311 | if OPTS.na_prob_file and OPTS.out_image_dir: 312 | run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, 313 | qid_to_has_ans, OPTS.out_image_dir) 314 | histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns') 315 | histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns') 316 | if OPTS.out_file: 317 | with open(OPTS.out_file, 'w') as f: 318 | json.dump(out_eval, f) 319 | else: 320 | print(json.dumps(out_eval, indent=2)) 321 | return out_eval 322 | 323 | if __name__ == '__main__': 324 | OPTS = parse_args() 325 | if OPTS.out_image_dir: 326 | import matplotlib 327 | matplotlib.use('Agg') 328 | import matplotlib.pyplot as plt 329 | main(OPTS) -------------------------------------------------------------------------------- /images/lr_finder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utterworks/fast-bert/cff2f913c0c01a85d8c998afb3de6c33fa8bf07a/images/lr_finder.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | pytorch-lamb 3 | tensorboardX 4 | fastprogress 5 | scikit-learn 6 | seqeval 7 | transformers==4.22.* 8 | pandas 9 | python-box 10 | more-itertools 11 | onnx 12 | onnxruntime 13 | onnxruntime-tools 14 | -------------------------------------------------------------------------------- /sample_data/imdb_movie_reviews/label/labels.csv: -------------------------------------------------------------------------------- 1 | 0 2 | 1 -------------------------------------------------------------------------------- /sample_data/multi_label_toxic_comments/label/labels.csv: -------------------------------------------------------------------------------- 1 | toxic 2 | severe_toxic 3 | obscene 4 | threat 5 | insult 6 | identity_hate -------------------------------------------------------------------------------- /sample_notebooks/toxic_comments_sagemaker.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 
| "outputs": [], 8 | "source": [ 9 | "import sagemaker\n", 10 | "from pathlib import Path\n", 11 | "from sagemaker.predictor import json_serializer\n", 12 | "import json" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "role = sagemaker.get_execution_role()\n", 22 | "session = sagemaker.Session()" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Setup Path " 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# location for train.csv, val.csv and labels.csv\n", 39 | "DATA_PATH = Path(\"../data/\") \n", 40 | "\n", 41 | "# Location for storing training_config.json\n", 42 | "CONFIG_PATH = DATA_PATH/'config'\n", 43 | "CONFIG_PATH.mkdir(exist_ok=True)\n", 44 | "\n", 45 | "# S3 bucket name\n", 46 | "bucket = 'sagemaker-deep-learning'\n", 47 | "\n", 48 | "# Prefix for S3 bucket for input and output\n", 49 | "prefix = 'toxic_comments/input'\n", 50 | "prefix_output = 'toxic_comments/output'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Hyperparameters & Training Config" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "hyperparameters = {\n", 67 | " \"epochs\": 10,\n", 68 | " \"lr\": 8e-5,\n", 69 | " \"max_seq_length\": 512,\n", 70 | " \"train_batch_size\": 16,\n", 71 | " \"lr_schedule\": \"warmup_cosine\",\n", 72 | " \"warmup_steps\": 1000,\n", 73 | " \"optimizer_type\": \"adamw\"\n", 74 | "}" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "training_config = {\n", 84 | " \"run_text\": \"toxic comments\",\n", 85 | " \"finetuned_model\": None,\n", 86 | " \"do_lower_case\": \"True\",\n", 87 | " \"train_file\": \"train.csv\",\n", 88 | " \"val_file\": \"val.csv\",\n", 89 | " \"label_file\": \"labels.csv\",\n", 90 | " \"text_col\": \"comment_text\",\n", 91 | " \"label_col\": '[\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]',\n", 92 | " \"multi_label\": \"True\",\n", 93 | " \"grad_accumulation_steps\": \"1\",\n", 94 | " \"fp16_opt_level\": \"O1\",\n", 95 | " \"fp16\": \"True\",\n", 96 | " \"model_type\": \"roberta\",\n", 97 | " \"model_name\": \"roberta-base\",\n", 98 | " \"logging_steps\": \"300\"\n", 99 | "}\n", 100 | "\n", 101 | "with open(CONFIG_PATH/'training_config.json', 'w') as f:\n", 102 | " json.dump(training_config, f)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Upload Data" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# This is a helper feature to upload data\n", 119 | "# from your local machine to S3 bucket.\n", 120 | "\n", 121 | "s3_input = session.upload_data(DATA_PATH, bucket=bucket , key_prefix=prefix)\n", 122 | "\n", 123 | "session.upload_data(str(DATA_PATH/'labels.csv'), bucket=bucket , key_prefix=prefix)\n", 124 | "session.upload_data(str(DATA_PATH/'train.csv'), bucket=bucket , key_prefix=prefix)\n", 125 | "session.upload_data(str(DATA_PATH/'val.csv'), bucket=bucket , key_prefix=prefix)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Create an Estimator and 
start training" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "account = session.boto_session.client('sts').get_caller_identity()['Account']\n", 142 | "region = session.boto_session.region_name\n", 143 | "\n", 144 | "image = \"{}.dkr.ecr.{}.amazonaws.com/sagemaker-bert:1.0-gpu-py36\".format(account, region)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "output_path = \"s3://{}/{}\".format(bucket, prefix_output)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "estimator = sagemaker.estimator.Estimator(image, \n", 163 | " role,\n", 164 | " train_instance_count=1, \n", 165 | " train_instance_type='ml.p3.8xlarge', \n", 166 | " output_path=output_path, \n", 167 | " base_job_name='toxic-comments',\n", 168 | " hyperparameters=hyperparameters,\n", 169 | " sagemaker_session=session\n", 170 | " )" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "estimator.fit(s3_input)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## Deploy the model to hosting service" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "predictor = estimator.deploy(1, \n", 196 | " 'ml.m5.large', \n", 197 | " endpoint_name='bert-toxic-comments', \n", 198 | " update_endpoint=True, \n", 199 | " serializer=json_serializer)" 200 | ] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Python 3", 206 | "language": "python", 207 | "name": "python3" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.6.5" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 2 224 | } 225 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from io import open 3 | from setuptools import setup, find_packages 4 | import subprocess 5 | 6 | 7 | with open("requirements.txt") as f: 8 | install_requires = f.read().strip().split("\n") 9 | 10 | # get version from VERSION.txt 11 | with open("VERSION.txt") as f: 12 | version = f.read().strip() 13 | 14 | setup( 15 | name="fast_bert", 16 | # get version from VERSION file 17 | version=version, 18 | description="AI Library using BERT", 19 | author="Kaushal Trivedi", 20 | author_email="kaushaltrivedi@me.com", 21 | license="Apache2", 22 | url="https://github.com/kaushaltrivedi/fast-bert", 23 | long_description=open("README.md", "r", encoding="utf-8").read(), 24 | long_description_content_type="text/markdown", 25 | keywords="BERT NLP deep learning google", 26 | packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 27 | install_requires=install_requires, 28 | classifiers=[ 29 | "Intended Audience :: Science/Research", 30 | "License :: OSI Approved :: Apache Software License", 31 | "Programming Language :: Python :: 3", 32 | 
"Topic :: Scientific/Engineering :: Artificial Intelligence", 33 | ], 34 | zip_safe=False, 35 | ) 36 | -------------------------------------------------------------------------------- /tag_release.sh: -------------------------------------------------------------------------------- 1 | 2 | # get tag name from VERSION file 3 | TAG_NAME=v$(cat VERSION.txt) 4 | push_message="${1:-update}" 5 | git add . && git commit -m "$push_message" && git tag $TAG_NAME -m "tag $TAG_NAME" && git push origin $TAG_NAME 6 | git push origin main 7 | -------------------------------------------------------------------------------- /test/summarisation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from fast_bert.data_abs import BertAbsDataBunch\n", 10 | "from fast_bert.learner_abs import BertAbsLearner\n", 11 | "from box import Box\n", 12 | "import logging\n", 13 | "import torch\n", 14 | "from pathlib import Path\n", 15 | "from transformers import BertTokenizer" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from tokenizers import (ByteLevelBPETokenizer,\n", 25 | " BPETokenizer,\n", 26 | " SentencePieceBPETokenizer,\n", 27 | " BertWordPieceTokenizer)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "PATH = Path(\"../../summarisation/\")\n", 37 | "DATA_PATH = PATH/'data'\n", 38 | "MODEL_PATH = PATH/'model'" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "logger = logging.getLogger()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "args = Box({\n", 57 | " \"max_seq_length\": 512,\n", 58 | " \"batch_size\": 8,\n", 59 | " \"learning_rate\": 5e-3,\n", 60 | " \"num_train_epochs\": 6,\n", 61 | " \"fp16\": True,\n", 62 | " \"model_name\": 'bertabs-finetuned-cnndm',\n", 63 | " \"model_type\": 'bert'\n", 64 | "})" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "device = torch.device('cuda') if torch.cuda.device_count() else torch.device('cpu')\n", 81 | "if torch.cuda.device_count() > 1:\n", 82 | " args.multi_gpu = True\n", 83 | "else:\n", 84 | " args.multi_gpu = False" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "tokenizer = BertWordPieceTokenizer(str(MODEL_PATH/'vocab.txt'), lowercase=True)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "databunch = BertAbsDataBunch(data_dir=DATA_PATH, tokenizer=tokenizer, device=device)\n", 103 | "databunch_old = BertAbsDataBunch(data_dir=DATA_PATH, tokenizer=args.model_name, device=device)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "databunch_with_data = BertAbsDataBunch(data_dir=DATA_PATH, tokenizer=args.model_name, 
device=device)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "databunch_with_data_new_tokeniser = BertAbsDataBunch(data_dir=DATA_PATH, tokenizer=tokenizer, device=device)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "learner = BertAbsLearner.from_pretrained_model(databunch, MODEL_PATH, device, logger=logger)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "texts = databunch_with_data.test_dl.dataset[0][1]\n", 140 | "texts" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "%%timeit\n", 150 | "learner.predict_batch(texts)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "learner_old = BertAbsLearner.from_pretrained_model(databunch_old, MODEL_PATH, device, logger=logger)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "%%timeit\n", 169 | "learner_old.predict_batch(texts)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 3", 190 | "language": "python", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 3 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython3", 203 | "version": "3.7.4" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 4 208 | } 209 | --------------------------------------------------------------------------------