├── logs
│   └── .gitignore
├── scripts
│   ├── __init__.py
│   ├── run_long_lm.py
│   └── finetune_qa_models.py
├── src
│   └── __init__.py
├── report
│   ├── Master Thesis.pdf
│   └── Thesis Presentation.pdf
├── .env.template
├── LICENSE
├── Makefile
├── docker-compose.yaml
├── Pretraining_Details.md
├── Dockerfile
├── Finetuning_Details.md
├── .gitignore
├── requirements.txt
├── notebooks
│   ├── Longformer TriviaQA.ipynb
│   ├── Convert to Long.ipynb
│   └── Try Train Longformer SQuAD.ipynb
└── README.md

/logs/.gitignore:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | from . import tracking
2 | from . import lib
--------------------------------------------------------------------------------
/report/Master Thesis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarkusSagen/Master-Thesis-Multilingual-Longformer/HEAD/report/Master Thesis.pdf
--------------------------------------------------------------------------------
/.env.template:
--------------------------------------------------------------------------------
1 | PROJECT_NAME=xlm-l
2 | DATA_DIR=
3 | MODEL_DIR=
4 | GPU_IDS=0
5 | JUPYTER_PW=
6 | JUPYTER_PORT=8999
7 | PRIVATE_DEPS=none
--------------------------------------------------------------------------------
/report/Thesis Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarkusSagen/Master-Thesis-Multilingual-Longformer/HEAD/report/Thesis Presentation.pdf
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2021 Markus Sagen
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | include .env
2 | 
3 | export USER_ID := $(shell id -u)
4 | export USER_NAME := $(shell whoami)
5 | export PROJECT_DIR := $(shell pwd)
6 | export COMPOSE_CMD := docker-compose -f docker-compose.yaml -p ${PROJECT_NAME}_${USER_NAME}
7 | export PKG_DIR := pkg
8 | 
9 | # Enable running on machines with no GPU
10 | ifeq (${GPU_IDS}, none)
11 | export RUNTIME := runc
12 | else
13 | export RUNTIME := nvidia
14 | endif
15 | 
16 | # Enable pulling in dependencies in private repos
17 | ifneq (${PRIVATE_DEPS}, none)
18 | clone_private_deps := for item in ${PRIVATE_DEPS}; do \
19 | 	git clone $$item ${PKG_DIR}/$$item; \
20 | 	echo $$item; \
21 | 	done
22 | else
23 | clone_private_deps := echo "Nothing to clone"
24 | endif
25 | 
26 | .PHONY: build
27 | build:
28 | 	mkdir -p ${PKG_DIR}
29 | 	$(call clone_private_deps)
30 | 	$(COMPOSE_CMD) build
31 | 	rm -rf ${PKG_DIR}
32 | 
33 | .PHONY: logs
34 | logs:
35 | 	${COMPOSE_CMD} logs
36 | 
37 | .PHONY: up
38 | up:
39 | 	$(COMPOSE_CMD) up --detach
40 | 
41 | .PHONY: down
42 | down:
43 | 	$(COMPOSE_CMD) down
44 | 
45 | .PHONY: repl
46 | repl:
47 | 	${COMPOSE_CMD} exec repl python3 $(run)
48 | 
49 | .PHONY: ipython
50 | ipython:
51 | 	${COMPOSE_CMD} exec repl ipython $(run)
52 | 
53 | 
54 | .PHONY: shell
55 | shell:
56 | 	${COMPOSE_CMD} exec repl bash
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '2.3'
2 | services:
3 |   jupyter:
4 |     image: ${PROJECT_NAME}
5 |     command: jupyter lab --ip=0.0.0.0 --no-browser --NotebookApp.token='${JUPYTER_PW}'
6 |     build:
7 |       context: .
8 |       dockerfile: Dockerfile
9 |       args:
10 |         - PKG_DIR=${PKG_DIR}
11 |         - PRIVATE_DEPS=${PRIVATE_DEPS}
12 |     shm_size: '16gb'
13 |     ports:
14 |       - ${JUPYTER_PORT}:8888
15 |     user: ${USER_ID}:${USER_ID}
16 |     runtime: ${RUNTIME}
17 |     network_mode: bridge
18 |     environment:
19 |       - NVIDIA_VISIBLE_DEVICES=${GPU_IDS}
20 |     volumes:
21 |       - ${DATA_DIR}:/workspace/data
22 |       - ${MODEL_DIR}:/workspace/models
23 |       - ${PROJECT_DIR}/src:/workspace/src
24 |       - ${PROJECT_DIR}/notebooks:/workspace/notebooks
25 |       - ${PROJECT_DIR}/logs:/workspace/logs
26 | 
27 |   repl:
28 |     image: ${PROJECT_NAME}
29 |     tty: true
30 |     shm_size: '16gb'
31 |     user: ${USER_ID}:${USER_ID}
32 |     runtime: ${RUNTIME}
33 |     network_mode: bridge
34 |     environment:
35 |       - NVIDIA_VISIBLE_DEVICES=${GPU_IDS}
36 |     volumes:
37 |       - ${DATA_DIR}:/workspace/data
38 |       - ${MODEL_DIR}:/workspace/models
39 |       - ${PROJECT_DIR}/src:/workspace/src
40 |       - ${PROJECT_DIR}/scripts:/workspace/scripts
41 |       - ${PROJECT_DIR}/logs:/workspace/logs
--------------------------------------------------------------------------------
/Pretraining_Details.md:
--------------------------------------------------------------------------------
1 | # Pre-Training Details
2 | 
3 | ### Models
4 | The conversion of transformer models is based on the [Longformer conversion script](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb). The script can be run for any pre-trained RoBERTa-based model and can be extended to work with other pre-trained models.
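5 | 
6 | In essence, the conversion grows the learned position embeddings by tiling the pre-trained ones and swaps every self-attention layer for a `LongformerSelfAttention`. The following is a condensed, illustrative sketch of what `scripts/run_long_lm.py` does (`convert_to_long` is a name used here for illustration only):
7 | 
8 | ```python
9 | import torch
10 | from transformers import RobertaForMaskedLM, RobertaTokenizerFast
11 | from transformers.modeling_longformer import LongformerSelfAttention
12 | 
13 | 
14 | def convert_to_long(model, tokenizer, attention_window=512, max_pos=4096):
15 |     config = model.config
16 |     pos_embed = model.roberta.embeddings.position_embeddings
17 |     current_max_pos, embed_size = pos_embed.weight.shape
18 | 
19 |     # RoBERTa reserves positions 0 and 1, so the embedding matrix
20 |     # holds max position + 2 rows.
21 |     max_pos += 2
22 |     config.max_position_embeddings = max_pos
23 | 
24 |     # Initialize the longer position embeddings by tiling the pre-trained ones.
25 |     new_pos_embed = pos_embed.weight.new_zeros(max_pos, embed_size)
26 |     k, step = 2, current_max_pos - 2
27 |     while k < max_pos - 1:
28 |         new_pos_embed[k:k + step] = pos_embed.weight[2:]
29 |         k += step
30 |     pos_embed.weight.data = new_pos_embed
31 |     model.roberta.embeddings.position_ids = torch.arange(0, max_pos)[None]
32 | 
33 |     # Swap each self-attention layer; the global attention projections
34 |     # start out as copies of the trained local ones.
35 |     config.attention_window = [attention_window] * config.num_hidden_layers
36 |     for i, layer in enumerate(model.roberta.encoder.layer):
37 |         long_attn = LongformerSelfAttention(config, layer_id=i)
38 |         long_attn.query = layer.attention.self.query
39 |         long_attn.key = layer.attention.self.key
40 |         long_attn.value = layer.attention.self.value
41 |         long_attn.query_global = layer.attention.self.query
42 |         long_attn.key_global = layer.attention.self.key
43 |         long_attn.value_global = layer.attention.self.value
44 |         layer.attention.self = long_attn
45 |     return model, tokenizer
46 | 
47 | 
48 | model = RobertaForMaskedLM.from_pretrained("roberta-base")
49 | tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", model_max_length=4096)
50 | model, tokenizer = convert_to_long(model, tokenizer)
51 | ```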
52 | 
53 | Training with these parameters on a 48GB GPU takes ~5 days.
54 | We pre-trained both a monolingual RoBERTa and a multilingual XLM-R model using the Longformer pre-training scheme to extend the context length of the models. Both models were trained on the same datasets with the same hyper-parameters, and only with one seed because of the long training time.
55 | 
56 | The argument `MAX_POS` indicates how many tokens the model should learn to attend to. This number must be of the form $2^x$ and larger than $512$.
57 | 
58 | The `MODEL_NAME_OR_PATH` indicates the pre-trained model from which the Longformer is extended. It must be the name of a pre-trained model available at [Huggingface](https://huggingface.co/models), such as `roberta-base`, `xlm-roberta-base`, or similar. The pre-training scheme should in theory work for all encoder-type Transformers, such as BERT, RoBERTa, ALBERT, etc. However, we have only tested it for RoBERTa and XLM-R, so the training script may need to be changed if used for BERT.
59 | 
60 | We refer to the models that we have trained using the Longformer pre-training scheme as:
61 | 
62 | 1. `RoBERTa-Long`
63 | 2. `XLM-Long`
64 | 
65 | 
66 | 
67 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | 
2 | # https://hub.docker.com/r/huggingface/transformers-pytorch-gpu/dockerfile
3 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
4 | 
5 | ARG PKG_DIR
6 | ARG PRIVATE_DEPS
7 | 
8 | WORKDIR /workspace
9 | 
10 | RUN apt update && \
11 |     apt install -y bash \
12 |     build-essential \
13 |     git \
14 |     wget \
15 |     curl \
16 |     ca-certificates \
17 |     python3 \
18 |     python3-pip && \
19 |     rm -rf /var/lib/apt/lists
20 | 
21 | # RUN apt-get update && apt-get install -y git
22 | 
23 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
24 |     python3 -m pip install --no-cache-dir \
25 |     mkl \
26 |     torch
27 | 
28 | #RUN git clone https://github.com/NVIDIA/apex
29 | #RUN cd apex && \
30 | #    python3 setup.py install && \
31 | #    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
32 | 
33 | 
34 | # Install packages from private repositories
35 | COPY ${PKG_DIR}/ /pkg/
36 | RUN if [ "${PRIVATE_DEPS}" != "none" ]; then \
37 |     for pkg in /pkg/*/* ; \
38 |     do pip install -e $pkg ; \
39 |     done; \
40 |     fi
41 | 
42 | 
43 | # Fix permissions
44 | RUN chmod 0777 /workspace
45 | RUN mkdir /.local && chmod 0777 /.local
46 | RUN mkdir /.jupyter && chmod 0777 /.jupyter
47 | RUN mkdir /.cache && chmod 0777 /.cache
48 | # Workaround for transformers library permissions
49 | RUN mkdir /.config && chmod 0777 /.config
50 | 
51 | # Install python packages
52 | ADD src ./src
53 | ADD requirements.txt .
54 | RUN pip install -r requirements.txt
--------------------------------------------------------------------------------
/Finetuning_Details.md:
--------------------------------------------------------------------------------
1 | # Fine-Tuning Details
2 | 
3 | 
4 | We fine-tune and evaluate on these datasets using several pre-trained models released by Huggingface and compare them with the long-context (Longformer-type) models we have trained.
5 | 
6 | We have divided the models first by the number of languages, then by the specific dataset, and finally by which model was fine-tuned. The datasets SQ3 and XQ3 are the long-context variants (with concatenated contexts) of the SQuAD and XQuAD datasets; a sketch of such a concatenation is shown below.
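7 | 
8 | As an illustration of how a concatenated long-context example could be built (a minimal sketch only; the exact construction used for SQ3/XQ3 may differ, and `build_long_example` is a name invented here):
9 | 
10 | ```python
11 | def build_long_example(examples, gold_idx, nr_concats=3):
12 |     """Concatenate `nr_concats` SQuAD-style contexts into one long context.
13 | 
14 |     `examples` is a list of dicts with "context", "question" and "answers"
15 |     keys; the question and answers are taken from `examples[gold_idx]`,
16 |     and the answer spans are shifted by the length of the preceding contexts.
17 |     """
18 |     contexts = [ex["context"] for ex in examples[:nr_concats]]
19 |     long_context = " ".join(contexts)
20 | 
21 |     # Offset of the gold context inside the concatenated string
22 |     # (+1 per preceding context for the joining space).
23 |     offset = sum(len(c) + 1 for c in contexts[:gold_idx])
24 | 
25 |     gold = examples[gold_idx]
26 |     answers = dict(gold["answers"])
27 |     answers["answer_start"] = [s + offset for s in answers["answer_start"]]
28 |     return {"context": long_context, "question": gold["question"], "answers": answers}
29 | ```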
30 | To better understand and evaluate how performance was affected when creating a new dataset, we chose to fine-tune on the SQ3 and XQ3 datasets using either the regular attention window (512 tokens) or the attention window learned by the Longformer-trained models (4096 tokens). These datasets are denoted SQ3 (512) and SQ3 (2048) respectively for the English dataset, and XQ3 (512) and XQ3 (2048) for the multilingual datasets.
31 | 
32 | The long-context models are trained on a longer context than 2048 tokens, but we restricted the long-context datasets to this many tokens at a time, since the models did not otherwise fit in memory on a 48GB GPU.
33 | 
34 | #### Context lengths
35 | Depending on the number of contexts one chooses to concatenate together, the maximum number of tokens the model can attend to also changes. The maximum number of contexts we managed to run on a 48GB GPU was 3 concatenated contexts, which corresponded to an average concatenated context length of slightly below 2048 tokens. Therefore, for the concatenated long datasets, we set the hyper-parameters --nr\_concats=3 and --max\_length=2048. If you want to test out other values, we suggest the following pairings:
36 | 
37 | concats=1, max\_length=512
38 | concats=3, max\_length=2048
39 | concats=5, max\_length=4096
40 | 
41 | 
42 | #### Seeds
43 | Each model is trained with 5 different seeds. To replicate our experiments, re-run each code segment and replace the SEED with the following seeds:
44 | 
45 | - 42
46 | - 1337
47 | - 1729
48 | - 165
49 | - 758241
50 | 
51 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 | 
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 | 
68 | # Scrapy stuff:
69 | .scrapy
70 | 
71 | # Sphinx documentation
72 | docs/_build/
73 | 
74 | # PyBuilder
75 | target/
76 | 
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 | 
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 | 
84 | # pyenv
85 | .python-version
86 | 
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | anyio==2.1.0 3 | argon2-cffi==20.1.0 4 | asn1crypto==0.24.0 5 | async-generator==1.10 6 | attrs==20.3.0 7 | Babel==2.9.0 8 | backcall==0.2.0 9 | bleach==3.3.0 10 | cached-property==1.5.2 11 | cachetools==4.2.1 12 | certifi==2020.12.5 13 | cffi==1.14.5 14 | chardet==4.0.0 15 | click==7.1.2 16 | cloudpickle==1.6.0 17 | colorama==0.4.4 18 | contextvars==2.4 19 | cryptography==2.1.4 20 | cycler==0.10.0 21 | Cython==0.29.21 22 | dask==2021.2.0 23 | dataclasses==0.8 24 | datasets==1.3.0 25 | decorator==4.4.2 26 | defusedxml==0.6.0 27 | dill==0.3.3 28 | distributed==2021.2.0 29 | dnspython==2.1.0 30 | docopt==0.6.2 31 | entrypoints==0.3 32 | filelock==3.0.12 33 | fsspec==0.8.5 34 | gitdb==4.0.5 35 | GitPython==3.1.13 36 | google-auth==1.27.0 37 | google-auth-oauthlib==0.4.2 38 | graphviz==0.16 39 | grpcio==1.35.0 40 | h5py==3.1.0 41 | HeapDict==1.0.1 42 | hiddenlayer==0.3 43 | huggingface-hub==0.0.2 44 | idna==2.10 45 | immutables==0.15 46 | importlib-metadata==3.4.0 47 | intel-openmp==2021.1.2 48 | ipykernel==5.5.0 49 | ipython==7.16.1 50 | ipython-genutils==0.2.0 51 | ipywidgets==7.6.3 52 | jedi==0.18.0 53 | Jinja2==2.11.3 54 | joblib==1.0.1 55 | json5==0.9.5 56 | jsonpickle==1.5.2 57 | jsonschema==3.2.0 58 | jupyter-client==6.1.11 59 | jupyter-core==4.7.1 60 | jupyter-server==1.4.0 61 | jupyterlab==3.0.8 62 | jupyterlab-pygments==0.1.2 63 | jupyterlab-server==2.3.0 64 | jupyterlab-widgets==1.0.0 65 | keyring==10.6.0 66 | keyrings.alt==3.0 67 | kiwisolver==1.3.1 68 | Markdown==3.3.3 69 | MarkupSafe==1.1.1 70 | matplotlib==3.3.4 71 | mistune==0.8.4 72 | mkl==2021.1.1 73 | msgpack==1.0.2 74 | multiprocess==0.70.11.1 75 | munch==2.5.0 76 | nbclassic==0.2.6 77 | nbclient==0.5.2 78 | nbconvert==6.0.7 79 | nbformat==5.1.2 80 | nest-asyncio==1.5.1 81 | notebook==6.2.0 82 | numpy==1.19.5 83 | oauthlib==3.1.0 84 | packaging==20.9 85 | pandas==1.1.5 86 | pandocfilters==1.4.3 87 | parso==0.8.1 88 | pexpect==4.8.0 89 | pickleshare==0.7.5 90 | Pillow==8.1.0 91 | pip==20.3.3 92 | prometheus-client==0.9.0 93 | prompt-toolkit==3.0.16 94 | protobuf==3.15.0 95 | psutil==5.8.0 96 | ptyprocess==0.7.0 97 | py-cpuinfo==7.0.0 98 | pyarrow==1.0.1 99 | pyasn1==0.4.8 100 | pyasn1-modules==0.2.8 101 | pycparser==2.20 102 | pycrypto==2.6.1 103 | Pygments==2.8.0 104 | pygobject==3.26.1 105 | pymongo==3.11.3 106 | pyparsing==2.4.7 107 | pyrsistent==0.17.3 108 | python-dateutil==2.8.1 109 | pytz==2021.1 110 | pyxdg==0.25 111 | PyYAML==5.4.1 112 | pyzmq==22.0.3 113 | regex==2020.11.13 114 | requests==2.25.1 115 | requests-oauthlib==1.3.0 116 | rsa==4.7.1 117 | sacred==0.8.2 118 | sacremoses==0.0.43 119 | scikit-learn==0.24.1 120 | scipy==1.5.4 121 | seaborn==0.11.1 122 | SecretStorage==2.3.1 
123 | Send2Trash==1.5.0 124 | sentencepiece==0.1.95 125 | setuptools==53.0.0 126 | six==1.11.0 127 | sklearn 128 | smmap==3.0.5 129 | sniffio==1.2.0 130 | sortedcontainers==2.3.0 131 | tbb==2021.1.1 132 | tblib==1.7.0 133 | tensorboard==2.4.1 134 | tensorboard-plugin-wit==1.8.0 135 | terminado==0.9.2 136 | testpath==0.4.4 137 | threadpoolctl==2.1.0 138 | tokenizers==0.9.2 139 | toolz==0.11.1 140 | torch==1.7.1 141 | torchsummary==1.5.1 142 | tornado==6.1 143 | tqdm==4.49.0 144 | traitlets==4.3.3 145 | transformers==3.4.0 146 | typing-extensions==3.7.4.3 147 | urllib3==1.26.3 148 | wcwidth==0.2.5 149 | webencodings==0.5.1 150 | Werkzeug==1.0.1 151 | wget==3.2 152 | wheel==0.30.0 153 | widgetsnbextension==3.5.1 154 | wrapt==1.12.1 155 | xxhash==2.0.0 156 | zict==2.0.0 157 | zipp==3.4.0 158 | -------------------------------------------------------------------------------- /notebooks/Longformer TriviaQA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import nlp\n", 10 | "import torch\n", 11 | "import datasets\n", 12 | "\n", 13 | "# ATTENTION. Rerunning this command remove the cached trivia qa dataset completely \n", 14 | "#!rm -rf /.cache/" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "trivia_qa wikitext-103-raw\n", 27 | "mkdir: cannot create directory '../data/trivia_qa': File exists\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "# https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb#scrollTo=wyDYG4YDXFV7\n", 33 | "!ls ../data\n", 34 | "!mkdir ../data/trivia_qa" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "%%time\n", 44 | "validation_dataset = datasets.load_dataset(\"trivia_qa\", \"rc\", split=\"validation[:5%]\", cache_dir=\"/workspace/data/trivia_qa\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "absl-py==0.11.0\n", 57 | "apex==0.1\n", 58 | "argon2-cffi==20.1.0\n", 59 | "asn1crypto==0.24.0\n", 60 | "async-generator==1.10\n", 61 | "attrs==20.3.0\n", 62 | "backcall==0.2.0\n", 63 | "bleach==3.2.1\n", 64 | "cached-property==1.5.2\n", 65 | "cachetools==4.1.1\n", 66 | "certifi==2020.11.8\n", 67 | "cffi==1.14.4\n", 68 | "chardet==3.0.4\n", 69 | "click==7.1.2\n", 70 | "cloudpickle==1.6.0\n", 71 | "colorama==0.4.4\n", 72 | "contextvars==2.4\n", 73 | "cryptography==2.1.4\n", 74 | "cycler==0.10.0\n", 75 | "Cython==0.29.21\n", 76 | "dask==2.30.0\n", 77 | "dataclasses==0.8\n", 78 | "datasets==1.1.3\n", 79 | "decorator==4.4.2\n", 80 | "defusedxml==0.6.0\n", 81 | "dill==0.3.3\n", 82 | "distributed==2.30.1\n", 83 | "dnspython==2.0.0\n", 84 | "docopt==0.6.2\n", 85 | "entrypoints==0.3\n", 86 | "filelock==3.0.12\n", 87 | "future==0.18.2\n", 88 | "gitdb==4.0.5\n", 89 | "GitPython==3.1.11\n", 90 | "google-auth==1.23.0\n", 91 | "google-auth-oauthlib==0.4.2\n", 92 | "graphviz==0.15\n", 93 | "grpcio==1.33.2\n", 94 | "h5py==3.1.0\n", 95 | "HeapDict==1.0.1\n", 96 | "hiddenlayer==0.3\n", 97 | "idna==2.6\n", 98 | "immutables==0.14\n", 99 | "importlib-metadata==3.1.0\n", 
100 | "intel-openmp==2020.0.133\n", 101 | "ipykernel==5.3.4\n", 102 | "ipython==7.16.1\n", 103 | "ipython-genutils==0.2.0\n", 104 | "ipywidgets==7.5.1\n", 105 | "jedi==0.17.2\n", 106 | "Jinja2==2.11.2\n", 107 | "joblib==0.17.0\n", 108 | "json5==0.9.5\n", 109 | "jsonpickle==1.4.1\n", 110 | "jsonschema==3.2.0\n", 111 | "jupyter-client==6.1.7\n", 112 | "jupyter-core==4.7.0\n", 113 | "jupyterlab==2.2.9\n", 114 | "jupyterlab-pygments==0.1.2\n", 115 | "jupyterlab-server==1.2.0\n", 116 | "keyring==10.6.0\n", 117 | "keyrings.alt==3.0\n", 118 | "kiwisolver==1.3.1\n", 119 | "Markdown==3.3.3\n", 120 | "MarkupSafe==1.1.1\n", 121 | "matplotlib==3.3.3\n", 122 | "mistune==0.8.4\n", 123 | "mkl==2019.0\n", 124 | "msgpack==1.0.0\n", 125 | "multiprocess==0.70.11.1\n", 126 | "munch==2.5.0\n", 127 | "nbclient==0.5.1\n", 128 | "nbconvert==6.0.7\n", 129 | "nbformat==5.0.8\n", 130 | "nest-asyncio==1.4.3\n", 131 | "notebook==6.1.5\n", 132 | "numpy==1.19.4\n", 133 | "oauthlib==3.1.0\n", 134 | "packaging==20.4\n", 135 | "pandas==1.1.4\n", 136 | "pandocfilters==1.4.3\n", 137 | "parso==0.7.1\n", 138 | "pexpect==4.8.0\n", 139 | "pickleshare==0.7.5\n", 140 | "Pillow==8.0.1\n", 141 | "prometheus-client==0.9.0\n", 142 | "prompt-toolkit==3.0.8\n", 143 | "protobuf==3.14.0\n", 144 | "psutil==5.7.3\n", 145 | "ptyprocess==0.6.0\n", 146 | "py-cpuinfo==7.0.0\n", 147 | "pyarrow==2.0.0\n", 148 | "pyasn1==0.4.8\n", 149 | "pyasn1-modules==0.2.8\n", 150 | "pycparser==2.20\n", 151 | "pycrypto==2.6.1\n", 152 | "Pygments==2.7.2\n", 153 | "pygobject==3.26.1\n", 154 | "pymongo==3.11.1\n", 155 | "pyparsing==2.4.7\n", 156 | "pyrsistent==0.17.3\n", 157 | "python-dateutil==2.8.1\n", 158 | "pytz==2020.4\n", 159 | "pyxdg==0.25\n", 160 | "PyYAML==5.3.1\n", 161 | "pyzmq==20.0.0\n", 162 | "regex==2020.11.13\n", 163 | "requests==2.25.0\n", 164 | "requests-oauthlib==1.3.0\n", 165 | "rsa==4.6\n", 166 | "sacred==0.8.1\n", 167 | "sacremoses==0.0.43\n", 168 | "scikit-learn==0.23.2\n", 169 | "scipy==1.5.4\n", 170 | "seaborn==0.11.0\n", 171 | "SecretStorage==2.3.1\n", 172 | "Send2Trash==1.5.0\n", 173 | "sentencepiece==0.1.94\n", 174 | "six==1.11.0\n", 175 | "sklearn==0.0\n", 176 | "smmap==3.0.4\n", 177 | "sortedcontainers==2.3.0\n", 178 | "tblib==1.7.0\n", 179 | "tensorboard==2.4.0\n", 180 | "tensorboard-plugin-wit==1.7.0\n", 181 | "terminado==0.9.1\n", 182 | "testpath==0.4.4\n", 183 | "threadpoolctl==2.1.0\n", 184 | "tokenizers==0.9.2\n", 185 | "toolz==0.11.1\n", 186 | "torch==1.7.0\n", 187 | "torchsummary==1.5.1\n", 188 | "tornado==6.1\n", 189 | "tqdm==4.49.0\n", 190 | "traitlets==4.3.3\n", 191 | "transformers==3.4.0\n", 192 | "typing-extensions==3.7.4.3\n", 193 | "urllib3==1.26.2\n", 194 | "wcwidth==0.2.5\n", 195 | "webencodings==0.5.1\n", 196 | "Werkzeug==1.0.1\n", 197 | "wget==3.2\n", 198 | "widgetsnbextension==3.5.1\n", 199 | "wrapt==1.12.1\n", 200 | "xxhash==2.0.0\n", 201 | "zict==2.0.0\n", 202 | "zipp==3.4.0\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "!pip freeze\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "# define the mapping function\n", 224 | "def format_dataset(example):\n", 225 | " # the context might be comprised of multiple contexts => me merge them here\n", 226 | " example[\"context\"] = \" \".join((\"\\n\".join(example[\"entity_pages\"][\"wiki_context\"])).split(\"\\n\"))\n", 227 | " 
example[\"targets\"] = example[\"answer\"][\"aliases\"]\n",
228 |     "    example[\"norm_target\"] = example[\"answer\"][\"normalized_value\"]\n",
229 |     "    return example\n",
230 |     "\n",
231 |     "# map the dataset and throw out all unnecessary columns\n",
232 |     "validation_dataset = validation_dataset.map(format_dataset, remove_columns=[\"search_results\", \"question_source\", \"entity_pages\", \"answer\", \"question_id\"])"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": null,
238 |    "metadata": {},
239 |    "outputs": [],
240 |    "source": [
241 |     "validation_dataset[8]"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": null,
247 |    "metadata": {},
248 |    "outputs": [],
249 |    "source": [
250 |     "validation_dataset = validation_dataset.filter(lambda x: len(x[\"context\"]) > 0)\n",
251 |     "# check out how many samples are left\n",
252 |     "validation_dataset"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "print(\"\\n\\nLength for each example\")\n",
262 |     "print(30 * \"=\")\n",
263 |     "\n",
264 |     "# length for each example\n",
265 |     "validation_dataset.map(lambda x, i: print(f\"Id: {i} - Question Length: {len(x['question'])} - context Length: {len(x['context'])}\"), with_indices=True)\n",
266 |     "print(30 * \"=\")\n",
267 |     "\n",
268 |     "print(\"\\n\")\n",
269 |     "print(\"Num examples shorter than 4 * 4096 characters: \")\n",
270 |     "# keep only examples shorter than 4 * 4096 characters\n",
271 |     "short_validation_dataset = validation_dataset.filter(lambda x: (len(x['question']) + len(x['context'])) < 4 * 4096)\n",
272 |     "short_validation_dataset"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": null,
278 |    "metadata": {},
279 |    "outputs": [],
280 |    "source": [
281 |     "# EVAL"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": null,
287 |    "metadata": {},
288 |    "outputs": [],
289 |    "source": [
290 |     "from transformers import LongformerTokenizerFast, LongformerForQuestionAnswering\n",
291 |     "\n",
292 |     "tokenizer = LongformerTokenizerFast.from_pretrained(\"allenai/longformer-large-4096-finetuned-triviaqa\")\n",
293 |     "\n",
294 |     "# download the 1.7 GB pretrained model. It might take ~1min\n",
295 |     "model = LongformerForQuestionAnswering.from_pretrained(\"allenai/longformer-large-4096-finetuned-triviaqa\")\n",
296 |     "model.to(\"cuda\")\n",
297 |     "\n",
298 |     "def evaluate(example):\n",
299 |     "    def get_answer(question, context):\n",
300 |     "        # encode question and context so that they are separated by a tokenizer.sep_token and cut at max_length\n",
301 |     "        encoding = tokenizer.encode_plus(question, context, return_tensors=\"pt\", max_length=4096, truncation=True)\n",
302 |     "        input_ids = encoding[\"input_ids\"].to(\"cuda\")\n",
303 |     "        attention_mask = encoding[\"attention_mask\"].to(\"cuda\")\n",
304 |     "\n",
305 |     "        # the forward method will automatically set global attention on question tokens\n",
306 |     "        # The scores for the possible start token and end token of the answer are retrieved\n",
307 |     "        # wrap the function in torch.no_grad() to save memory\n",
308 |     "        with torch.no_grad():\n",
309 |     "            start_scores, end_scores = model(input_ids=input_ids, attention_mask=attention_mask)\n",
310 |     "\n",
311 |     "        # Let's take the most likely token using `argmax` and retrieve the answer\n",
312 |     "        all_tokens = tokenizer.convert_ids_to_tokens(encoding[\"input_ids\"][0].tolist())\n",
313 |     "        answer_tokens = all_tokens[torch.argmax(start_scores): torch.argmax(end_scores)+1]\n",
314 |     "        answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))[1:].replace('\"', '')  # strip the leading space token and unnecessary '\"'\n",
315 |     "\n",
316 |     "        return answer\n",
317 |     "\n",
318 |     "    # save the model's output here\n",
319 |     "    example[\"output\"] = get_answer(example[\"question\"], example[\"context\"])\n",
320 |     "\n",
321 |     "    # save if it's a match or not\n",
322 |     "    example[\"match\"] = (example[\"output\"] in example[\"targets\"]) or (example[\"output\"] == example[\"norm_target\"])\n",
323 |     "\n",
324 |     "    return example\n"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": null,
330 |    "metadata": {},
331 |    "outputs": [],
332 |    "source": [
333 |     "results_short = short_validation_dataset.map(evaluate)"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": null,
339 |    "metadata": {},
340 |    "outputs": [],
341 |    "source": [
342 |     "print(f\"\\nNum Correct examples: {sum(results_short['match'])}/{len(results_short)}\")\n",
343 |     "wrong_results = results_short.filter(lambda x: x['match'] is False)\n",
344 |     "print(f\"\\nWrong examples: \")\n",
345 |     "wrong_results.map(lambda x, i: print(f\"{i} - Output: {x['output']} - Target: {x['norm_target']}\"), with_indices=True)"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": null,
351 |    "metadata": {},
352 |    "outputs": [],
353 |    "source": [
354 |     "results = validation_dataset.map(evaluate)"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": null,
360 |    "metadata": {},
361 |    "outputs": [],
362 |    "source": [
363 |     "print(f\"Correct examples: {sum(results['match'])}/{len(results)}\")"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": null,
369 |    "metadata": {},
370 |    "outputs": [],
371 |    "source": []
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": null,
376 |    "metadata": {},
377 |    "outputs": [],
378 |    "source": []
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": null,
383 |    "metadata": {},
384 |    "outputs": [],
385 |    "source": []
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": null,
390 |    "metadata": {},
391 |    "outputs": [],
392 | "source": [] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "# TriviaQA json to SQUAD format dataloader" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 1, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "import json\n", 415 | "from pathlib import Path\n", 416 | "\n", 417 | "def read_squad_files(path: str):\n", 418 | " path = Path(path)\n", 419 | " with open(path, 'rb') as f:\n", 420 | " squad_dict = json.load(f)\n", 421 | " contexts = []\n", 422 | " questions = []\n", 423 | " answers = []\n", 424 | " for group in squad_dict['data']:\n", 425 | " for passage in group['paragraphs']:\n", 426 | " context = passage['context']\n", 427 | " for qa in passage['qas']:\n", 428 | " question = qa['question']\n", 429 | " for answer in qa['answers']:\n", 430 | " contexts.append(context)\n", 431 | " questions.append(question)\n", 432 | " answers.append(answer)\n", 433 | "\n", 434 | " return contexts, questions, answers\n", 435 | " \n", 436 | "\n", 437 | "train_contexts, train_questions, train_answers = read_squad_files('/workspace/data/trivia_squad/squad-wikipedia-train-4096.json')\n", 438 | "val_contexts, val_questions, val_answers = read_squad_files('/workspace/data/trivia_squad/squad-wikipedia-dev-4096.json')" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 2, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "## Add start and end tokens correctly" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 3, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "def add_end_idx(answers, contexts):\n", 457 | " for answer, context in zip(answers, contexts):\n", 458 | " gold_text = answer['text']\n", 459 | " start_idx = answer['answer_start']\n", 460 | " end_idx = start_idx + len(gold_text)\n", 461 | "\n", 462 | " # sometimes squad answers are off by a character or two – fix this\n", 463 | " if context[start_idx:end_idx].lower() == gold_text:\n", 464 | " answer['answer_end'] = end_idx\n", 465 | " elif context[start_idx-1:end_idx-1].lower() == gold_text:\n", 466 | " answer['answer_start'] = start_idx - 1\n", 467 | " answer['answer_end'] = end_idx - 1 # When the gold label is off by one character\n", 468 | " elif context[start_idx-2:end_idx-2].lower() == gold_text:\n", 469 | " answer['answer_start'] = start_idx - 2\n", 470 | " answer['answer_end'] = end_idx - 2 # When the gold label is off by two characters\n", 471 | "\n", 472 | "add_end_idx(train_answers, train_contexts)\n", 473 | "add_end_idx(val_answers, val_contexts)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 4, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "## Tokenize results" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "from transformers import RobertaTokenizerFast\n", 492 | "tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lowercase=True)\n", 493 | "\n", 494 | "train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": {}, 501 | "outputs": [], 502 | "source": [ 503 | "val_encodings = tokenizer(val_contexts, val_questions, 
truncation=True, padding=True)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "### convert start-end pos to token start/end pos" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "def add_token_positions(encodings, answers):\n", 522 | " start_positions = []\n", 523 | " end_positions = []\n", 524 | " for i in range(len(answers)):\n", 525 | " start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))\n", 526 | " end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))\n", 527 | " # if None, the answer passage has been truncated\n", 528 | " if start_positions[-1] is None:\n", 529 | " start_positions[-1] = tokenizer.model_max_length\n", 530 | " if end_positions[-1] is None:\n", 531 | " end_positions[-1] = tokenizer.model_max_length\n", 532 | " encodings.update({'start_positions': start_positions, 'end_positions': end_positions})\n", 533 | "\n", 534 | "add_token_positions(train_encodings, train_answers)\n", 535 | "add_token_positions(val_encodings, val_answers)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "### Dataloader" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "import torch\n", 554 | "from torch.utils.data import DataLoader, Dataset\n", 555 | "\n", 556 | "class SquadDataset(torch.utils.data.Dataset):\n", 557 | " def __init__(self, encodings):\n", 558 | " self.encodings = encodings\n", 559 | "\n", 560 | " def __getitem__(self, idx):\n", 561 | " return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n", 562 | "\n", 563 | " def __len__(self):\n", 564 | " return len(self.encodings.input_ids)\n", 565 | "\n", 566 | "train_dataset = SquadDataset(train_encodings)\n", 567 | "val_dataset = SquadDataset(val_encodings)\n", 568 | "\n" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "metadata": {}, 575 | "outputs": [], 576 | "source": [ 577 | "train_dataset = DataLoader(train_dataset, batch_size=16, shuffle=True)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [] 586 | } 587 | ], 588 | "metadata": { 589 | "kernelspec": { 590 | "display_name": "Python 3", 591 | "language": "python", 592 | "name": "python3" 593 | }, 594 | "language_info": { 595 | "codemirror_mode": { 596 | "name": "ipython", 597 | "version": 3 598 | }, 599 | "file_extension": ".py", 600 | "mimetype": "text/x-python", 601 | "name": "python", 602 | "nbconvert_exporter": "python", 603 | "pygments_lexer": "ipython3", 604 | "version": "3.6.9" 605 | } 606 | }, 607 | "nbformat": 4, 608 | "nbformat_minor": 4 609 | } 610 | -------------------------------------------------------------------------------- /scripts/run_long_lm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import datetime 5 | from dataclasses import dataclass, field 6 | import functools 7 | import logging 8 | import math 9 | import os 10 | import pickle 11 | import re 12 | import sys 13 | import time 14 | import threading 15 | from typing import Optional 16 | 17 | import torch 18 | from 
torch.utils.data.dataset import Dataset
19 | from torch.utils.tensorboard import SummaryWriter
20 | import tqdm
21 | from transformers import logging as hf_logging
22 | from transformers.modeling_longformer import LongformerSelfAttention
23 | from transformers import (
24 |     PreTrainedModel,
25 |     PreTrainedTokenizer,
26 |     AutoModelForMaskedLM,
27 |     RobertaForMaskedLM,
28 |     XLMRobertaForMaskedLM,
29 |     AutoTokenizer,
30 | )
31 | 
32 | from transformers import (
33 |     HfArgumentParser,
34 |     DataCollatorForLanguageModeling,
35 |     Trainer,
36 |     TrainingArguments,
37 |     set_seed,
38 | )
39 | 
40 | 
41 | class color:
42 |     """Help print colors to the terminal."""
43 |     PURPLE = "\033[95m"
44 |     CYAN = "\033[96m"
45 |     DARKCYAN = "\033[36m"
46 |     BLUE = "\033[94m"
47 |     GREEN = "\033[92m"
48 |     YELLOW = "\033[93m"
49 |     RED = "\033[91m"
50 |     BOLD = "\033[1m"
51 |     UNDERLINE = "\033[4m"
52 |     END = "\033[0m"
53 | 
54 | 
55 | def is_roberta_based_model(model_name: str) -> str:
56 |     """Validate if the model to pre-train is of a roberta architecture."""
57 | 
58 |     r = re.compile('(.*)roberta(.*)')
59 |     matches = r.findall(model_name)
60 |     base_name = 'none'
61 |     if len(matches) > 0:
62 |         base_name = '-'.join(model_name.split('-')[:-1])
63 | 
64 |     return base_name
65 | 
66 | 
67 | ##########################################
68 | #
69 | # Arguments
70 | #
71 | ##########################################
72 | 
73 | """Helper function: Define argparser and args."""
74 | parser = argparse.ArgumentParser()
75 | parser.add_argument(
76 |     "--model_name",
77 |     default=None,
78 |     type=str,
79 |     help="Name to save the model as.",
80 | )
81 | parser.add_argument(
82 |     "--output_dir",
83 |     default=None,
84 |     type=str,
85 |     help="The output directory for the trained model.",
86 | )
87 | parser.add_argument(
88 |     "--model_type",
89 |     default=None,
90 |     type=str,
91 |     help="Model type selected in the list from Huggingface ex:"
92 |     " `bert, roberta, xlm-roberta, ...`",
93 | )
94 | parser.add_argument(
95 |     "--model_name_or_path",
96 |     default=None,
97 |     type=str,
98 |     required=True,
99 |     help="Path to pretrained model from huggingface.co/models. "
100 |     "Only tested on `xlm-roberta-base` and `roberta-base`.",
101 | )
102 | parser.add_argument(
103 |     "--logging_dir",
104 |     default=None,
105 |     type=str,
106 |     help="Where logs are stored.",
107 | )
108 | parser.add_argument(
109 |     "--model_max_length",
110 |     default=4096,
111 |     type=int,
112 |     choices=[
113 |         512,
114 |         1024,
115 |         2048,
116 |         4096,
117 |         8192,
118 |         16384,
119 |         32768,
120 |         65536,
121 |         131072,
122 |         262144,
123 |         524288,
124 |         1048576,
125 |     ],
126 |     help="The maximum position of the model",
127 | )
128 | parser.add_argument(
129 |     "--attention_window",
130 |     default=512,
131 |     type=int,
132 |     help="Size of attention window",
133 | )
134 | parser.add_argument(
135 |     "--evaluation_strategy",
136 |     default="no",
137 |     type=str,
138 |     help="How evaluation should be logged, 'steps', 'epochs', 'no'.",
139 | )
140 | parser.add_argument(
141 |     "--do_train",
142 |     action="store_true",
143 |     help="Whether to run training."
144 | )
145 | parser.add_argument(
146 |     "--do_eval",
147 |     action="store_true",
148 |     help="Whether to run eval on the dev set."
149 | )
150 | parser.add_argument(
151 |     "--evaluate_during_training",
152 |     action="store_true",
153 |     help="Run evaluation during training at each logging step.",
154 | )
155 | parser.add_argument(
156 |     "--per_device_train_batch_size",
157 |     default=8,
158 |     type=int,
159 |     help="Batch size per GPU/CPU for training.",
160 | )
161 | parser.add_argument(
162 |     "--per_device_eval_batch_size",
163 |     default=8,
164 |     type=int,
165 |     help="Batch size per GPU/CPU for evaluation.",
166 | )
167 | parser.add_argument(
168 |     "--learning_rate",
169 |     default=5e-5,
170 |     type=float,
171 |     help="The initial learning rate for Adam.",
172 | )
173 | parser.add_argument(
174 |     "--gradient_accumulation_steps",
175 |     type=int,
176 |     default=1,
177 |     help="Number of steps to accumulate gradients over before updating the weights",
178 | )
179 | parser.add_argument(
180 |     "--weight_decay",
181 |     default=0.0,
182 |     type=float,
183 |     help="Weight decay if we apply some."
184 | )
185 | parser.add_argument(
186 |     "--adam_epsilon",
187 |     default=1e-8,
188 |     type=float,
189 |     help="Epsilon for Adam optimizer."
190 | )
191 | parser.add_argument(
192 |     "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
193 | )
194 | parser.add_argument(
195 |     "--num_train_epochs",
196 |     default=3.0,
197 |     type=float,
198 |     help="Total number of training epochs to perform.",
199 | )
200 | parser.add_argument(
201 |     "--max_steps",
202 |     default=-1,
203 |     type=int,
204 |     help="If > 0: set total number of training steps to perform. "
205 |     "Overrides num_train_epochs.",
206 | )
207 | parser.add_argument(
208 |     "--warmup_steps",
209 |     default=0,
210 |     type=int,
211 |     help="Linear warmup over warmup_steps."
212 | )
213 | parser.add_argument(
214 |     "--verbose_logging",
215 |     action="store_true",
216 |     help="If true, log all information when loading datasets.",
217 | )
218 | parser.add_argument(
219 |     "--cache_dir",
220 |     default=None,
221 |     help="Where do you want to store the pretrained models.",
222 | )
223 | parser.add_argument(
224 |     "--lang_id",
225 |     default=0,
226 |     type=int,
227 |     help="language id of input for language-specific xlm models "
228 |     "(see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)",
229 | )
230 | parser.add_argument(
231 |     "--logging_steps",
232 |     type=int,
233 |     default=500,
234 |     help="Log every X updates steps."
235 | )
236 | parser.add_argument(
237 |     "--save_steps",
238 |     type=int,
239 |     default=500,
240 |     help="Save checkpoint every X updates steps.",
241 | )
242 | parser.add_argument(
243 |     "--eval_all_checkpoints",
244 |     action="store_true",
245 |     help="Evaluate all checkpoints starting with the same prefix as model_name "
246 |     "and ending with the step number",
247 | )
248 | parser.add_argument(
249 |     "--overwrite_output_dir",
250 |     action="store_true",
251 |     help="Overwrite the content of the output directory",
252 | )
253 | parser.add_argument(
254 |     "--seed",
255 |     type=int,
256 |     default=42,
257 |     help="random seed for initialization"
258 | )
259 | parser.add_argument(
260 |     "--local_rank",
261 |     type=int,
262 |     default=-1,
263 |     help="local_rank for distributed training on gpus",
264 | )
265 | parser.add_argument(
266 |     "--fp16",
267 |     action="store_true",
268 |     help="Whether to use 16-bit (mixed) precision (through NVIDIA apex)",
269 | )
270 | parser.add_argument(
271 |     "--fp16_opt_level",
272 |     type=str,
273 |     default="O1",
274 |     help="For fp16: Apex AMP optimization level selected in "
275 |     "['O0', 'O1', 'O2', and 'O3'].",
276 | )
277 | parser.add_argument(
278 |     "--train_file_path",
279 |     type=str,
280 |     default="/workspace/data/wikitext-103/wiki.train.raw",
281 |     help="File path to language model training file",
282 | )
283 | parser.add_argument(
284 |     "--val_file_path",
285 |     type=str,
286 |     default="/workspace/data/wikitext-103/wiki.valid.raw",
287 |     help="File path to language model validation file",
288 | )
289 | parser.add_argument(
290 |     "--eval_steps",
291 |     type=int,
292 |     default=None,
293 |     help="Run an evaluation every X update steps.",
294 | )
295 | 
296 | args = parser.parse_args()
297 | 
298 | hf_logging.enable_default_handler()
299 | hf_logging.set_verbosity_info()
300 | hf_logging.enable_explicit_format()
301 | 
302 | tb_writer = SummaryWriter(log_dir=args.logging_dir)
303 | 
304 | logger = logging.getLogger("")
305 | logger.setLevel(logging.INFO)
306 | fh = logging.FileHandler(f"{args.logging_dir}.log")
307 | sh = logging.StreamHandler(sys.stdout)
308 | formatter = logging.Formatter(
309 |     "[%(asctime)s], %(levelname)s %(message)s",
310 |     datefmt="%a, %d %b %Y %H:%M:%S",
311 | )
312 | fh.setFormatter(formatter)
313 | sh.setFormatter(formatter)
314 | logger.addHandler(fh)
315 | logger.addHandler(sh)
316 | logger.info("\n --> Starting logger:\n" + "=" * 55 + "\n")
317 | 
318 | logger.warning(
319 |     f"Process rank: {args.local_rank}, \
320 |     distributed training: {bool(args.local_rank != -1)}, \
321 |     16-bits training: {args.fp16}"
322 | )
323 | 
324 | 
325 | ##########################################
326 | #
327 | # Replace Huggingface - TextDataset
328 | #
329 | ##########################################
330 | 
331 | # https://github.com/tqdm/tqdm/issues/458
332 | def provide_progress_bar(
333 |     function, estimated_time, tstep=0.2, tqdm_kwargs={}, args=[], kwargs={}
334 | ):
335 |     ret = [None]  # Mutable var so the function can store its return value
336 | 
337 |     def myrunner(function, ret, *args, **kwargs):
338 |         ret[0] = function(*args, **kwargs)
339 | 
340 |     thread = threading.Thread(
341 |         target=myrunner, args=(function, ret) + tuple(args), kwargs=kwargs
342 |     )
343 |     pbar = tqdm.tqdm(total=estimated_time, **tqdm_kwargs)
344 | 
345 |     thread.start()
346 |     while thread.is_alive():
347 |         thread.join(timeout=tstep)
348 |         pbar.update(tstep)
349 |     pbar.close()
350 |     return ret[0]
351 | 
352 | 
353 | def progress_wrapped(estimated_time, tstep=0.2, tqdm_kwargs={}):
354 |     def real_decorator(function):
355 |         @functools.wraps(function)
356 |         def wrapper(*args, **kwargs):
357 |             return provide_progress_bar(
358 |                 function,
359 |                 estimated_time=estimated_time,
360 |                 tstep=tstep,
361 |                 tqdm_kwargs=tqdm_kwargs,
362 |                 args=args,
363 |                 kwargs=kwargs,
364 |             )
365 | 
366 |         return wrapper
367 |     return real_decorator
368 | 
369 | 
370 | class TextDataset(Dataset):
371 |     # Ugly HACK on older transformers
372 |     # Use the same code as the Huggingface TextDataset
373 |     def __init__(
374 |         self,
375 |         tokenizer: PreTrainedTokenizer,
376 |         file_path: str,
377 |         block_size: int,
378 |         overwrite_cache=False,
379 |         cache_dir: Optional[str] = None,
380 |     ):
381 |         assert os.path.isfile(
382 |             file_path), f"Input file path {file_path} not found"
383 |         block_size = block_size - \
384 |             tokenizer.num_special_tokens_to_add(pair=False)
385 | 
386 |         directory, filename = os.path.split(file_path)
387 |         cached_features_file = os.path.join(
388 |             cache_dir if cache_dir is not None else directory,
389 |             "cached_lm_{}_{}_{}".format(
390 |                 tokenizer.__class__.__name__,
391 |                 str(block_size),
392 |                 filename,
393 |             ),
394 |         )
395 | 
396 |         # Make sure only the first process in distributed training processes the dataset,
397 |         # and the others will use the cache.
398 |         @progress_wrapped(estimated_time=200)
399 |         def tokenize_text(text):
400 |             return tokenizer.tokenize(text)
401 | 
402 |         @progress_wrapped(estimated_time=300)
403 |         def convert_tokens_to_ids(tokenized_text):
404 |             return tokenizer.convert_tokens_to_ids(tokenized_text)
405 | 
406 |         if os.path.exists(cached_features_file) and not overwrite_cache:
407 |             start = time.time()
408 |             with open(cached_features_file, "rb") as handle:
409 |                 self.examples = pickle.load(handle)
410 |             logger.info(
411 |                 f"Loading features from cached file {cached_features_file} [took %.3f s]",
412 |                 time.time() - start,
413 |             )
414 | 
415 |         else:
416 |             logger.info(
417 |                 f"Creating features from dataset file at {directory}\n\n")
418 | 
419 |             self.examples = []
420 |             with open(file_path, encoding="utf-8") as f:
421 |                 text = f.read()
422 | 
423 |             # For large texts and models, this could take a long time
424 |             # Done in two steps, since each part can take 5-10 min
425 |             start = time.time()
426 |             text = tokenize_text(text)
427 |             logger.info("Tokenizing text [took %.3f s]", time.time() - start)
428 |             start = time.time()
429 |             tokenized_text = convert_tokens_to_ids(text)
430 |             logger.info(
431 |                 "Converting text to ids [took %.3f s]\n", time.time() - start)
432 | 
433 |             start = time.time()
434 |             for i in range(
435 |                 0, len(tokenized_text) - block_size + 1, block_size
436 |             ):  # Truncate in blocks of block_size
437 |                 self.examples.append(
438 |                     tokenizer.build_inputs_with_special_tokens(
439 |                         tokenized_text[i: i + block_size]
440 |                     )
441 |                 )
442 |             logger.info(
443 |                 "Built tokenizer inputs of block_size length [took %.3f s]",
444 |                 time.time() - start,
445 |             )
446 | 
447 |             start = time.time()
448 |             with open(cached_features_file, "wb") as handle:
449 |                 pickle.dump(self.examples, handle,
450 |                             protocol=pickle.HIGHEST_PROTOCOL)
451 |             logger.info(
452 |                 "Saving features into cached file %s [took %.3f s]",
453 |                 cached_features_file,
454 |                 time.time() - start,
455 |             )
456 | 
457 |     def __len__(self):
458 |         return len(self.examples)
459 | 
460 |     def __getitem__(self, i) -> torch.Tensor:
461 |         return torch.tensor(self.examples[i], dtype=torch.long)
462 | 
463 | 
464 | ###########################################################
465 | #
466 | # Longformer conversion
467 | #
468 | ###########################################################
469 | 
470 | # TODO: Huggingface transformers v. >3.5.1 breaks this
471 | class LongModelSelfAttention(LongformerSelfAttention):
472 |     def forward(
473 |         self,
474 |         hidden_states,
475 |         attention_mask=None,
476 |         head_mask=None,
477 |         encoder_hidden_states=None,
478 |         encoder_attention_mask=None,
479 |         output_attentions=False,
480 |     ):
481 |         # Only hidden_states and attention_mask are forwarded on
482 | 
483 |         return super().forward(
484 |             hidden_states,
485 |             attention_mask=attention_mask,
486 |         )
487 | 
488 | 
489 | # Load initial model
490 | MODEL: PreTrainedModel
491 | 
492 | if is_roberta_based_model(args.model_name_or_path) == "xlm-roberta":
493 |     MODEL = XLMRobertaForMaskedLM
494 | elif is_roberta_based_model(args.model_name_or_path) == "roberta":
495 |     MODEL = RobertaForMaskedLM
496 | else:
497 |     raise NotImplementedError(
498 |         "Currently only supports roberta-based architectures.")
499 | 
500 | 
501 | class LongModelForMaskedLM(MODEL):
502 |     def __init__(self, config):
503 |         super().__init__(config)
504 |         print(f"\n{color.YELLOW}Converting models to Longformer is currently only tested for RoBERTa-like architectures.{color.END}")
505 |         for i, layer in enumerate(self.roberta.encoder.layer):
506 |             layer.attention.self = LongModelSelfAttention(config, layer_id=i)
507 | 
508 | 
509 | def create_long_model(
510 |     save_model_to,
511 |     model,
512 |     tokenizer,
513 |     attention_window,
514 |     model_max_length
515 | ):
516 | 
517 |     config = model.config
518 |     position_embeddings = model.roberta.embeddings.position_embeddings
519 | 
520 |     tokenizer.model_max_length = model_max_length
521 |     tokenizer.init_kwargs['model_max_length'] = model_max_length
522 |     current_model_max_length, embed_size = position_embeddings.weight.shape
523 | 
524 |     # NOTE: RoBERTa has positions 0,1 reserved
525 |     # embedding size is max position + 2
526 |     model_max_length += 2
527 |     config.max_position_embeddings = model_max_length
528 |     assert model_max_length > current_model_max_length, \
529 |         "New model max_length must be longer than current max_length"
530 | 
531 |     # BUG for XLM: need to initialize with zeros, since the base model is too large
532 |     new_pos_embed = position_embeddings.weight.new_zeros(
533 |         model_max_length, embed_size
534 |     )
535 | 
536 |     k = 2
537 |     step = current_model_max_length - 2
538 |     while k < model_max_length - 1:
539 |         new_pos_embed[k:(
540 |             k + step)] = position_embeddings.weight[2:]
541 |         k += step
542 | 
543 |     # HACK for Huggingface transformers >=3.4.0 and < 4.0
544 |     # https://github.com/huggingface/transformers/issues/6465#issuecomment-719042969
545 |     position_embeddings.weight.data = new_pos_embed
546 |     model.roberta.embeddings.position_embeddings.num_embeddings = len(
547 |         new_pos_embed.data
548 |     )
549 |     num_model_embeddings = position_embeddings.num_embeddings
550 |     model.roberta.embeddings.position_ids = torch.arange(
551 |         0, num_model_embeddings
552 |     )[None]
553 | 
554 |     # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
555 |     config.attention_window = [attention_window] * config.num_hidden_layers
556 |     for i, layer in enumerate(model.roberta.encoder.layer):
557 |         longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
558 |         longformer_self_attn.query = layer.attention.self.query
559 |         longformer_self_attn.key = layer.attention.self.key
560 |         longformer_self_attn.value = layer.attention.self.value
561 | 
562 |         longformer_self_attn.query_global = layer.attention.self.query
563 |         longformer_self_attn.key_global = layer.attention.self.key
564 |         
longformer_self_attn.value_global = layer.attention.self.value 565 | 566 | layer.attention.self = longformer_self_attn 567 | 568 | logger.info(f'saving model to {save_model_to}') 569 | model.save_pretrained(save_model_to) 570 | tokenizer.save_pretrained(save_model_to) 571 | return model, tokenizer 572 | 573 | 574 | def copy_proj_layers(model): 575 | for _, layer in enumerate(model.roberta.encoder.layer): 576 | layer.attention.self.query_global = layer.attention.self.query 577 | layer.attention.self.key_global = layer.attention.self.key 578 | layer.attention.self.value_global = layer.attention.self.value 579 | return model 580 | 581 | 582 | def pretrain_and_evaluate( 583 | training_args, data_args, model, tokenizer, eval_only, model_path 584 | ): 585 | val_dataset = TextDataset( 586 | tokenizer=tokenizer, 587 | file_path=data_args.val_file_path, 588 | block_size=tokenizer.max_len, 589 | ) 590 | if eval_only: 591 | train_dataset = val_dataset 592 | else: 593 | logger.info( 594 | f"Loading and tokenizing training data is usually slow: {data_args.train_file_path}" 595 | ) 596 | train_dataset = TextDataset( 597 | tokenizer=tokenizer, 598 | file_path=data_args.train_file_path, 599 | block_size=tokenizer.max_len, 600 | ) 601 | 602 | data_collator = DataCollatorForLanguageModeling( 603 | tokenizer=tokenizer, mlm=True, mlm_probability=0.15 604 | ) 605 | 606 | trainer = Trainer( 607 | model=model, 608 | args=training_args, 609 | data_collator=data_collator, 610 | train_dataset=train_dataset, 611 | eval_dataset=val_dataset, 612 | prediction_loss_only=True, 613 | ) 614 | 615 | eval_loss = trainer.evaluate() 616 | eval_loss = eval_loss["eval_loss"] 617 | print(f"Initial eval bpc: {color.GREEN}{eval_loss/math.log(2)}{color.END}") 618 | logger.info(f"Initial eval bpc: {eval_loss/math.log(2)}") 619 | 620 | if not eval_only: 621 | trainer.train(model_path=model_path) 622 | trainer.save_model() 623 | 624 | eval_loss = trainer.evaluate() 625 | eval_loss = eval_loss["eval_loss"] 626 | print( 627 | f"Eval bpc after pretraining: \ 628 | {color.GREEN}{eval_loss/math.log(2)}{color.END}" 629 | ) 630 | logger.info(f"Eval bpc after pretraining: {eval_loss/math.log(2)}") 631 | 632 | 633 | @dataclass 634 | class ModelArguments: 635 | """Huggingface parameters for the model training.""" 636 | 637 | model_name_or_path: str = field( 638 | default=None, 639 | metadata={ 640 | "help": "Name of pretrained model to load for model and tokenizer" 641 | ", based on huggingface.co/models, ex 'roberta-base'" 642 | }, 643 | ) 644 | model_name: str = field( 645 | default="roberta-base-long-4096-lm", 646 | metadata={"help": "Name to use when saving model."}, 647 | ) 648 | attention_window: int = field( 649 | default=512, 650 | metadata={"help": "Size of attention window"} 651 | ) 652 | model_max_length: int = field( 653 | default=4096, 654 | metadata={"help": "Maximum position"} 655 | ) 656 | cache_dir: Optional[str] = field( 657 | default=None, 658 | metadata={ 659 | "help": "Where do you want to store the pretrained models." 
660 |     },
661 | )
662 | 
663 | 
664 | @dataclass
665 | class DataTrainingArguments:
666 |     """Training and validation data arguments."""
667 | 
668 |     val_file_path: str = field(
669 |         default="/workspace/data/wikitext-103-raw/wiki.valid.raw",
670 |         metadata={"help": "File for evaluating a Language Model"},
671 |     )
672 |     train_file_path: str = field(
673 |         default="/workspace/data/wikitext-103-raw/wiki.train.raw",
674 |         metadata={"help": "File for training a Language Model"},
675 |     )
676 | 
677 | 
678 | def main():
679 |     ############################################
680 |     #
681 |     # Define model params
682 |     #
683 |     ############################################
684 | 
685 |     parser = HfArgumentParser(
686 |         (ModelArguments, DataTrainingArguments, TrainingArguments)
687 |     )
688 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
689 | 
690 |     set_seed(training_args.seed)
691 | 
692 |     if (
693 |         os.path.exists(training_args.output_dir)
694 |         and os.listdir(training_args.output_dir)
695 |         and training_args.do_train
696 |         and not training_args.overwrite_output_dir
697 |     ):
698 |         raise ValueError(
699 |             f"Output directory ({training_args.output_dir}) \
700 |             already exists and is not empty.\
701 |             Use --overwrite_output_dir to overcome."
702 |         )
703 | 
704 |     ###########################################
705 |     #
706 |     # RUN
707 |     #
708 |     ###########################################
709 | 
710 |     start = time.time()
711 |     print("---------------------------------------------------------")
712 |     print(
713 |         f"\nLoading from Huggingface pretrained model: \
714 |         `{color.BOLD}{color.GREEN}\
715 |         {model_args.model_name_or_path}\
716 |         {color.END}{color.END}` \
717 |         with name: {model_args.model_name}\n"
718 |     )
719 | 
720 |     model = AutoModelForMaskedLM.from_pretrained(
721 |         model_args.model_name_or_path,
722 |         cache_dir=model_args.cache_dir,
723 |     )
724 |     tokenizer = AutoTokenizer.from_pretrained(
725 |         model_args.model_name_or_path,
726 |         model_max_length=model_args.model_max_length,
727 |         cache_dir=model_args.cache_dir,
728 |         use_fast=True,
729 |     )
730 | 
731 |     print(f"{color.RED}Creating Longformer model{color.END}")
732 |     model_path = training_args.output_dir
733 |     if not os.path.exists(model_path):
734 |         os.makedirs(model_path)
735 | 
736 |     logger.info(
737 |         f"Converting {model_args.model_name_or_path} \
738 |         into {model_args.model_name}"
739 |     )
740 |     model, tokenizer = create_long_model(
741 |         save_model_to=model_path,
742 |         model=model,
743 |         tokenizer=tokenizer,
744 |         attention_window=model_args.attention_window,
745 |         model_max_length=model_args.model_max_length,
746 |     )
747 | 
748 |     print(f"{color.RED}Loading Model{color.END}")
749 |     logger.info(f"Loading the model from {model_path}")
750 |     model = LongModelForMaskedLM.from_pretrained(model_path)
751 |     tokenizer = AutoTokenizer.from_pretrained(
752 |         model_path,
753 |         model_max_length=model_args.model_max_length,
754 |         use_fast=True
755 |     )
756 | 
757 |     print(f"{color.RED}Evaluate{color.END}")
758 |     logger.info(
759 |         f"Pretraining \
760 |         {model_args.model_name_or_path}-{model_args.model_max_length}... "
761 |     )
762 |     pretrain_and_evaluate(
763 |         training_args,
764 |         data_args,
765 |         model,
766 |         tokenizer,
767 |         eval_only=False,
768 |         model_path=training_args.output_dir,
769 |     )
770 | 
771 |     print(
772 |         f"{color.PURPLE}TIME elapsed{color.END}: {datetime.timedelta(seconds=int(time.time() - start))}"
773 |     )
774 | 
775 |     logger.info(
776 |         "Copying local projection layers into global projection layers..."
777 |     )
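778 |     # Longformer initializes its global attention projections from the
779 |     # trained local ones, so the pre-trained local weights are copied over
780 |     # before the final save (see `copy_proj_layers` above).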
778 |     model = copy_proj_layers(model)
779 |     logger.info(f"Saving model to {model_path}")
780 |     model.save_pretrained(model_path)
781 | 
782 |     print(f"{color.RED}Done. Loading the saved model{color.END}")
783 | 
784 |     logger.info(f"Loading the model from {model_path}")
785 |     model = LongModelForMaskedLM.from_pretrained(model_path)
786 |     tokenizer = AutoTokenizer.from_pretrained(model_path)
787 | 
788 | 
789 | if __name__ == "__main__":
790 |     main()
791 | 
--------------------------------------------------------------------------------
/scripts/finetune_qa_models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | from __future__ import print_function
4 | import argparse
5 | from collections import Counter
6 | from dataclasses import dataclass, field
7 | import logging
8 | import os
9 | import re
10 | import string
11 | import sys
12 | from typing import Any, Dict, List, Optional, Union
13 | 
14 | import datasets
15 | import torch
16 | from torch.utils.tensorboard import SummaryWriter
17 | from tqdm.auto import tqdm
18 | from transformers import logging as hf_logging
19 | from transformers import (
20 |     AutoModelForQuestionAnswering,
21 |     AutoTokenizer,
22 |     DataCollator,
23 |     HfArgumentParser,
24 |     PreTrainedModel,
25 |     PreTrainedTokenizer,
26 |     set_seed,
27 |     Trainer,
28 |     TrainingArguments,
29 | )
30 | 
31 | 
32 | # helper
33 | class color:
34 |     PURPLE = "\033[95m"
35 |     CYAN = "\033[96m"
36 |     DARKCYAN = "\033[36m"
37 |     BLUE = "\033[94m"
38 |     GREEN = "\033[92m"
39 |     YELLOW = "\033[93m"
40 |     RED = "\033[91m"
41 |     BOLD = "\033[1m"
42 |     UNDERLINE = "\033[4m"
43 |     END = "\033[0m"
44 | 
45 | 
46 | @dataclass
47 | class QADataset:
48 |     """Collection of the languages to load as HF datasets
49 | 
50 |     args:
51 |     - langs: the languages (dataset configs) to load
52 |     - text_on_eval: the print statements shown when evaluating each dataset
53 |     - data: the tokenized datasets
54 |     """
55 |     langs: List[str]
56 |     text_on_eval: List[str]
57 |     data: Optional[List[Any]] = None
58 | 
59 | 
60 | SQUAD = QADataset(
61 |     ["squad"],
62 |     [
63 |         "SQuAD 1.1 validation dataset"
64 |     ]
65 | )
66 | 
67 | 
68 | # base xquad
69 | XQUAD = QADataset(
70 |     ["ar", "de", "el", "en", "es", "hi", "ru", "th", "tr", "vi", "zh"],
71 |     [
72 |         "XQuAD Arabic validation",
73 |         "XQuAD German validation",
74 |         "XQuAD Greek validation",
75 |         "XQuAD English validation",
76 |         "XQuAD Spanish validation",
77 |         "XQuAD Hindi validation",
78 |         "XQuAD Russian validation",
79 |         "XQuAD Thai validation",
80 |         "XQuAD Turkish validation",
81 |         "XQuAD Vietnamese validation",
82 |         "XQuAD Chinese validation",
83 |     ]
84 | )
85 | 
86 | # base mlqa
87 | MLQA = QADataset(
88 |     ["ar", "de", "en", "es", "hi", "vi", "zh"],
89 |     [
90 |         # NOTE: entries must align one-to-one with `langs` above
91 |         "MLQA Arabic validation",
92 |         "MLQA German validation",
93 |         "MLQA English validation",
94 |         "MLQA Spanish validation",
95 |         "MLQA Hindi validation",
96 |         "MLQA Vietnamese validation",
97 |         "MLQA Chinese validation",
98 |     ]
99 | )
100 | 
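# For reference, `langs` above is mapped to Hugging Face dataset configs in
# load_datasets() further down: xquad uses configs such as "xquad.ar", while
# mlqa doubles the language code, e.g. "mlqa.ar.ar". A minimal sketch of what
# that resolves to (split names follow the datasets library's defaults):
#
#   import datasets
#   xquad_ar = datasets.load_dataset("xquad", "xquad.ar", split="validation")
#   mlqa_ar = datasets.load_dataset("mlqa", "mlqa.ar.ar", split="validation")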
101 | 
102 | def check_positive_concats(nr_concats):
103 |     """Helper function for argparse.
104 |     Validates how many contexts to concatenate together.
105 |     Default for longer contexts is three.
106 |     More can be used, but then it requires larger GPUs.
107 | 
108 |     *NOTE* this is only used when using the datasets:
109 |     - squad_long or
110 |     - xquad_long
111 |     """
112 |     try:
113 |         nr_concats_int = int(nr_concats)
114 |     except ValueError:
115 |         raise argparse.ArgumentTypeError(
116 |             f"--nr_concats expects a positive int as a value, \
117 |             not {nr_concats}"
118 |         )
119 |     if nr_concats_int <= 0:
120 |         raise argparse.ArgumentTypeError(
121 |             f"--nr_concats expects a positive int as a value, \
122 |             not {nr_concats}"
123 |         )
124 |     return nr_concats_int
125 | 
126 | 
127 | parser = argparse.ArgumentParser()
128 | parser.add_argument(
129 |     "--nr_concats",
130 |     default=3,
131 |     type=check_positive_concats,
132 |     help="How many contexts to concatenate when using a `long` QA dataset.\n"
133 |     "3 is default and yields an average context length of 2048 tokens",
134 | )
135 | parser.add_argument(
136 |     "--model_name",
137 |     default=None,
138 |     type=str,
139 |     help="Name to save the model as.",
140 | )
141 | parser.add_argument(
142 |     "--output_dir",
143 |     default=None,
144 |     type=str,
145 |     help="The output directory for the model checkpoints and predictions.",
146 | )
147 | parser.add_argument(
148 |     "--model_type",
149 |     default=None,
150 |     type=str,
151 |     help="Model type selected from Huggingface ex: `roberta, xlm-roberta`",
152 | )
153 | parser.add_argument(
154 |     "--model_name_or_path",
155 |     default=None,
156 |     type=str,
157 |     required=True,
158 |     help="Path to pretrained model from huggingface.co/models. \n"
159 |     "Only tested on `xlm-roberta-base` and `roberta-base`.",
160 | )
161 | parser.add_argument(
162 |     "--datasets",
163 |     default=None,
164 |     type=str,
165 |     required=True,
166 |     help="Define one of Huggingface Datasets Question Answering Tasks.",
167 | )
168 | parser.add_argument(
169 |     "--train_file_path",
170 |     default=None,
171 |     type=str,
172 |     help="File path to where torch training file is stored (.pt files).",
173 | )
174 | parser.add_argument(
175 |     "--valid_file_path",
176 |     default=None,
177 |     type=str,
178 |     help="File path to where torch validation file is stored (.pt files).",
179 | )
180 | parser.add_argument(
181 |     "--data_dir",
182 |     default=None,
183 |     type=str,
184 |     help="Directory to where to store training and validation torch files.",
185 | )
186 | parser.add_argument(
187 |     "--logging_dir",
188 |     default=None,
189 |     type=str,
190 |     help="The output directory where the logs are stored.",
191 | )
192 | parser.add_argument(
193 |     "--max_length",
194 |     default=512,
195 |     type=int,
196 |     choices=[
197 |         512,
198 |         1024,
199 |         2048,
200 |         4096,
201 |     ],
202 |     help="The maximum position (sequence length) of the model",
203 | )
204 | parser.add_argument(
205 |     "--attention_window",
206 |     default=512,
207 |     type=int,
208 |     help="Size of attention window",
209 | )
210 | parser.add_argument(
211 |     "--do_train",
212 |     action="store_true",
213 |     help="Whether to run training."
214 | )
215 | parser.add_argument(
216 |     "--do_eval",
217 |     action="store_true",
218 |     help="Whether to run eval on the dev set."
219 | )
220 | parser.add_argument(
221 |     "--evaluate_during_training",
222 |     action="store_true",
223 |     help="Run evaluation during training at each logging step.",
224 | )
225 | parser.add_argument(
226 |     "--per_device_train_batch_size",
227 |     default=8,
228 |     type=int,
229 |     help="Batch size per GPU/CPU for training.",
230 | )
231 | parser.add_argument(
232 |     "--per_device_eval_batch_size",
233 |     default=8,
234 |     type=int,
235 |     help="Batch size per GPU/CPU for evaluation.",
236 | )
237 | parser.add_argument(
238 |     "--learning_rate",
239 |     default=5e-5,
240 |     type=float,
241 |     help="The initial learning rate for Adam.",
242 | )
243 | parser.add_argument(
244 |     "--gradient_accumulation_steps",
245 |     type=int,
246 |     default=1,
247 |     help="Number of updates to accumulate the gradient for before updating.",
248 | )
249 | parser.add_argument(
250 |     "--weight_decay",
251 |     default=0.0,
252 |     type=float,
253 |     help="Weight decay if we apply some."
254 | )
255 | parser.add_argument(
256 |     "--adam_epsilon",
257 |     default=1e-8,
258 |     type=float,
259 |     help="Epsilon for Adam optimizer."
260 | )
261 | parser.add_argument(
262 |     "--max_grad_norm",
263 |     default=1.0,
264 |     type=float,
265 |     help="Max gradient norm."
266 | )
267 | parser.add_argument(
268 |     "--num_train_epochs",
269 |     default=3.0,
270 |     type=float,
271 |     help="Total number of training epochs to perform.",
272 | )
273 | parser.add_argument(
274 |     "--max_steps",
275 |     default=-1,
276 |     type=int,
277 |     help="If > 0: set total number of training steps to perform."
278 |     " Overrides num_train_epochs.",
279 | )
280 | parser.add_argument(
281 |     "--warmup_steps",
282 |     default=0,
283 |     type=int,
284 |     help="Linear warmup over warmup_steps."
285 | )
286 | parser.add_argument(
287 |     "--verbose_logging",
288 |     action="store_true",
289 |     help="If true, display all logging messages from huggingface libraries. "
290 |     "A number of warnings are expected for a normal SQuAD evaluation.",
291 | )
292 | parser.add_argument(
293 |     "--lang_id",
294 |     default=0,
295 |     type=int,
296 |     help="language id of input for language-specific xlm models.",
297 | )
298 | parser.add_argument(
299 |     "--logging_steps", type=int, default=500, help="Log every X update steps."
300 | )
301 | parser.add_argument(
302 |     "--save_steps",
303 |     type=int,
304 |     default=500,
305 |     help="Save checkpoint every X update steps.",
306 | )
307 | parser.add_argument(
308 |     "--eval_all_checkpoints",
309 |     action="store_true",
310 |     help="Evaluate all checkpoints starting with the same prefix as model_name",
311 | )
312 | parser.add_argument(
313 |     "--overwrite_output_dir",
314 |     action="store_true",
315 |     help="Overwrite the content of the output directory",
316 | )
317 | parser.add_argument(
318 |     "--seed", type=int, default=42, help="random seed for initialization"
319 | )
320 | parser.add_argument(
321 |     "--local_rank",
322 |     type=int,
323 |     default=-1,
324 |     help="local_rank for distributed training on gpus",
325 | )
326 | parser.add_argument(
327 |     "--fp16",
328 |     action="store_true",
329 |     help="Whether to use 16-bit (mixed) precision (through NVIDIA apex).",
330 | )
331 | parser.add_argument(
332 |     "--fp16_opt_level",
333 |     type=str,
334 |     default="O1",
335 |     help="For fp16: Apex AMP optimization level selected in "
336 |     "['O0', 'O1', 'O2', and 'O3']."
337 | )
338 | parser.add_argument(
339 |     "--prediction_loss_only",
340 |     action="store_true",
341 |     help="If only the prediction loss should be returned",
342 | )
343 | parser.add_argument(
344 |     "--eval_steps",
345 |     type=int,
346 |     default=500,
347 |     help="Run evaluation every X update steps.",
348 | )
349 | parser.add_argument(
350 |     "--do_lowercase",
351 |     action="store_true",
352 |     help="If input should be lowercased when tokenizing",
353 | )
354 | parser.add_argument(
355 |     "--cache_dir", default=None, type=str,
356 |     help="Where to cache the pretrained models downloaded from the hub.")
357 | args = parser.parse_args()
358 | hf_logging.enable_default_handler()
359 | hf_logging.set_verbosity_info()
360 | hf_logging.enable_explicit_format()
361 | 
362 | # Setup logging
363 | tb_writer = SummaryWriter(log_dir=args.logging_dir)
364 | 
365 | logger = logging.getLogger("")
366 | logger.setLevel(logging.INFO)
367 | 
368 | fh = logging.FileHandler(f"{args.logging_dir}.log")
369 | sh = logging.StreamHandler(sys.stdout)
370 | formatter = logging.Formatter(
371 |     "[%(asctime)s], %(levelname)s %(message)s",
372 |     datefmt="%a, %d %b %Y %H:%M:%S",
373 | )
374 | fh.setFormatter(formatter)
375 | sh.setFormatter(formatter)
376 | logger.addHandler(fh)
377 | logger.addHandler(sh)
378 | logger.info("\n --> Starting logger:\n" + "=" * 55 + "\n")
379 | 
380 | logger.warning(
381 |     f"Process rank: {args.local_rank}, \
382 |     distributed training: {bool(args.local_rank != -1)}, \
383 |     16-bits training: {args.fp16}"
384 | )
385 | 
386 | 
387 | logger.info("=" * 50)
388 | logger.info("=" + "\t" * 6 + " =")
389 | logger.info("=" + "\tInitialization" + "\t" * 4 + " =")
390 | logger.info("=" + "\t" * 6 + " =")
391 | logger.info("=" * 50 + "\n\n")
392 | 
393 | 
394 | tokenizer = AutoTokenizer.from_pretrained(
395 |     args.model_name_or_path,
396 |     cache_dir=args.cache_dir,
397 |     do_lowercase=args.do_lowercase,
398 |     pad_to_max_length=True,
399 |     max_length=args.max_length,
400 |     truncation=True,
401 |     use_fast=True,
402 | )
403 | model = AutoModelForQuestionAnswering.from_pretrained(
404 |     args.model_name_or_path,
405 |     cache_dir=args.cache_dir,
406 | )
407 | 
408 | 
409 | #########################################
410 | #                                       #
411 | #       SQuADs Evaluation metrics       #
412 | #                                       #
413 | #########################################
414 | 
415 | def normalize_answer(s: str) -> str:
416 |     """Lower text and remove punctuation, articles and extra whitespace."""
417 | 
418 |     def remove_articles(text):
419 |         return re.sub(r"\b(a|an|the)\b", " ", text)
420 | 
421 |     def white_space_fix(text):
422 |         return " ".join(text.split())
423 | 
424 |     def remove_punc(text):
425 |         exclude = set(string.punctuation)
426 |         return "".join(ch for ch in text if ch not in exclude)
427 | 
428 |     def lower(text):
429 |         return text.lower()
430 | 
431 |     return white_space_fix(remove_articles(remove_punc(lower(s))))
432 | 
433 | 
434 | def f1_score(prediction, ground_truth):
435 |     prediction_tokens = normalize_answer(prediction).split()
436 |     ground_truth_tokens = normalize_answer(ground_truth).split()
437 |     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
438 |     num_same = sum(common.values())
439 |     if num_same == 0:
440 |         return 0
441 |     precision = 1.0 * num_same / len(prediction_tokens)
442 |     recall = 1.0 * num_same / len(ground_truth_tokens)
443 |     f1 = (2 * precision * recall) / (precision + recall)
444 |     return f1
445 | 
446 | 
447 | def exact_match_score(prediction: str, ground_truth: str) -> bool:
448 |     return normalize_answer(prediction) == normalize_answer(ground_truth)
449 | 
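# Hand-worked examples of the normalization and scoring above (illustrative,
# not part of the original test suite):
#
#   normalize_answer("The  Cat's!")              # -> "cats"
#   exact_match_score("the cat", "The cat!")     # -> True
#   f1_score("a cat sat", "the cat sat")         # -> 1.0 (articles dropped)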
450 | 
451 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
452 |     scores_for_ground_truths = []
453 |     for ground_truth in ground_truths:
454 |         score = metric_fn(prediction, ground_truth)
455 |         scores_for_ground_truths.append(score)
456 |     return max(scores_for_ground_truths)
457 | 
458 | 
459 | def evaluate(
460 |     gold_answers: List[List[str]],
461 |     predictions: List[str]
462 | ) -> Dict[str, float]:
463 | 
464 |     f1 = exact_match = total = 0
465 | 
466 |     for ground_truths, prediction in zip(gold_answers, predictions):
467 |         total += 1
468 |         exact_match += metric_max_over_ground_truths(
469 |             exact_match_score, prediction, ground_truths
470 |         )
471 |         f1 += metric_max_over_ground_truths(f1_score,
472 |                                             prediction, ground_truths)
473 | 
474 |     exact_match = 100.0 * exact_match / total
475 |     f1 = 100.0 * f1 / total
476 | 
477 |     return {"exact_match": exact_match, "f1": f1}
478 | 
479 | 
480 | ####################################################
481 | #
482 | # Evaluation
483 | #
484 | ####################################################
485 | 
486 | 
487 | def get_squad_evaluation(
488 |     valid_dataset: datasets.Dataset,
489 |     model: PreTrainedModel,
490 |     tokenizer: PreTrainedTokenizer,
491 |     dataset_name: str,
492 |     batch_size: int
493 | ) -> None:
494 |     """
495 |     Makes predictions with the trained model and evaluates them.
496 |     The evaluation is based on the SQuAD evaluation metric;
497 |     valid_dataset is expected to be converted to torch tensor format.
498 |     """
499 | 
500 |     logging.info(f"Generating predictions for dataset '{dataset_name}'")
501 |     dataloader = torch.utils.data.DataLoader(
502 |         valid_dataset, batch_size=batch_size)
503 | 
504 |     # predictions
505 |     predicted_answers = []
506 |     with torch.no_grad():
507 |         for batch in tqdm(dataloader):
508 |             start_scores, end_scores = model(
509 |                 input_ids=batch["input_ids"].cuda(),
510 |                 attention_mask=batch["attention_mask"].cuda(),
511 |             )
512 |             for i in range(start_scores.shape[0]):
513 |                 all_tokens = tokenizer.convert_ids_to_tokens(
514 |                     batch["input_ids"][i])
515 |                 answer = " ".join(
516 |                     all_tokens[
517 |                         torch.argmax(start_scores[i]):
518 |                         torch.argmax(end_scores[i]) + 1
519 |                     ]
520 |                 )
521 |                 ans_ids = tokenizer.convert_tokens_to_ids(answer.split())
522 |                 answer = tokenizer.decode(ans_ids)
523 |                 predicted_answers.append(answer)
524 | 
525 |     valid_dataset.reset_format()
526 |     predictions = []
527 |     references = []
528 |     for ref, pred_answer in zip(valid_dataset, predicted_answers):
529 |         actual_answer = ref["answers"]["text"]
530 |         predictions.append(pred_answer)
531 |         references.append(actual_answer)
532 | 
533 |     eval_score = evaluate(references, predictions)
534 |     logging.info(f"Results from prediction:\n{eval_score}\n" + "=" * 55 + "\n")
535 | 
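# Note on the decoding in get_squad_evaluation: start and end are picked with
# independent argmaxes, so if argmax(end_scores) < argmax(start_scores) the
# token slice is empty and the predicted answer becomes "". A joint search
# over valid (start <= end) pairs would be stricter; the simple variant is
# kept here.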
536 | 
537 | #########################################
538 | #                                       #
539 | # Convert train and validation datasets #
540 | #                                       #
541 | #########################################
542 | 
543 | def get_correct_alignement(context: str, answer):
544 |     """Some original examples in SQuAD have indices off by one or two characters.
545 |     """
546 |     gold_text = answer["text"][0]
547 |     start_idx = answer["answer_start"][0]
548 |     end_idx = start_idx + len(gold_text)
549 |     if context[start_idx:end_idx] == gold_text:
550 |         return start_idx, end_idx
551 |     elif context[start_idx - 1: end_idx - 1] == gold_text:
552 |         return start_idx - 1, end_idx - 1
553 |     elif context[start_idx - 2: end_idx - 2] == gold_text:
554 |         return start_idx - 2, end_idx - 2
555 |     else:
556 |         raise ValueError("Could not align the answer span with the context.")
557 | 
558 | 
559 | MAX_CONTEXT_LENGTH = 0
560 | 
561 | 
562 | def convert_to_features(example):
563 | 
564 |     encodings = tokenizer.encode_plus(
565 |         example["question"],
566 |         example["context"],
567 |         pad_to_max_length=True,
568 |         max_length=args.max_length,
569 |         truncation=True,
570 |     )
571 |     context_encodings = tokenizer.encode_plus(example["context"])
572 | 
573 |     start_idx, end_idx = get_correct_alignement(
574 |         example["context"], example["answers"])
575 |     start_positions_context = context_encodings.char_to_token(start_idx)
576 |     end_positions_context = context_encodings.char_to_token(end_idx - 1)
577 | 
578 |     # FIXME: UGLY HACK because of XLM-R tokenization, works fine with roberta
579 |     sep_idx = encodings["input_ids"].index(tokenizer.sep_token_id)
580 |     try:
581 |         start_positions = start_positions_context + sep_idx + 1
582 |         end_positions = end_positions_context + sep_idx + 1
583 | 
584 |         # if end_positions > 4096:
585 |         #     start_positions, end_positions = None, None
586 |     except TypeError:
587 |         start_positions = None
588 |         end_positions = None
589 | 
590 |     encodings.update(
591 |         {
592 |             "start_positions": start_positions,
593 |             "end_positions": end_positions,
594 |             "attention_mask": encodings["attention_mask"],
595 |         }
596 |     )
597 |     return encodings
598 | 
599 | 
600 | def convert_dataset_to_torch_format(data):
601 |     data = data.map(convert_to_features).filter(
602 |         lambda example: (example["start_positions"] is not None)
603 |         and (example["end_positions"] is not None)
604 |     )
605 | 
606 |     # set the tensor type and the columns which the dataset should return
607 |     columns = ["input_ids", "attention_mask",
608 |                "start_positions", "end_positions"]
609 |     data.set_format(type="torch", columns=columns)
610 |     print(max(data["start_positions"]))
611 |     print(data.shape)
612 |     return data
613 | 
614 | 
615 | ##################
616 | #
617 | # Training
618 | #
619 | ##################
620 | 
621 | 
622 | class DummyDataCollator:
623 |     def __call__(self, batch):
624 | 
625 |         input_ids = torch.stack([example["input_ids"] for example in batch])
626 |         attention_mask = torch.stack(
627 |             [example["attention_mask"] for example in batch])
628 |         start_positions = torch.stack(
629 |             [example["start_positions"] for example in batch])
630 |         end_positions = torch.stack(
631 |             [example["end_positions"] for example in batch])
632 | 
633 |         return {
634 |             "input_ids": input_ids,
635 |             "start_positions": start_positions,
636 |             "end_positions": end_positions,
637 |             "attention_mask": attention_mask,
638 |         }
639 | 
640 | 
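# How the pieces above fit together (illustrative): convert_to_features()
# locates the answer in the standalone context encoding; if it starts at
# context token t and the first </s> of the joint "question </s></s> context"
# encoding sits at index sep_idx, the start position becomes t + sep_idx + 1.
# DummyDataCollator() then just stacks the per-example tensors, e.g.:
#
#   collate = DummyDataCollator()
#   batch = collate([train_dataset[0], train_dataset[1]])
#   batch["input_ids"].shape  # torch.Size([2, args.max_length])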
"If tokenizer should make all to lowercase."}, 664 | ) 665 | max_seq_length: Optional[int] = field( 666 | default=384, 667 | metadata={"help": "TODO"}, 668 | ) 669 | doc_stride: Optional[int] = field( 670 | default=128, 671 | metadata={"help": "TODO"}, 672 | ) 673 | model_type: Optional[str] = field( 674 | default=None, 675 | metadata={"help": "TODO"}, 676 | ) 677 | 678 | 679 | @dataclass 680 | class DataTrainingArguments: 681 | 682 | datasets: str = field(metadata={"help": "The dataset name to load."}) 683 | data_dir: Optional[str] = field( 684 | default=None, 685 | metadata={ 686 | "help": "Path to the dataset containing train and eval datasets."}, 687 | ) 688 | train_file_path: Optional[str] = field( 689 | default="train_data.pt", 690 | metadata={"help": "Path for cached train dataset"}, 691 | ) 692 | valid_file_path: Optional[str] = field( 693 | default="valid_data.pt", 694 | metadata={"help": "Path for cached valid dataset"}, 695 | ) 696 | max_length: Optional[int] = field( 697 | default=512, 698 | metadata={"help": "Max input length for the source text"}, 699 | ) 700 | nr_concats: Optional[int] = field( 701 | default=3, 702 | metadata={"help": "Number of contexts to concatinate"}, 703 | ) 704 | 705 | 706 | def load_datasets( 707 | languages: QADataset, 708 | base_dataset: str = None, 709 | concatinate: bool = False, 710 | split: str = 'validation', 711 | ): 712 | """Loads a Huggingface dataset based on the `base` dataset 713 | (squad/xquad/mlqa).""" 714 | 715 | dataset: List[Any] = [] 716 | 717 | data: List 718 | dataset: str 719 | for lang in languages.langs: 720 | if base_dataset is not None: 721 | dataset = f"{base_dataset}.{lang}" 722 | if base_dataset == "mlqa": 723 | dataset = f"{dataset}.{lang}" 724 | 725 | data = datasets.load_dataset(base_dataset, dataset, split=split) 726 | else: 727 | data = datasets.load_dataset(lang, split=split) 728 | 729 | if concatinate: 730 | data = concatinate_squad_data(data, args.nr_concats) 731 | data = convert_dataset_to_torch_format(data) 732 | dataset.append(data) 733 | 734 | return dataset 735 | 736 | 737 | def concatinate_squad_data(d, span=3): 738 | """ 739 | Concatinate "SPAN" number of SQuAD questions together 740 | """ 741 | 742 | def get_span(index, span): 743 | """ 744 | Returns the value in a range for whole numbers 745 | 746 | Ex: index=4, span=5 747 | lower=0, upper=5 748 | 749 | index=5, span=5 750 | lower=0, upper=5 751 | 752 | index=8, span=5 753 | lower=5, upper=10 754 | """ 755 | lower_bound = (index) // span 756 | lower_bound = lower_bound * span 757 | upper_bound = lower_bound + span 758 | return lower_bound, upper_bound 759 | 760 | def set_start_pos(example, idx): 761 | """ 762 | Get correct new starting position when concatinating SQuAD datasets 763 | """ 764 | low, high = get_span(idx, span) 765 | 766 | # Get new starting position 767 | prev_start = 0 768 | if idx != low: 769 | prev_start = len("".join(data["context"][low:idx])) 770 | 771 | start_pos = data["answers"][idx]["answer_start"][0] 772 | if not isinstance(start_pos, int): 773 | start_pos = start_pos[0] 774 | new_start = [prev_start + start_pos] 775 | example["answers"]["answer_start"] = new_start 776 | return example 777 | 778 | def set_context(example, idx): 779 | """ 780 | Concatinate "SPAN" number of SQuAD samples 781 | """ 782 | low, high = get_span(idx, span) 783 | 784 | # Get new context 785 | example["context"] = "".join(data["context"][low:high]) 786 | return example 787 | 788 | # Filters out questions using the same context but different questions 789 | 
788 |     # Keep only the first question for each context (drop repeats of it)
789 |     data = d.filter(
790 |         lambda example, idx: example["context"] != d["context"][idx - 1],
791 |         with_indices=True,
792 |     )
793 | 
794 |     data = data.map(
795 |         lambda example, idx: set_start_pos(example, idx),
796 |         with_indices=True
797 |     )
798 |     data = data.map(
799 |         lambda example, idx: set_context(example, idx),
800 |         with_indices=True
801 |     )
802 | 
803 |     return data
804 | 
805 | 
806 | #################################################################
807 | #
808 | # Main function
809 | #
810 | #################################################################
811 | 
812 | 
813 | def main():
814 | 
815 |     parser = HfArgumentParser(
816 |         (ModelArguments, DataTrainingArguments, TrainingArguments)
817 |     )
818 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
819 | 
820 |     # Needed for getting eval_loss for QA in transformers v3.0.2 and v4.0.0
821 |     training_args.label_names = ["start_positions", "end_positions"]
822 | 
823 |     if (
824 |         os.path.exists(training_args.output_dir)
825 |         and os.listdir(training_args.output_dir)
826 |         and training_args.do_train
827 |         and not training_args.overwrite_output_dir
828 |     ):
829 |         raise ValueError(
830 |             f"Output directory ({training_args.output_dir}) \
831 |             already exists and is not empty. \
832 |             Use --overwrite_output_dir to overcome."
833 |         )
834 | 
835 |     if data_args.data_dir is None:
836 |         data_args.data_dir = "."
837 | 
838 |     if data_args.train_file_path is None or data_args.valid_file_path is None:
839 |         data_args.train_file_path = f"{data_args.data_dir}/train_data.pt"
840 |         data_args.valid_file_path = f"{data_args.data_dir}/val_data.pt"
841 | 
842 |     logger.info("Model parameters set:\n%s", model_args)
843 |     logging.info(f"Logging to file: {training_args.logging_dir}.log")
844 | 
845 |     set_seed(training_args.seed)
846 | 
847 |     tokenizer = AutoTokenizer.from_pretrained(
848 |         model_args.model_name_or_path,
849 |         cache_dir=model_args.cache_dir,
850 |         do_lowercase=args.do_lowercase,
851 |         pad_to_max_length=True,
852 |         max_length=args.max_length,
853 |         truncation=True,
854 |         use_fast=True,
855 |     )
856 | 
857 |     model = AutoModelForQuestionAnswering.from_pretrained(
858 |         model_args.model_name_or_path,
859 |         cache_dir=model_args.cache_dir,
860 |     )
861 | 
862 |     if data_args.datasets == "xquad":
863 |         XQUAD.data = load_datasets(XQUAD, base_dataset="xquad")
864 | 
865 |     if data_args.datasets == "mlqa":
866 |         MLQA.data = load_datasets(MLQA, base_dataset="mlqa")
867 | 
868 |     if data_args.datasets == "tydiqa":
869 |         raise NotImplementedError("tydiqa is not implemented yet")
870 | 
871 |     if data_args.datasets == "xquad_long":
872 |         XQUAD.data = load_datasets(XQUAD, "xquad", concatinate=True)
873 | 
874 |     if data_args.datasets in ["squad_long", "xquad_long"]:
875 |         train_dataset = load_datasets(
876 |             SQUAD, split="train", concatinate=True)[0]
877 |         valid_dataset = load_datasets(SQUAD, concatinate=True)[0]
878 |         SQUAD.data = valid_dataset
879 | 
880 |     if data_args.datasets in ["xquad", "mlqa", "squad"]:
881 |         train_dataset = load_datasets(
882 |             SQUAD, split="train", concatinate=True)[0]
883 |         valid_dataset = load_datasets(SQUAD, concatinate=True)[0]
884 |         SQUAD.data = valid_dataset
885 | 
886 |     torch.save(train_dataset, data_args.train_file_path)
887 |     torch.save(valid_dataset, data_args.valid_file_path)
888 | 
889 |     train_dataset = torch.load(data_args.train_file_path)
890 |     valid_dataset = torch.load(data_args.valid_file_path)
891 | 
892 |     ####################################
893 |     #
894 |     # Train the model
895 |     #
896 |     ####################################
897 | 
898 |     if training_args.do_train:
899 | 
900 |         trainer = Trainer(
901 |             model=model,
902 |             args=training_args,
903 |             train_dataset=train_dataset,
904 |             eval_dataset=valid_dataset,
905 |             data_collator=DummyDataCollator(),
906 |             prediction_loss_only=True,
907 |         )
908 | 
909 |         if training_args.do_train:
910 |             trainer.train(
911 |                 model_path=model_args.model_name_or_path
912 |                 if os.path.isdir(model_args.model_name_or_path)
913 |                 else None
914 |             )
915 |             trainer.save_model()
916 |             if trainer.is_world_process_zero():
917 |                 tokenizer.save_pretrained(training_args.output_dir)
918 | 
919 |         results = {}
920 |         if training_args.do_eval and training_args.local_rank in [-1, 0]:
921 |             logger.info("*** Evaluation ***")
922 | 
923 |             eval_output = trainer.evaluate()
924 |             output_eval_file = os.path.join(
925 |                 training_args.output_dir, "eval_results.txt"
926 |             )
927 |             print("\n==========================================\n")
928 |             print("Eval output: ", eval_output)
929 |             print("\n==========================================\n")
930 | 
931 |             with open(output_eval_file, "w") as writer:
932 |                 logger.info("***** Eval results *****")
933 |                 for key in sorted(eval_output.keys()):
934 |                     logger.info(" %s = %s", key, str(eval_output[key]))
935 |                     writer.write("%s = %s\n" % (key, str(eval_output[key])))
936 |                     print(key, str(eval_output[key]))
937 | 
938 |             results.update(eval_output)
939 | 
940 |             logging.info("=" * 45)
941 |             logging.info("Results from evaluation:")
942 |             logging.info(results)
943 |             logging.info("\n")
944 | 
945 |             logging.info("=" * 45)
946 | 
947 |     ####################################
948 |     #
949 |     # Evaluate the trained model
950 |     #
951 |     ####################################
952 | 
953 |     if training_args.do_train:
954 |         tokenizer = AutoTokenizer.from_pretrained(
955 |             training_args.output_dir,
956 |             use_fast=True,
957 |             do_lowercase=args.do_lowercase
958 |         )
959 |         model = AutoModelForQuestionAnswering.from_pretrained(
960 |             training_args.output_dir,
961 |         )
962 |     else:
963 |         try:
964 |             model_path = training_args.output_dir
965 |             tokenizer = AutoTokenizer.from_pretrained(
966 |                 training_args.output_dir,
967 |                 use_fast=True,
968 |                 do_lowercase=args.do_lowercase
969 |             )
970 |             model = AutoModelForQuestionAnswering.from_pretrained(
971 |                 training_args.output_dir,
972 |             )
973 |         except OSError:
974 |             model_path = model_args.model_name_or_path
975 |             tokenizer = AutoTokenizer.from_pretrained(
976 |                 model_path, use_fast=True, do_lowercase=args.do_lowercase
977 |             )
978 |             model = AutoModelForQuestionAnswering.from_pretrained(
979 |                 model_path
980 |             )
981 | 
982 |     model = model.cuda()
983 |     model.eval()
984 | 
985 |     get_squad_evaluation(
986 |         SQUAD.data,
987 |         model,
988 |         tokenizer,
989 |         SQUAD.text_on_eval[0],
990 |         training_args.per_device_eval_batch_size,
991 |     )
992 |     if data_args.datasets == "xquad" or data_args.datasets == "xquad_long":
993 |         for i, _ in enumerate(XQUAD.langs):
994 |             get_squad_evaluation(
995 |                 XQUAD.data[i],
996 |                 model,
997 |                 tokenizer,
998 |                 XQUAD.text_on_eval[i],
999 |                 training_args.per_device_eval_batch_size,
1000 |             )
1001 |     elif data_args.datasets == "mlqa":
1002 |         for i, _ in enumerate(MLQA.langs):
1003 |             get_squad_evaluation(
1004 |                 MLQA.data[i],
1005 |                 model,
1006 |                 tokenizer,
1007 |                 MLQA.text_on_eval[i],
1008 |                 training_args.per_device_eval_batch_size,
1009 |             )
1010 | 
1011 |     elif data_args.datasets == "trivia_qa":
1012 |         pass
1013 | 
1014 |     elif data_args.datasets not in ["squad", "squad_long"]:
1015 |         print("Not a valid eval dataset...\n Exiting")
1016 | 
1017 | 
1018 | if __name__ == "__main__":
1019 |     main()
1020 | 
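# Example invocation (illustrative values; every flag is either defined in
# the argparse section above or comes from Huggingface TrainingArguments --
# adjust paths and hyperparameters to your own setup):
#
#   python3 scripts/finetune_qa_models.py \
#       --model_name_or_path xlm-roberta-base \
#       --model_type xlm-roberta \
#       --datasets squad \
#       --output_dir /workspace/models/xlmr-squad \
#       --logging_dir /workspace/logs/xlmr-squad \
#       --do_train --do_eval \
#       --per_device_train_batch_size 8 \
#       --learning_rate 5e-5 \
#       --num_train_epochs 3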
-------------------------------------------------------------------------------- /notebooks/Convert to Long.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import logging\n", 10 | "import os\n", 11 | "import math\n", 12 | "from dataclasses import dataclass, field\n", 13 | "\n", 14 | "import copy # for deep copy\n", 15 | "\n", 16 | "import torch\n", 17 | "from torch import nn\n", 18 | "from transformers import RobertaForMaskedLM, RobertaTokenizerFast, TextDataset, DataCollatorForLanguageModeling, Trainer\n", 19 | "from transformers import TrainingArguments, HfArgumentParser\n", 20 | "from transformers.modeling_longformer import LongformerSelfAttention\n", 21 | "\n", 22 | "logger = logging.getLogger(__name__)\n", 23 | "logging.basicConfig(level=logging.INFO)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 22, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "class RobertaLongSelfAttention(LongformerSelfAttention): \n", 33 | " def forward(\n", 34 | " self,\n", 35 | " hidden_states, attention_mask=None, is_index_masked=None, is_index_global_attn=None, is_global_attn=None\n", 36 | " ):\n", 37 | " return super().forward(hidden_states, attention_mask=attention_mask)\n", 38 | "\n", 39 | "class RobertaLongForMaskedLM(RobertaForMaskedLM):\n", 40 | " def __init__(self, config):\n", 41 | " super().__init__(config)\n", 42 | " for i, layer in enumerate(self.roberta.encoder.layer):\n", 43 | " # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`\n", 44 | " layer.attention.self = RobertaLongSelfAttention(config, layer_id=i)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 23, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "def create_long_model(save_model_to, attention_window, max_pos):\n", 54 | " model = RobertaForMaskedLM.from_pretrained('roberta-base')\n", 55 | " tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=max_pos)\n", 56 | " config = model.config\n", 57 | "\n", 58 | " # extend position embeddings\n", 59 | " tokenizer.model_max_length = max_pos\n", 60 | " tokenizer.init_kwargs['model_max_length'] = max_pos\n", 61 | " current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape\n", 62 | " max_pos += 2 # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2\n", 63 | " config.max_position_embeddings = max_pos\n", 64 | " assert max_pos > current_max_pos\n", 65 | " # allocate a larger position embedding matrix\n", 66 | " new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(max_pos, embed_size)\n", 67 | " # copy position embeddings over and over to initialize the new position embeddings\n", 68 | " k = 2\n", 69 | " step = current_max_pos - 2\n", 70 | " while k < max_pos - 1:\n", 71 | " new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]\n", 72 | " k += step\n", 73 | " \n", 74 | " model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed\n", 75 | " model.roberta.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)\n", 76 | " \"\"\"\n", 77 | " model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed # add after this line\n", 78 | " model.roberta.embeddings.position_embeddings.num_embeddings = 
len(new_pos_embed.data)\n", 79 | " # first, check that model.roberta.embeddings.position_embeddings.weight.data.shape is correct — has to be 4096 (default) of your desired length\n", 80 | " model.roberta.embeddings.position_ids = torch.arange(0, model.roberta.embeddings.position_embeddings.num_embeddings)[None]\n", 81 | " \"\"\"\n", 82 | " \n", 83 | " \n", 84 | " # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`\n", 85 | " config.attention_window = [attention_window] * config.num_hidden_layers\n", 86 | " for i, layer in enumerate(model.roberta.encoder.layer):\n", 87 | " longformer_self_attn = LongformerSelfAttention(config, layer_id=i)\n", 88 | " longformer_self_attn.query = copy.deepcopy(layer.attention.self.query)\n", 89 | " longformer_self_attn.key = copy.deepcopy(layer.attention.self.key)\n", 90 | " longformer_self_attn.value = copy.deepcopy(layer.attention.self.value)\n", 91 | "\n", 92 | " longformer_self_attn.query_global = copy.deepcopy(layer.attention.self.query)\n", 93 | " longformer_self_attn.key_global = copy.deepcopy(layer.attention.self.key)\n", 94 | " longformer_self_attn.value_global = copy.deepcopy(layer.attention.self.value)\n", 95 | "\n", 96 | " \"\"\"\n", 97 | " longformer_self_attn = LongformerSelfAttention(config, layer_id=i)\n", 98 | " longformer_self_attn.query = layer.attention.self.query\n", 99 | " longformer_self_attn.key = layer.attention.self.key\n", 100 | " longformer_self_attn.value = layer.attention.self.value\n", 101 | "\n", 102 | " longformer_self_attn.query_global = layer.attention.self.query\n", 103 | " longformer_self_attn.key_global = layer.attention.self.key\n", 104 | " longformer_self_attn.value_global = layer.attention.self.value\n", 105 | " \"\"\"\n", 106 | "\n", 107 | " layer.attention.self = longformer_self_attn\n", 108 | "\n", 109 | " logger.info(f'saving model to {save_model_to}')\n", 110 | " model.save_pretrained(save_model_to)\n", 111 | " tokenizer.save_pretrained(save_model_to)\n", 112 | " return model, tokenizer" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 24, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "def copy_proj_layers(model):\n", 122 | " for i, layer in enumerate(model.roberta.encoder.layer):\n", 123 | " layer.attention.self.query_global = layer.attention.self.query\n", 124 | " layer.attention.self.key_global = layer.attention.self.key\n", 125 | " layer.attention.self.value_global = layer.attention.self.value\n", 126 | " return model" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 25, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):\n", 136 | " val_dataset = TextDataset(tokenizer=tokenizer,\n", 137 | " file_path=args.val_datapath,\n", 138 | " block_size=tokenizer.max_len)\n", 139 | " if eval_only:\n", 140 | " train_dataset = val_dataset\n", 141 | " else:\n", 142 | " logger.info(f'Loading and tokenizing training data is usually slow: {args.train_datapath}')\n", 143 | " train_dataset = TextDataset(tokenizer=tokenizer,\n", 144 | " file_path=args.train_datapath,\n", 145 | " block_size=tokenizer.max_len)\n", 146 | "\n", 147 | " data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)\n", 148 | " trainer = Trainer(model=model, args=args, data_collator=data_collator,\n", 149 | " train_dataset=train_dataset, eval_dataset=val_dataset, prediction_loss_only=True)\n", 150 | "\n", 151 | " 
eval_loss = trainer.evaluate()\n", 152 | " eval_loss = eval_loss['eval_loss']\n", 153 | " logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')\n", 154 | " \n", 155 | " if not eval_only:\n", 156 | " trainer.train(model_path=model_path)\n", 157 | " trainer.save_model()\n", 158 | "\n", 159 | " eval_loss = trainer.evaluate()\n", 160 | " eval_loss = eval_loss['eval_loss']\n", 161 | " logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 26, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "@dataclass\n", 171 | "class ModelArgs:\n", 172 | " attention_window: int = field(default=512, metadata={\"help\": \"Size of attention window\"})\n", 173 | " max_pos: int = field(default=4096, metadata={\"help\": \"Maximum position\"})\n", 174 | "\n", 175 | "parser = HfArgumentParser((TrainingArguments, ModelArgs,))\n", 176 | "\n", 177 | "\n", 178 | "training_args, model_args = parser.parse_args_into_dataclasses(look_for_args_file=False, args=[\n", 179 | " '--output_dir', 'tmp',\n", 180 | " '--warmup_steps', '500',\n", 181 | " '--learning_rate', '0.00003',\n", 182 | " '--weight_decay', '0.01',\n", 183 | " '--adam_epsilon', '1e-6',\n", 184 | " '--max_steps', '3000',\n", 185 | " '--logging_steps', '500',\n", 186 | " '--save_steps', '500',\n", 187 | " '--max_grad_norm', '5.0',\n", 188 | " '--per_gpu_eval_batch_size', '8',\n", 189 | " '--per_gpu_train_batch_size', '2', # 32GB gpu with fp32\n", 190 | " '--gradient_accumulation_steps', '32',\n", 191 | " '--evaluate_during_training',\n", 192 | " '--do_train',\n", 193 | " '--do_eval',\n", 194 | "])\n", 195 | "training_args.val_datapath = '/workspace/data/wikitext-103-raw/wiki.valid.raw'\n", 196 | "training_args.train_datapath = '/workspace/data/wikitext-103-raw/wiki.train.raw'\n", 197 | "\n", 198 | "# Choose GPU\n", 199 | "import os\n", 200 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 27, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stderr", 210 | "output_type": "stream", 211 | "text": [ 212 | "Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']\n", 213 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", 214 | "INFO:__main__:Evaluating roberta-base (seqlen: 512) for refernece ...\n", 215 | "INFO:filelock:Lock 140125418510600 acquired on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_510_wiki.valid.raw.lock\n", 216 | "INFO:filelock:Lock 140125418510600 released on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_510_wiki.valid.raw.lock\n", 217 | "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.\n" 218 | ] 219 | }, 220 | { 221 | "data": { 222 | "text/html": [ 223 | "\n", 224 | "
\n", 225 | " \n", 234 | " \n", 235 | " \n", 236 | " [62/62 00:07]\n", 237 | "
\n", 238 | " " 239 | ], 240 | "text/plain": [ 241 | "" 242 | ] 243 | }, 244 | "metadata": {}, 245 | "output_type": "display_data" 246 | }, 247 | { 248 | "name": "stderr", 249 | "output_type": "stream", 250 | "text": [ 251 | "INFO:__main__:Initial eval bpc: 2.549888218283919\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "roberta_base = RobertaForMaskedLM.from_pretrained('roberta-base')\n", 257 | "roberta_base_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')\n", 258 | "logger.info('Evaluating roberta-base (seqlen: 512) for refernece ...')\n", 259 | "pretrain_and_evaluate(training_args, roberta_base, roberta_base_tokenizer, eval_only=True, model_path=None)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 28, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "name": "stderr", 269 | "output_type": "stream", 270 | "text": [ 271 | "INFO:__main__:Converting roberta-base into roberta-base-4096\n", 272 | "Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']\n", 273 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", 274 | "INFO:__main__:saving model to tmp/roberta-base-4096\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "model_path = f'{training_args.output_dir}/roberta-base-{model_args.max_pos}'\n", 280 | "if not os.path.exists(model_path):\n", 281 | " os.makedirs(model_path)\n", 282 | "\n", 283 | "logger.info(f'Converting roberta-base into roberta-base-{model_args.max_pos}')\n", 284 | "model, tokenizer = create_long_model(\n", 285 | " save_model_to=model_path, attention_window=model_args.attention_window, max_pos=model_args.max_pos)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 29, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "'\\nSelf = \\n \\n (query): Linear(in_features=768, out_features=768, bias=True)\\n (key): Linear(in_features=768, out_features=768, bias=True)\\n (value): Linear(in_features=768, out_features=768, bias=True)\\n (query_global): Linear(in_features=768, out_features=768, bias=True)\\n (key_global): Linear(in_features=768, out_features=768, bias=True)\\n (value_global): Linear(in_features=768, out_features=768, bias=True)\\n'" 297 | ] 298 | }, 299 | "execution_count": 29, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "\"\"\"\n", 306 | "Self = \n", 307 | " \n", 308 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 309 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 310 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 311 | " (query_global): Linear(in_features=768, out_features=768, bias=True)\n", 312 | " (key_global): Linear(in_features=768, out_features=768, bias=True)\n", 313 | " (value_global): Linear(in_features=768, out_features=768, bias=True)\n", 314 | "\"\"\"" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 30, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stderr", 324 | "output_type": "stream", 325 | "text": [ 326 | "INFO:__main__:Loading the model from tmp/roberta-base-4096\n" 327 | ] 328 | } 329 | ], 330 | "source": [ 331 | "logger.info(f'Loading the model from {model_path}')\n", 332 | "tokenizer = RobertaTokenizerFast.from_pretrained(model_path)\n", 333 | "model = 
RobertaLongForMaskedLM.from_pretrained(model_path)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 31, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stderr", 343 | "output_type": "stream", 344 | "text": [ 345 | "INFO:__main__:Pretraining roberta-base-4096 ... \n", 346 | "INFO:filelock:Lock 140124002609248 acquired on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_4094_wiki.valid.raw.lock\n", 347 | "INFO:filelock:Lock 140124002609248 released on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_4094_wiki.valid.raw.lock\n", 348 | "INFO:__main__:Loading and tokenizing training data is usually slow: /workspace/data/wikitext-103-raw/wiki.train.raw\n", 349 | "INFO:filelock:Lock 140125403321344 acquired on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_4094_wiki.train.raw.lock\n", 350 | "INFO:filelock:Lock 140125403321344 released on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_4094_wiki.train.raw.lock\n", 351 | "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.\n" 352 | ] 353 | }, 354 | { 355 | "ename": "TypeError", 356 | "evalue": "forward() takes from 2 to 6 positional arguments but 7 were given", 357 | "output_type": "error", 358 | "traceback": [ 359 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 360 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 361 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtraining_args\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m3\u001b[0m \u001b[0;31m## <<<<<<<<<<<<<<<<<<<<<<<< REMOVE THIS <<<<<<<<<<<<<<<<<<<<<<<<\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'magic'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mpretrain_and_evaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meval_only\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtraining_args\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 362 | "\u001b[0;32m\u001b[0m in \u001b[0;36mpretrain_and_evaluate\u001b[0;34m(args, model, tokenizer, eval_only, model_path)\u001b[0m\n\u001b[1;32m 15\u001b[0m train_dataset=train_dataset, eval_dataset=val_dataset, prediction_loss_only=True)\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0meval_loss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0meval_loss\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0meval_loss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'eval_loss'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'Initial eval bpc: {eval_loss/math.log(2)}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 363 | "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, eval_dataset)\u001b[0m\n\u001b[1;32m 1311\u001b[0m \u001b[0;31m# No point gathering the predictions if there are no metrics, otherwise we defer to\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1312\u001b[0m \u001b[0;31m# self.args.prediction_loss_only\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1313\u001b[0;31m \u001b[0mprediction_loss_only\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_metrics\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1314\u001b[0m )\n\u001b[1;32m 1315\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 364 | "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mprediction_loop\u001b[0;34m(self, dataloader, description, prediction_loss_only)\u001b[0m\n\u001b[1;32m 1415\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1416\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataloader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1417\u001b[0;31m \u001b[0mloss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprediction_step\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprediction_loss_only\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1418\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1419\u001b[0m \u001b[0mlosses\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrepeat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 365 | "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mprediction_step\u001b[0;34m(self, model, inputs, prediction_loss_only)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1510\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 
[... traceback truncated; ANSI escape codes stripped. The stack repeatedly passes through torch/nn/modules/module.py (_call_impl) and transformers/modeling_roberta.py (forward) and fails in the self-attention layer with the error below ...]\n",
377 | "TypeError: forward() takes from 2 to 6 positional arguments but 7 were given"
378 | ]
379 | }
380 | ],
381 | "source": [
382 | "logger.info(f'Pretraining roberta-base-{model_args.max_pos} ... ')\n",
383 | "\n",
384 | "training_args.max_steps = 3 ## <<<<<<<<<<<<<<<<<<<<<<<< REMOVE THIS <<<<<<<<<<<<<<<<<<<<<<<<\n",
385 | "%magic\n",
386 | "pretrain_and_evaluate(training_args, model, tokenizer, eval_only=False, model_path=training_args.output_dir)"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "logger.info(f'Copying local projection layers into global projection layers ... ')\n",
396 | "model = copy_proj_layers(model)\n",
397 | "logger.info(f'Saving model to {model_path}')\n",
398 | "model.save_pretrained(model_path)\n"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "logger.info(f'Loading the model from {model_path}')\n",
408 | "tokenizer = RobertaTokenizerFast.from_pretrained(model_path)\n",
409 | "model = RobertaLongForMaskedLM.from_pretrained(model_path)"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {},
416 | "outputs": [],
417 | "source": []
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": null,
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "import transformers"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": null,
431 | "metadata": {},
432 | "outputs": [],
433 | "source": [
434 | "transformers.__version__"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": null,
440 | "metadata": {},
441 | "outputs": [],
442 | "source": [
443 | "model.roberta.embeddings"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": null,
449 | "metadata": {},
450 | "outputs": [],
451 | "source": [
452 | "model.roberta.embeddings.position_embeddings"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "metadata": {},
459 | "outputs": [],
460 | "source": [
461 | "model.roberta.embeddings.position_embeddings.num_embeddings"
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": null,
467 | "metadata": {},
468 | "outputs": [],
469 | "source": [
470 | "model.roberta.embeddings.position_embeddings.num_embeddings"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "torch.cop"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": null,
485 | "metadata": {},
486 | "outputs": [],
487 | "source": []
488 | }
489 | ],
490 | "metadata": {
491 | "kernelspec": {
492 | "display_name": "Python 3",
493 | "language": "python",
494 | "name": "python3"
495 | },
496 | "language_info": {
497 | "codemirror_mode": {
498 | "name": "ipython",
499 | "version": 3
500 | },
501 | "file_extension": ".py",
502 | "mimetype": "text/x-python",
503 | "name": "python",
504 | "nbconvert_exporter": "python",
505 | "pygments_lexer": "ipython3",
506 | "version": "3.6.9"
507 | }
508 | },
509 | "nbformat": 4,
510 | "nbformat_minor": 4
511 | }
512 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | 2 | # Multilingual Longformer
3 | Master thesis work investigating whether, and how well, multilingual models can incorporate longer contexts for low-resource languages (such as Swedish) without re-training the models from scratch on long-context datasets in each language. The goal was to investigate whether a multilingual model, such as XLM-R, could be extended into a Longformer model by pre-training on long contexts in English only, while still yielding a long-context model in several languages.
4 | 
5 | The scripts provided include the necessary steps to reproduce the results presented in the master thesis. We convert monolingual and multilingual language models pre-trained on English into Longformer models with a maximum model length of 4096 tokens.
6 | 
7 | **We call the models pre-trained with the Longformer pre-training scheme**:
8 | 1. RoBERTa-Long
9 | 2. XLM-Long (weights and config are available on Huggingface [here](https://huggingface.co/markussagen/xlm-roberta-longformer-base-4096))
10 | 
11 | They are based on a RoBERTa and an XLM-R model, respectively, pre-trained using the Longformer pre-training scheme.
12 | 
13 | Training of all models is done through Docker containers for reproducibility.
14 | 
15 | ## Usage and Setup
16 | Below is an example of how to build, start, run and shut down the Docker container and the training script.
17 | If you encounter problems, toggle the `Technical Requirements` and `Pre-Requisites` links to verify that you have a sufficiently large GPU and the pre-requisite applications/libraries installed.
18 | 19 |
<details><summary>Technical Requirements</summary> 20 |

21 | **Please Note**:
22 | Running the following project is quite computationally expensive. The pre-training requires a Docker container with at least 90GB of RAM allocated and a CUDA-enabled GPU with 48GB of memory!
23 | 24 | For the fine-tuning on QA tasks, 32GB of RAM is sufficient, and a smaller GPU can be used when fine-tuning on regular or multilingual SQuAD. The datasets created with a longer context, however, require at least 32GB of RAM.
25 |
</details>
26 |
27 | 28 | 29 |
<details><summary>Pre-Requisites</summary> 30 |

31 | The following applications and libraries need to be installed in order to run the application:
32 | - [Docker](https://docs.docker.com/get-docker/)
33 | - [Docker Compose](https://docs.docker.com/compose/install/)
34 | - Miniconda or Anaconda with Python3
35 | - make (terminal command)
36 | - wget (terminal command)
37 | - unzip (terminal command)
38 | - tmux (terminal command)
39 | - CUDA-enabled GPU (check if it is set up correctly by entering `nvidia-smi` in your terminal)
40 | - [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html) installed and linked to your Docker container (needed if you encounter the error: ```ERROR: for XXX_markussagen_repl1 Cannot create container for service repl: Unknown runtime specified nvidia```)
41 |
</details>
42 |
43 | 44 | 45 | 1. **Download the repo**
46 | 
47 |         git clone git@github.com:MarkusSagen/Master-Thesis-Multilingual-Longformer.git
48 |         cp .env.template .env
49 | 
50 | 2. **Download the dataset**
51 |    Unzip the dataset and then place it in a suitable location.
52 | 
53 |         wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
54 |         unzip wikitext-103-raw-v1.zip
55 | 
56 | 3. **Change your model and dataset paths**
57 |    Open the `.env` file and change `DATA_DIR` and `MODEL_DIR` to the paths where you want your models stored and where you downloaded the dataset. Make sure that the folders you set exist on your system.
58 |    For instance:
59 | 
60 |         DATA_DIR=/Users/admin/data/wikitext-103-raw
61 |         MODEL_DIR=/Users/admin/model
62 | 4. **Start the docker container**
63 | 
64 |         make build && make up
65 | 5. **Start tmux**
66 |    In your terminal, start tmux. This ensures that your runs are not stopped if you are disconnected from an ssh session.
67 | 
68 |         tmux
69 | 6. **Run the script**
70 |    Here is an example of what a training script might look like for pre-training an XLM-R model into a Longformer. The general format follows the parameters of [Huggingface Transformers' TrainingArguments](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments).
71 | 
72 |         export SEED=42
73 |         export MAX_LENGTH=4096
74 |         export MODEL_NAME_OR_PATH=xlm-roberta-base
75 |         export MODEL_NAME=xlm-roberta-to-longformer
76 |         export MODEL_DIR=/workspace/models
77 |         export DATA_DIR=/workspace/data
78 |         export LOG_DIR=/workspace/logs
79 | 
80 |         make repl run="scripts/run_long_lm.py \
81 |         --model_name_or_path $MODEL_NAME_OR_PATH \
82 |         --model_name $MODEL_NAME \
83 |         --output_dir $MODEL_DIR/$MODEL_NAME \
84 |         --logging_dir $LOG_DIR/$MODEL_NAME \
85 |         --val_file_path $DATA_DIR/wiki.valid.raw \
86 |         --train_file_path $DATA_DIR/wiki.train.raw \
87 |         --seed $SEED \
88 |         --model_max_length $MAX_LENGTH \
89 |         --adam_epsilon 1e-8 \
90 |         --warmup_steps 500 \
91 |         --learning_rate 3e-5 \
92 |         --weight_decay 0.01 \
93 |         --max_steps 6000 \
94 |         --evaluate_during_training \
95 |         --logging_steps 50 \
96 |         --eval_steps 50 \
97 |         --save_steps 6000 \
98 |         --max_grad_norm 1.0 \
99 |         --per_device_eval_batch_size 2 \
100 |         --per_device_train_batch_size 1 \
101 |         --gradient_accumulation_steps 64 \
102 |         --overwrite_output_dir \
103 |         --fp16 \
104 |         --do_train \
105 |         --do_eval
106 |         "
107 | 7. **Shut down run and container**
108 | 
109 |         make down
110 | 
111 | 8. **(Optional) terminate tmux**
112 | 
113 |         exit
114 | 
115 | ## Training and Evaluation in-depth
116 | The training of these models was done in two steps:
117 | 1. Pre-train `RoBERTa-base` and `XLM-R-base` models into Longformer models
118 | 2. Fine-tune regular RoBERTa and XLM-R models on SQuAD-formatted datasets. Compare the results of these with our Longformer-trained models and the Longformer model released by the Longformer authors. We train these models with multiple different seeds, datasets and context lengths.
119 | 120 | We have grouped each model trained and evaluated based on:
121 | - The dataset and language used for each model
122 | - The model that was trained
123 | 
124 | ## Pre-train: Transfer Long-Context of Language Models
125 | 
126 | The models were trained according to this structure:
127 | 
128 |     English Pre-training
129 |     |-- Wikitext-103
130 |         |-- RoBERTa-Long (4096)
131 |         |-- XLM-Long (4096)
132 | 133 | 134 | 
135 | Each fine-tuning run is grouped based on the dataset, language and context length, and then evaluated for each model. For a more in-depth explanation of the pre-training script and parameters, see [Here](Pretraining_Details.md).
136 | 137 |
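Conceptually, the conversion has two halves: first the model's 512 learned position embeddings are stretched to 4096 positions by copying them over and over, then each layer's self-attention is swapped for Longformer's sliding-window attention. The snippet below is a minimal sketch of the first half only, assuming `torch` and `transformers` are installed; the variable names are illustrative and this is not the exact code in `scripts/run_long_lm.py`.

```python
# Sketch: tile RoBERTa/XLM-R's learned 512-token position embeddings up to
# 4096 positions. Illustrative only; see scripts/run_long_lm.py for the
# actual conversion used in the thesis.
import torch
from transformers import XLMRobertaForMaskedLM, XLMRobertaTokenizerFast

model = XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-base")
tokenizer = XLMRobertaTokenizerFast.from_pretrained(
    "xlm-roberta-base", model_max_length=4096
)

max_pos = 4096 + 2  # RoBERTa-style models reserve position ids 0 and 1
embeddings = model.roberta.embeddings

with torch.no_grad():
    old = embeddings.position_embeddings.weight        # shape (514, hidden_size)
    new = old.new_empty(max_pos, old.shape[1])
    new[:2] = old[:2]                                  # keep the two reserved slots
    k, step = 2, old.shape[0] - 2                      # step = the 512 learned positions
    while k < max_pos:
        span = min(step, max_pos - k)
        new[k:k + span] = old[2:2 + span]              # copy the learned block repeatedly
        k += span
    embeddings.position_embeddings.weight.data = new
    embeddings.position_embeddings.num_embeddings = max_pos

model.config.max_position_embeddings = max_pos
# Depending on the transformers version, the registered position_ids buffer
# may also need to be rebuilt to cover the new maximum length:
if hasattr(embeddings, "position_ids"):
    embeddings.position_ids = torch.arange(max_pos).unsqueeze(0)
```

The essential second half, replacing every self-attention module with a `LongformerSelfAttention` equivalent so the model can actually attend over the longer input, is omitted here; see [Pretraining_Details.md](Pretraining_Details.md) and the Longformer authors' conversion script.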
<details><summary>Runs:</summary> 138 |

139 | 140 |

<details><summary>Wikitext-103</summary> 141 |

142 | 143 | ##### RoBERTa 144 | 145 | export SEED=42 146 | export MAX_LENGTH=4096 147 | export MODEL_DIR=/workspace/models 148 | export MODEL_NAME_OR_PATH=roberta-base 149 | export MODEL_NAME=$MODEL_NAME_OR_PATH-long 150 | export DATA_DIR=/workspace/data 151 | export LOG_DIR=/workspace/logs 152 | 153 | make repl run="scripts/run_long_lm.py \ 154 | --model_name_or_path $MODEL_NAME_OR_PATH \ 155 | --model_name $MODEL_NAME \ 156 | --output_dir $MODEL_DIR/$MODEL_NAME \ 157 | --logging_dir $LOG_DIR/$MODEL_NAME \ 158 | --val_file_path $DATA_DIR/wiki.valid.raw \ 159 | --train_file_path $DATA_DIR/wiki.train.raw \ 160 | --seed $SEED \ 161 | --model_max_length $MAX_LENGTH \ 162 | --adam_epsilon 1e-8 \ 163 | --warmup_steps 500 \ 164 | --learning_rate 3e-5 \ 165 | --weight_decay 0.01 \ 166 | --max_steps 6000 \ 167 | --evaluate_during_training \ 168 | --logging_steps 50 \ 169 | --eval_steps 50 \ 170 | --save_steps 500 \ 171 | --max_grad_norm 1.0 \ 172 | --per_device_eval_batch_size 2 \ 173 | --per_device_train_batch_size 1 \ 174 | --gradient_accumulation_steps 64 \ 175 | --overwrite_output_dir \ 176 | --fp16 \ 177 | --do_train \ 178 | --do_eval 179 | " 180 | 181 | 182 | ##### XLM-R 183 | 184 | export SEED=42 185 | export MAX_LENGTH=4096 186 | export MODEL_DIR=/workspace/models 187 | export MODEL_NAME_OR_PATH=xlm-roberta-base 188 | export MODEL_NAME=$MODEL_NAME_OR_PATH-long 189 | export DATA_DIR=/workspace/data 190 | export LOG_DIR=/workspace/logs 191 | 192 | make repl run="scripts/run_long_lm.py \ 193 | --model_name_or_path $MODEL_NAME_OR_PATH \ 194 | --model_name $MODEL_NAME \ 195 | --output_dir $MODEL_DIR/$MODEL_NAME \ 196 | --logging_dir $LOG_DIR/$MODEL_NAME \ 197 | --val_file_path $DATA_DIR/wiki.valid.raw \ 198 | --train_file_path $DATA_DIR/wiki.train.raw \ 199 | --seed $SEED \ 200 | --model_max_length $MAX_LENGTH \ 201 | --adam_epsilon 1e-8 \ 202 | --warmup_steps 500 \ 203 | --learning_rate 3e-5 \ 204 | --weight_decay 0.01 \ 205 | --max_steps 6000 \ 206 | --evaluate_during_training \ 207 | --logging_steps 50 \ 208 | --eval_steps 50 \ 209 | --save_steps 500 \ 210 | --max_grad_norm 1.0 \ 211 | --per_device_eval_batch_size 2 \ 212 | --per_device_train_batch_size 1 \ 213 | --gradient_accumulation_steps 64 \ 214 | --overwrite_output_dir \ 215 | --fp16 \ 216 | --do_train \ 217 | --do_eval 218 | " 219 | 220 |
</details>
221 |
222 | 223 |
</details>
224 |
225 | 226 | 227 | ## Fine-Tune on Question Answering Tasks
228 | 229 | 
230 | English QA Fine-Tuning:
231 | |-- SQuAD
232 |     |-- RoBERTa (512)
233 |     |-- Longformer (4096)
234 |     |-- RoBERTa-Long (4096)
235 |     |-- XLM-R (512)
236 |     |-- XLM-Long (4096)
237 | |-- SQ3 (512)
238 |     |-- RoBERTa (512)
239 |     |-- Longformer (4096)
240 |     |-- RoBERTa-Long (4096)
241 |     |-- XLM-R (512)
242 |     |-- XLM-Long (4096)
243 | |-- SQ3 (2048)
244 |     |-- RoBERTa (512)
245 |     |-- Longformer (4096)
246 |     |-- RoBERTa-Long (4096)
247 |     |-- XLM-R (512)
248 |     |-- XLM-Long (4096)
249 | |-- TriviaQA = TODO
250 | 
251 | Multilingual QA Fine-Tuning:
252 | |-- XQuAD
253 |     |-- RoBERTa (512)
254 |     |-- XLM-R (512)
255 |     |-- XLM-Long (4096)
256 | |-- XQ3 (512)
257 |     |-- XLM-R (512)
258 |     |-- XLM-Long (4096)
259 | |-- XQ3 (2048)
260 |     |-- XLM-R (512)
261 |     |-- XLM-Long (4096)
262 | |-- MLQA
263 |     |-- XLM-R (512)
264 |     |-- XLM-Long (4096)
265 | 266 | 
267 | We fine-tune the models on SQuAD-formatted extractive question-answering datasets in English and multiple other languages. We also create concatenated datasets with longer contexts (SQ3 and XQ3) for both SQuAD and XQuAD (multilingual SQuAD). The datasets are provided through Huggingface's Datasets library.
268 | 
269 | For more in-depth information regarding the fine-tuning scripts, parameters and evaluation setup, see [Here](Finetuning_Details.md).
270 | 271 |
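To make the concatenated datasets concrete, here is a hypothetical sketch of how one longer-context example could be built from SQuAD with Huggingface's Datasets library. The real construction lives in `scripts/finetune_qa_models.py` (its `--nr_concats` flag appears to control how many contexts are joined); the function below, its distractor sampling and its field handling are illustrative assumptions, not the thesis implementation.

```python
# Hypothetical sketch: pad a SQuAD example with distractor contexts so the
# gold answer sits inside a much longer context. Illustrative only.
import random
from datasets import load_dataset

def concat_contexts(example, pool, nr_concats=3, seed=42):
    """Join (nr_concats - 1) distractor contexts around one SQuAD example."""
    rng = random.Random(seed)
    candidates = [c for c in pool if c != example["context"]]
    distractors = rng.sample(candidates, nr_concats - 1)
    pieces = distractors + [example["context"]]
    rng.shuffle(pieces)
    gold_idx = pieces.index(example["context"])
    offset = sum(len(p) + 1 for p in pieces[:gold_idx])  # +1 per joining space
    return {
        "question": example["question"],
        "context": " ".join(pieces),
        # shift the gold answer span to its position in the longer context
        "answers": {
            "text": example["answers"]["text"],
            "answer_start": [s + offset for s in example["answers"]["answer_start"]],
        },
    }

squad = load_dataset("squad", split="train[:100]")
long_example = concat_contexts(squad[0], squad["context"])
```

Getting the `answer_start` offset right is what keeps such concatenated datasets valid for extractive QA: the answer text is unchanged, but its character position moves by the length of every context placed before the original one.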
<details><summary>Runs:</summary> 272 |

273 | 274 | Each fine-tuning run is grouped based on the dataset, language and context length, and then evaluated for each model.
275 | 
276 | ### English
277 | 278 |

<details><summary>SQuAD</summary> 279 |

280 | 281 | ##### RoBERTa 282 | 283 | export SEED=42 284 | export DATASET=squad 285 | export MODEL_DIR=/workspace/models 286 | export MODEL_NAME_OR_PATH=roberta-base 287 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 288 | export LOG_DIR=/workspace/logs 289 | export DATA_DIR=/workspace/data 290 | # Debugging 291 | CUDA_LAUNCH_BLOCKING=1 292 | # model args 293 | make repl run="scripts/finetune_qa_models.py \ 294 | --model_name_or_path $MODEL_NAME_OR_PATH \ 295 | --output_dir $MODEL_DIR/$MODEL_NAME \ 296 | --logging_dir $LOG_DIR/$MODEL_NAME \ 297 | --dataset $DATASET \ 298 | --data_dir $DATA_DIR \ 299 | --seed $SEED \ 300 | --num_train_epochs 3 \ 301 | --learning_rate 3e-5 \ 302 | --logging_steps 50 \ 303 | --eval_steps 50 \ 304 | --save_steps 1000 \ 305 | --per_device_train_batch_size 4 \ 306 | --per_device_eval_batch_size 32 \ 307 | --gradient_accumulation_steps 8 \ 308 | --overwrite_output_dir \ 309 | --evaluate_during_training \ 310 | --fp16 \ 311 | --do_train \ 312 | --do_eval \ 313 | --do_lowercase \ 314 | --max_length 512 \ 315 | " 316 | 317 | 318 | ##### Longformer 319 | 320 | export SEED=42 321 | export DATASET=squad 322 | export MODEL_DIR=/workspace/models 323 | export MODEL_NAME_OR_PATH=allenai/longformer-base-4096 324 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 325 | export LOG_DIR=/workspace/logs 326 | export DATA_DIR=/workspace/data 327 | # Debugging 328 | CUDA_LAUNCH_BLOCKING=1 329 | # model args 330 | make repl run="scripts/finetune_qa_models.py \ 331 | --model_name_or_path $MODEL_NAME_OR_PATH \ 332 | --output_dir $MODEL_DIR/$MODEL_NAME \ 333 | --logging_dir $LOG_DIR/$MODEL_NAME \ 334 | --dataset $DATASET \ 335 | --data_dir $DATA_DIR \ 336 | --seed $SEED \ 337 | --num_train_epochs 3 \ 338 | --learning_rate 3e-5 \ 339 | --logging_steps 50 \ 340 | --eval_steps 50 \ 341 | --save_steps 1000 \ 342 | --per_device_train_batch_size 4 \ 343 | --per_device_eval_batch_size 32 \ 344 | --gradient_accumulation_steps 8 \ 345 | --overwrite_output_dir \ 346 | --evaluate_during_training \ 347 | --fp16 \ 348 | --do_train \ 349 | --do_eval \ 350 | --do_lowercase \ 351 | --max_length 512 \ 352 | " 353 | 354 | 355 | ##### RoBERTa-Long 356 | 357 | export SEED=42 358 | export DATASET=squad 359 | export MODEL_DIR=/workspace/models 360 | export MODEL_NAME_OR_PATH=$MODEL_DIR/roberta-base-long 361 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 362 | export LOG_DIR=/workspace/logs 363 | export DATA_DIR=/workspace/data 364 | # Debugging 365 | CUDA_LAUNCH_BLOCKING=1 366 | # model args 367 | make repl run="scripts/finetune_qa_models.py \ 368 | --model_name_or_path $MODEL_NAME_OR_PATH \ 369 | --output_dir $MODEL_DIR/$MODEL_NAME \ 370 | --logging_dir $LOG_DIR/$MODEL_NAME \ 371 | --dataset $DATASET \ 372 | --data_dir $DATA_DIR \ 373 | --seed $SEED \ 374 | --num_train_epochs 3 \ 375 | --learning_rate 3e-5 \ 376 | --logging_steps 50 \ 377 | --eval_steps 50 \ 378 | --save_steps 1000 \ 379 | --per_device_train_batch_size 4 \ 380 | --per_device_eval_batch_size 32 \ 381 | --gradient_accumulation_steps 8 \ 382 | --overwrite_output_dir \ 383 | --evaluate_during_training \ 384 | --fp16 \ 385 | --do_train \ 386 | --do_eval \ 387 | --do_lowercase \ 388 | --max_length 512 \ 389 | " 390 | 391 | 392 | ##### XLM-R 393 | 394 | export SEED=42 395 | export DATASET=squad 396 | export MODEL_DIR=/workspace/models 397 | export MODEL_NAME_OR_PATH=xlm-roberta-base 398 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 399 | export LOG_DIR=/workspace/logs 400 | export 
DATA_DIR=/workspace/data 401 | # Debugging 402 | CUDA_LAUNCH_BLOCKING=1 403 | # model args 404 | make repl run="scripts/finetune_qa_models.py \ 405 | --model_name_or_path $MODEL_NAME_OR_PATH \ 406 | --output_dir $MODEL_DIR/$MODEL_NAME \ 407 | --logging_dir $LOG_DIR/$MODEL_NAME \ 408 | --dataset $DATASET \ 409 | --data_dir $DATA_DIR \ 410 | --seed $SEED \ 411 | --num_train_epochs 3 \ 412 | --learning_rate 3e-5 \ 413 | --logging_steps 50 \ 414 | --eval_steps 50 \ 415 | --save_steps 1000 \ 416 | --per_device_train_batch_size 4 \ 417 | --per_device_eval_batch_size 32 \ 418 | --gradient_accumulation_steps 8 \ 419 | --overwrite_output_dir \ 420 | --evaluate_during_training \ 421 | --fp16 \ 422 | --do_train \ 423 | --do_eval \ 424 | --do_lowercase \ 425 | --max_length 512 \ 426 | " 427 | 428 | 429 | ##### XLM-Long 430 | 431 | export SEED=42 432 | export DATASET=squad 433 | export MODEL_DIR=/workspace/models 434 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long 435 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 436 | export LOG_DIR=/workspace/logs 437 | export DATA_DIR=/workspace/data 438 | # Debugging 439 | CUDA_LAUNCH_BLOCKING=1 440 | # model args 441 | make repl run="scripts/finetune_qa_models.py \ 442 | --model_name_or_path $MODEL_NAME_OR_PATH \ 443 | --output_dir $MODEL_DIR/$MODEL_NAME \ 444 | --logging_dir $LOG_DIR/$MODEL_NAME \ 445 | --dataset $DATASET \ 446 | --data_dir $DATA_DIR \ 447 | --seed $SEED \ 448 | --num_train_epochs 3 \ 449 | --learning_rate 3e-5 \ 450 | --logging_steps 50 \ 451 | --eval_steps 50 \ 452 | --save_steps 1000 \ 453 | --per_device_train_batch_size 4 \ 454 | --per_device_eval_batch_size 32 \ 455 | --gradient_accumulation_steps 8 \ 456 | --overwrite_output_dir \ 457 | --evaluate_during_training \ 458 | --fp16 \ 459 | --do_train \ 460 | --do_eval \ 461 | --do_lowercase \ 462 | --max_length 512 \ 463 | " 464 | 465 | 466 |
</details>
467 |
468 | 469 |
<details><summary>SQ3 (512)</summary> 470 |

471 | 472 | ##### RoBERTa 473 | 474 | export SEED=42 475 | export MAX_LENGTH=512 476 | export NR_CONCATS=1 477 | export DATASET=squad_long 478 | export MODEL_DIR=/workspace/models 479 | export MODEL_NAME_OR_PATH=roberta-base 480 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 481 | export LOG_DIR=/workspace/logs 482 | export DATA_DIR=/workspace/data 483 | # Debugging 484 | CUDA_LAUNCH_BLOCKING=1 485 | # model args 486 | make repl run="scripts/finetune_qa_models.py \ 487 | --model_name_or_path $MODEL_NAME_OR_PATH \ 488 | --output_dir $MODEL_DIR/$MODEL_NAME \ 489 | --logging_dir $LOG_DIR/$MODEL_NAME \ 490 | --dataset $DATASET \ 491 | --data_dir $DATA_DIR \ 492 | --seed $SEED \ 493 | --num_train_epochs 3 \ 494 | --learning_rate 3e-5 \ 495 | --logging_steps 50 \ 496 | --eval_steps 50 \ 497 | --save_steps 1000 \ 498 | --per_device_train_batch_size 4 \ 499 | --per_device_eval_batch_size 32 \ 500 | --gradient_accumulation_steps 8 \ 501 | --overwrite_output_dir \ 502 | --evaluate_during_training \ 503 | --fp16 \ 504 | --do_train \ 505 | --do_eval \ 506 | --do_lowercase \ 507 | --nr_concats $NR_CONCATS \ 508 | --max_length $MAX_LENGTH \ 509 | " 510 | 511 | ##### Longformer 512 | 513 | export SEED=42 514 | export MAX_LENGTH=512 515 | export NR_CONCATS=1 516 | export DATASET=squad_long 517 | export MODEL_DIR=/workspace/models 518 | export MODEL_NAME_OR_PATH=allenai/longformer-base-4096 519 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 520 | export LOG_DIR=/workspace/logs 521 | export DATA_DIR=/workspace/data 522 | # Debugging 523 | CUDA_LAUNCH_BLOCKING=1 524 | # model args 525 | make repl run="scripts/finetune_qa_models.py \ 526 | --model_name_or_path $MODEL_NAME_OR_PATH \ 527 | --output_dir $MODEL_DIR/$MODEL_NAME \ 528 | --logging_dir $LOG_DIR/$MODEL_NAME \ 529 | --dataset $DATASET \ 530 | --data_dir $DATA_DIR \ 531 | --seed $SEED \ 532 | --num_train_epochs 3 \ 533 | --learning_rate 3e-5 \ 534 | --logging_steps 50 \ 535 | --eval_steps 50 \ 536 | --save_steps 1000 \ 537 | --per_device_train_batch_size 4 \ 538 | --per_device_eval_batch_size 32 \ 539 | --gradient_accumulation_steps 8 \ 540 | --overwrite_output_dir \ 541 | --evaluate_during_training \ 542 | --fp16 \ 543 | --do_train \ 544 | --do_eval \ 545 | --do_lowercase \ 546 | --nr_concats $NR_CONCATS \ 547 | --max_length $MAX_LENGTH \ 548 | " 549 | 550 | 551 | ##### RoBERTa-Long 552 | 553 | export SEED=42 554 | export MAX_LENGTH=512 555 | export NR_CONCATS=1 556 | export DATASET=squad_long 557 | export MODEL_DIR=/workspace/models 558 | export MODEL_NAME_OR_PATH=$MODEL_DIR/roberta-base-long 559 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 560 | export LOG_DIR=/workspace/logs 561 | export DATA_DIR=/workspace/data 562 | # Debugging 563 | CUDA_LAUNCH_BLOCKING=1 564 | # model args 565 | make repl run="scripts/finetune_qa_models.py \ 566 | --model_name_or_path $MODEL_NAME_OR_PATH \ 567 | --output_dir $MODEL_DIR/$MODEL_NAME \ 568 | --logging_dir $LOG_DIR/$MODEL_NAME \ 569 | --dataset $DATASET \ 570 | --data_dir $DATA_DIR \ 571 | --seed $SEED \ 572 | --num_train_epochs 3 \ 573 | --learning_rate 3e-5 \ 574 | --logging_steps 50 \ 575 | --eval_steps 50 \ 576 | --save_steps 1000 \ 577 | --per_device_train_batch_size 4 \ 578 | --per_device_eval_batch_size 32 \ 579 | --gradient_accumulation_steps 8 \ 580 | --overwrite_output_dir \ 581 | --evaluate_during_training \ 582 | --fp16 \ 583 | --do_train \ 584 | --do_eval \ 585 | --do_lowercase \ 586 | --nr_concats $NR_CONCATS \ 587 | --max_length $MAX_LENGTH \ 588 | " 
589 | 590 | ##### XLM-R 591 | 592 | export SEED=42 593 | export MAX_LENGTH=512 594 | export NR_CONCATS=1 595 | export DATASET=squad_long 596 | export MODEL_DIR=/workspace/models 597 | export MODEL_NAME_OR_PATH=xlm-roberta-base 598 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 599 | export LOG_DIR=/workspace/logs 600 | export DATA_DIR=/workspace/data 601 | # Debugging 602 | CUDA_LAUNCH_BLOCKING=1 603 | # model args 604 | make repl run="scripts/finetune_qa_models.py \ 605 | --model_name_or_path $MODEL_NAME_OR_PATH \ 606 | --output_dir $MODEL_DIR/$MODEL_NAME \ 607 | --logging_dir $LOG_DIR/$MODEL_NAME \ 608 | --dataset $DATASET \ 609 | --data_dir $DATA_DIR \ 610 | --seed $SEED \ 611 | --num_train_epochs 3 \ 612 | --learning_rate 3e-5 \ 613 | --logging_steps 50 \ 614 | --eval_steps 50 \ 615 | --save_steps 1000 \ 616 | --per_device_train_batch_size 4 \ 617 | --per_device_eval_batch_size 32 \ 618 | --gradient_accumulation_steps 8 \ 619 | --overwrite_output_dir \ 620 | --evaluate_during_training \ 621 | --fp16 \ 622 | --do_train \ 623 | --do_eval \ 624 | --do_lowercase \ 625 | --nr_concats $NR_CONCATS \ 626 | --max_length $MAX_LENGTH \ 627 | " 628 | 629 | ##### XLM-Long 630 | 631 | export SEED=42 632 | export MAX_LENGTH=512 633 | export NR_CONCATS=1 634 | export DATASET=squad_long 635 | export MODEL_DIR=/workspace/models 636 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long 637 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 638 | export LOG_DIR=/workspace/logs 639 | export DATA_DIR=/workspace/data 640 | # Debugging 641 | CUDA_LAUNCH_BLOCKING=1 642 | # model args 643 | make repl run="scripts/finetune_qa_models.py \ 644 | --model_name_or_path $MODEL_NAME_OR_PATH \ 645 | --output_dir $MODEL_DIR/$MODEL_NAME \ 646 | --logging_dir $LOG_DIR/$MODEL_NAME \ 647 | --dataset $DATASET \ 648 | --data_dir $DATA_DIR \ 649 | --seed $SEED \ 650 | --num_train_epochs 3 \ 651 | --learning_rate 3e-5 \ 652 | --logging_steps 50 \ 653 | --eval_steps 50 \ 654 | --save_steps 1000 \ 655 | --per_device_train_batch_size 4 \ 656 | --per_device_eval_batch_size 32 \ 657 | --gradient_accumulation_steps 8 \ 658 | --overwrite_output_dir \ 659 | --evaluate_during_training \ 660 | --fp16 \ 661 | --do_train \ 662 | --do_eval \ 663 | --do_lowercase \ 664 | --nr_concats $NR_CONCATS \ 665 | --max_length $MAX_LENGTH \ 666 | " 667 | 668 | 669 |
</details>
670 |
671 | 672 |
<details><summary>SQ3 (2048)</summary> 673 |

674 | 675 | ##### Longformer 676 | 677 | export SEED=42 678 | export MAX_LENGTH=2048 679 | export NR_CONCATS=3 680 | export DATASET=squad_long 681 | export MODEL_DIR=/workspace/models 682 | export MODEL_NAME_OR_PATH=allenai/longformer-base-4096 683 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 684 | export LOG_DIR=/workspace/logs 685 | export DATA_DIR=/workspace/data 686 | # Debugging 687 | CUDA_LAUNCH_BLOCKING=1 688 | # model args 689 | make repl run="scripts/finetune_qa_models.py \ 690 | --model_name_or_path $MODEL_NAME_OR_PATH \ 691 | --output_dir $MODEL_DIR/$MODEL_NAME \ 692 | --logging_dir $LOG_DIR/$MODEL_NAME \ 693 | --dataset $DATASET \ 694 | --data_dir $DATA_DIR \ 695 | --seed $SEED \ 696 | --num_train_epochs 3 \ 697 | --learning_rate 3e-5 \ 698 | --logging_steps 50 \ 699 | --eval_steps 50 \ 700 | --save_steps 1000 \ 701 | --per_device_train_batch_size 1 \ 702 | --per_device_eval_batch_size 32 \ 703 | --gradient_accumulation_steps 32 \ 704 | --overwrite_output_dir \ 705 | --evaluate_during_training \ 706 | --fp16 \ 707 | --do_train \ 708 | --do_eval \ 709 | --do_lowercase \ 710 | --nr_concats $NR_CONCATS \ 711 | --max_length $MAX_LENGTH \ 712 | " 713 | 714 | 715 | ##### RoBERTa-Long 716 | 717 | export SEED=42 718 | export MAX_LENGTH=2048 719 | export NR_CONCATS=3 720 | export DATASET=squad_long 721 | export MODEL_DIR=/workspace/models 722 | export MODEL_NAME_OR_PATH=$MODEL_DIR/roberta-base-long 723 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 724 | export LOG_DIR=/workspace/logs 725 | export DATA_DIR=/workspace/data 726 | # Debugging 727 | CUDA_LAUNCH_BLOCKING=1 728 | # model args 729 | make repl run="scripts/finetune_qa_models.py \ 730 | --model_name_or_path $MODEL_NAME_OR_PATH \ 731 | --output_dir $MODEL_DIR/$MODEL_NAME \ 732 | --logging_dir $LOG_DIR/$MODEL_NAME \ 733 | --dataset $DATASET \ 734 | --data_dir $DATA_DIR \ 735 | --seed $SEED \ 736 | --num_train_epochs 3 \ 737 | --learning_rate 3e-5 \ 738 | --logging_steps 50 \ 739 | --eval_steps 50 \ 740 | --save_steps 1000 \ 741 | --per_device_train_batch_size 1 \ 742 | --per_device_eval_batch_size 32 \ 743 | --gradient_accumulation_steps 32 \ 744 | --overwrite_output_dir \ 745 | --evaluate_during_training \ 746 | --fp16 \ 747 | --do_train \ 748 | --do_eval \ 749 | --do_lowercase \ 750 | --nr_concats $NR_CONCATS \ 751 | --max_length $MAX_LENGTH \ 752 | " 753 | 754 | 755 | ##### XLM-Long 756 | 757 | export SEED=42 758 | export MAX_LENGTH=2048 759 | export NR_CONCATS=3 760 | export DATASET=squad_long 761 | export MODEL_DIR=/workspace/models 762 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long 763 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 764 | export LOG_DIR=/workspace/logs 765 | export DATA_DIR=/workspace/data 766 | # Debugging 767 | CUDA_LAUNCH_BLOCKING=1 768 | # model args 769 | make repl run="scripts/finetune_qa_models.py \ 770 | --model_name_or_path $MODEL_NAME_OR_PATH \ 771 | --output_dir $MODEL_DIR/$MODEL_NAME \ 772 | --logging_dir $LOG_DIR/$MODEL_NAME \ 773 | --dataset $DATASET \ 774 | --data_dir $DATA_DIR \ 775 | --seed $SEED \ 776 | --num_train_epochs 3 \ 777 | --learning_rate 3e-5 \ 778 | --logging_steps 50 \ 779 | --eval_steps 50 \ 780 | --save_steps 1000 \ 781 | --per_device_train_batch_size 1 \ 782 | --per_device_eval_batch_size 32 \ 783 | --gradient_accumulation_steps 32 \ 784 | --overwrite_output_dir \ 785 | --evaluate_during_training \ 786 | --fp16 \ 787 | --do_train \ 788 | --do_eval \ 789 | --do_lowercase \ 790 | --nr_concats $NR_CONCATS \ 791 | 
--max_length $MAX_LENGTH \ 792 | " 793 | 794 | 795 |
</details>
796 |
797 | 798 |
<details><summary>TODO TriviaQA (4096)</summary> 799 |

800 |
</details>
801 |
802 | 803 | ### Multilingual 804 |
<details><summary>XQuAD</summary> 805 |

806 | 807 | ##### RoBERTa 808 | 809 | export SEED=42 810 | export DATASET=xquad 811 | export MODEL_DIR=/workspace/models 812 | export MODEL_NAME_OR_PATH=roberta-base 813 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 814 | export LOG_DIR=/workspace/logs 815 | export DATA_DIR=/workspace/data 816 | # Debugging 817 | CUDA_LAUNCH_BLOCKING=1 818 | # model args 819 | make repl run="scripts/finetune_qa_models.py \ 820 | --model_name_or_path $MODEL_NAME_OR_PATH \ 821 | --output_dir $MODEL_DIR/$MODEL_NAME \ 822 | --logging_dir $LOG_DIR/$MODEL_NAME \ 823 | --dataset $DATASET \ 824 | --data_dir $DATA_DIR \ 825 | --seed $SEED \ 826 | --num_train_epochs 3 \ 827 | --learning_rate 3e-5 \ 828 | --logging_steps 50 \ 829 | --eval_steps 50 \ 830 | --save_steps 1000 \ 831 | --per_device_train_batch_size 4 \ 832 | --per_device_eval_batch_size 32 \ 833 | --gradient_accumulation_steps 8 \ 834 | --overwrite_output_dir \ 835 | --evaluate_during_training \ 836 | --fp16 \ 837 | --do_train \ 838 | --do_eval \ 839 | --do_lowercase \ 840 | --max_length 512 \ 841 | " 842 | 843 | ##### XLM-R 844 | 845 | export SEED=42 846 | export DATASET=xquad 847 | export MODEL_DIR=/workspace/models 848 | export MODEL_NAME_OR_PATH=xlm-roberta-base 849 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 850 | export LOG_DIR=/workspace/logs 851 | export DATA_DIR=/workspace/data 852 | # Debugging 853 | CUDA_LAUNCH_BLOCKING=1 854 | # model args 855 | make repl run="scripts/finetune_qa_models.py \ 856 | --model_name_or_path $MODEL_NAME_OR_PATH \ 857 | --output_dir $MODEL_DIR/$MODEL_NAME \ 858 | --logging_dir $LOG_DIR/$MODEL_NAME \ 859 | --dataset $DATASET \ 860 | --data_dir $DATA_DIR \ 861 | --seed $SEED \ 862 | --num_train_epochs 3 \ 863 | --learning_rate 3e-5 \ 864 | --logging_steps 50 \ 865 | --eval_steps 50 \ 866 | --save_steps 1000 \ 867 | --per_device_train_batch_size 4 \ 868 | --per_device_eval_batch_size 32 \ 869 | --gradient_accumulation_steps 8 \ 870 | --overwrite_output_dir \ 871 | --evaluate_during_training \ 872 | --fp16 \ 873 | --do_train \ 874 | --do_eval \ 875 | --do_lowercase \ 876 | --max_length 512 \ 877 | " 878 | 879 | ##### XLM-Long 880 | 881 | export SEED=42 882 | export DATASET=xquad 883 | export MODEL_DIR=/workspace/models 884 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long 885 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 886 | export LOG_DIR=/workspace/logs 887 | export DATA_DIR=/workspace/data 888 | # Debugging 889 | CUDA_LAUNCH_BLOCKING=1 890 | # model args 891 | make repl run="scripts/finetune_qa_models.py \ 892 | --model_name_or_path $MODEL_NAME_OR_PATH \ 893 | --output_dir $MODEL_DIR/$MODEL_NAME \ 894 | --logging_dir $LOG_DIR/$MODEL_NAME \ 895 | --dataset $DATASET \ 896 | --data_dir $DATA_DIR \ 897 | --seed $SEED \ 898 | --num_train_epochs 3 \ 899 | --learning_rate 3e-5 \ 900 | --logging_steps 50 \ 901 | --eval_steps 50 \ 902 | --save_steps 1000 \ 903 | --per_device_train_batch_size 4 \ 904 | --per_device_eval_batch_size 32 \ 905 | --gradient_accumulation_steps 8 \ 906 | --overwrite_output_dir \ 907 | --evaluate_during_training \ 908 | --fp16 \ 909 | --do_train \ 910 | --do_eval \ 911 | --do_lowercase \ 912 | --max_length 512 \ 913 | " 914 | 915 | 916 |
</details>
917 |
918 | 919 |
<details><summary>XQ3 (512)</summary> 920 |

921 | 922 | ##### XLM-R 923 | 924 | export SEED=42 925 | export MAX_LENGTH=512 926 | export NR_CONCATS=1 927 | export DATASET=xquad_long 928 | export MODEL_DIR=/workspace/models 929 | export MODEL_NAME_OR_PATH=xlm-roberta-base 930 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 931 | export LOG_DIR=/workspace/logs 932 | export DATA_DIR=/workspace/data 933 | # Debugging 934 | CUDA_LAUNCH_BLOCKING=1 935 | # model args 936 | make repl run="scripts/finetune_qa_models.py \ 937 | --model_name_or_path $MODEL_NAME_OR_PATH \ 938 | --output_dir $MODEL_DIR/$MODEL_NAME \ 939 | --logging_dir $LOG_DIR/$MODEL_NAME \ 940 | --dataset $DATASET \ 941 | --data_dir $DATA_DIR \ 942 | --seed $SEED \ 943 | --num_train_epochs 3 \ 944 | --learning_rate 3e-5 \ 945 | --logging_steps 50 \ 946 | --eval_steps 50 \ 947 | --save_steps 1000 \ 948 | --per_device_train_batch_size 4 \ 949 | --per_device_eval_batch_size 32 \ 950 | --gradient_accumulation_steps 8 \ 951 | --overwrite_output_dir \ 952 | --evaluate_during_training \ 953 | --fp16 \ 954 | --do_train \ 955 | --do_eval \ 956 | --do_lowercase \ 957 | --nr_concats $NR_CONCATS \ 958 | --max_length $MAX_LENGTH \ 959 | " 960 | 961 | 962 | ##### XLM-Long 963 | 964 | export SEED=42 965 | export MAX_LENGTH=512 966 | export NR_CONCATS=1 967 | export DATASET=xquad_long 968 | export MODEL_DIR=/workspace/models 969 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long 970 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 971 | export LOG_DIR=/workspace/logs 972 | export DATA_DIR=/workspace/data 973 | # Debugging 974 | CUDA_LAUNCH_BLOCKING=1 975 | # model args 976 | make repl run="scripts/finetune_qa_models.py \ 977 | --model_name_or_path $MODEL_NAME_OR_PATH \ 978 | --output_dir $MODEL_DIR/$MODEL_NAME \ 979 | --logging_dir $LOG_DIR/$MODEL_NAME \ 980 | --dataset $DATASET \ 981 | --data_dir $DATA_DIR \ 982 | --seed $SEED \ 983 | --num_train_epochs 3 \ 984 | --learning_rate 3e-5 \ 985 | --logging_steps 50 \ 986 | --eval_steps 50 \ 987 | --save_steps 1000 \ 988 | --per_device_train_batch_size 4 \ 989 | --per_device_eval_batch_size 32 \ 990 | --gradient_accumulation_steps 8 \ 991 | --overwrite_output_dir \ 992 | --evaluate_during_training \ 993 | --fp16 \ 994 | --do_train \ 995 | --do_eval \ 996 | --do_lowercase \ 997 | --nr_concats $NR_CONCATS \ 998 | --max_length $MAX_LENGTH \ 999 | " 1000 | 1001 | 1002 |
</details>
1003 |
1004 | 1005 |
<details><summary>XQ3 (2048)</summary> 1006 |

1007 | 1008 | ##### XLM-Long 1009 | 1010 | export SEED=42 1011 | export MAX_LENGTH=2048 1012 | export NR_CONCATS=3 1013 | export DATASET=xquad_long 1014 | export MODEL_DIR=/workspace/models 1015 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long 1016 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 1017 | export LOG_DIR=/workspace/logs 1018 | export DATA_DIR=/workspace/data 1019 | # Debugging 1020 | CUDA_LAUNCH_BLOCKING=1 1021 | # model args 1022 | make repl run="scripts/finetune_qa_models.py \ 1023 | --model_name_or_path $MODEL_NAME_OR_PATH \ 1024 | --output_dir $MODEL_DIR/$MODEL_NAME \ 1025 | --logging_dir $LOG_DIR/$MODEL_NAME \ 1026 | --dataset $DATASET \ 1027 | --data_dir $DATA_DIR \ 1028 | --seed $SEED \ 1029 | --num_train_epochs 3 \ 1030 | --learning_rate 3e-5 \ 1031 | --logging_steps 50 \ 1032 | --eval_steps 50 \ 1033 | --save_steps 1000 \ 1034 | --per_device_train_batch_size 1 \ 1035 | --per_device_eval_batch_size 32 \ 1036 | --gradient_accumulation_steps 32 \ 1037 | --overwrite_output_dir \ 1038 | --evaluate_during_training \ 1039 | --fp16 \ 1040 | --do_train \ 1041 | --do_eval \ 1042 | --do_lowercase \ 1043 | --nr_concats $NR_CONCATS \ 1044 | --max_length $MAX_LENGTH \ 1045 | " 1046 | 1047 | 1048 |
</details>
1049 |
1050 | 1051 | 1052 |
<details><summary>MLQA</summary> 1053 |

1054 | 1055 | ##### XLM-R 1056 | 1057 | export SEED=42 1058 | export DATASET=mlqa 1059 | export MODEL_DIR=/workspace/models 1060 | export MODEL_NAME_OR_PATH=xlm-roberta-base 1061 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 1062 | export LOG_DIR=/workspace/logs 1063 | export DATA_DIR=/workspace/data 1064 | # Debugging 1065 | CUDA_LAUNCH_BLOCKING=1 1066 | # model args 1067 | make repl run="scripts/finetune_qa_models.py \ 1068 | --model_name_or_path $MODEL_NAME_OR_PATH \ 1069 | --output_dir $MODEL_DIR/$MODEL_NAME \ 1070 | --logging_dir $LOG_DIR/$MODEL_NAME \ 1071 | --dataset $DATASET \ 1072 | --data_dir $DATA_DIR \ 1073 | --seed $SEED \ 1074 | --num_train_epochs 3 \ 1075 | --learning_rate 3e-5 \ 1076 | --logging_steps 50 \ 1077 | --eval_steps 50 \ 1078 | --save_steps 1000 \ 1079 | --per_device_train_batch_size 4 \ 1080 | --per_device_eval_batch_size 32 \ 1081 | --gradient_accumulation_steps 8 \ 1082 | --overwrite_output_dir \ 1083 | --evaluate_during_training \ 1084 | --fp16 \ 1085 | --do_train \ 1086 | --do_eval \ 1087 | --do_lowercase \ 1088 | --max_length 512 \ 1089 | " 1090 | 1091 | 1092 | ##### XLM-Long 1093 | 1094 | export SEED=42 1095 | export DATASET=mlqa 1096 | export MODEL_DIR=/workspace/models 1097 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long 1098 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET 1099 | export LOG_DIR=/workspace/logs 1100 | export DATA_DIR=/workspace/data 1101 | # Debugging 1102 | CUDA_LAUNCH_BLOCKING=1 1103 | # model args 1104 | make repl run="scripts/finetune_qa_models.py \ 1105 | --model_name_or_path $MODEL_NAME_OR_PATH \ 1106 | --output_dir $MODEL_DIR/$MODEL_NAME \ 1107 | --logging_dir $LOG_DIR/$MODEL_NAME \ 1108 | --dataset $DATASET \ 1109 | --data_dir $DATA_DIR \ 1110 | --seed $SEED \ 1111 | --num_train_epochs 3 \ 1112 | --learning_rate 3e-5 \ 1113 | --logging_steps 50 \ 1114 | --eval_steps 50 \ 1115 | --save_steps 1000 \ 1116 | --per_device_train_batch_size 4 \ 1117 | --per_device_eval_batch_size 32 \ 1118 | --gradient_accumulation_steps 8 \ 1119 | --overwrite_output_dir \ 1120 | --evaluate_during_training \ 1121 | --fp16 \ 1122 | --do_train \ 1123 | --do_eval \ 1124 | --do_lowercase \ 1125 | --max_length 512 \ 1126 | " 1127 | 1128 | 1129 | 1130 | 1131 |
</details>
1132 |
1133 | 1134 | 1135 |
</details>
1136 |
1137 | 1138 | ## Acknowledgment 1139 | Many thanks to the [Longformer Authors](https://github.com/allenai/longformer) for providing reproducible training scripts and Huggingface for open-sourcing their models and frameworks. I would like to thank my supervisor at Peltarion Philipp Eisen for his invaluable feedback, insight and availability. Thank you Professor Joakim Nivre for insightful and thorough feedback and for taking the time out of your busy schedule. A massive thank you to all the wonderful people at Peltarion for the opportunity to work on such an interesting project. 1140 | 1141 | ## Citation 1142 | You can read the report [here](http://www.diva-portal.org/smash/get/diva2:1545786/FULLTEXT02.pdf) 1143 | ``` 1144 | @mastersthesis{Sagen1545786, 1145 | author = {Sagen, Markus}, 1146 | institution = {Uppsala University, Department of Information Technology}, 1147 | pages = {45}, 1148 | school = {Uppsala University, Department of Information Technology}, 1149 | title = {Large-Context Question Answering with Cross-Lingual Transfer}, 1150 | series = {UPTEC IT}, 1151 | ISSN = {1401-5749}, 1152 | number = {21003}, 1153 | year = {2021} 1154 | } 1155 | ``` 1156 | 1157 | 1158 | ## Contact 1159 | 1160 | > The model weights and config for the XLM-Long are available [at Huggingface](https://huggingface.co/markussagen/xlm-roberta-longformer-base-4096). 1161 | > Import as model_name: `markussagen/xlm-roberta-longformer-base-4096` 1162 | 1163 | For questions regarding the code or the master thesis in general add an issue in the repo or contact: 1164 | [markus.john.sagen@gmail.com](mailto:markus.john.sagen@gmail.com) 1165 | 1166 | ## TODO 1167 | - Include plots and table for the evaluations 1168 | - Create bash scripts to fine-tune models on all seeds. Just send in model name 1169 | -------------------------------------------------------------------------------- /notebooks/Try Train Longformer SQuAD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m W&B installed but not logged in. 
Run `wandb login` or set the WANDB_API_KEY env variable.\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import torch\n", 32 | "import datasets as nlp\n", 33 | "from transformers import LongformerTokenizerFast" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "application/vnd.jupyter.widget-view+json": { 44 | "model_id": "63522ed3effe4cba996db1224652e8a0", 45 | "version_major": 2, 46 | "version_minor": 0 47 | }, 48 | "text/plain": [ 49 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…" 50 | ] 51 | }, 52 | "metadata": {}, 53 | "output_type": "display_data" 54 | }, 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "\n" 60 | ] 61 | }, 62 | { 63 | "data": { 64 | "application/vnd.jupyter.widget-view+json": { 65 | "model_id": "1b86554d789b47ffba0ea4b4bd7bd6ac", 66 | "version_major": 2, 67 | "version_minor": 0 68 | }, 69 | "text/plain": [ 70 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…" 71 | ] 72 | }, 73 | "metadata": {}, 74 | "output_type": "display_data" 75 | }, 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "def get_correct_alignement(context, answer):\n", 95 | " \"\"\" Some original examples in SQuAD have indices wrong by 1 or 2 character. We test and fix this here. \"\"\"\n", 96 | " gold_text = answer['text'][0]\n", 97 | " start_idx = answer['answer_start'][0]\n", 98 | " end_idx = start_idx + len(gold_text)\n", 99 | " if context[start_idx:end_idx] == gold_text:\n", 100 | " return start_idx, end_idx # When the gold label position is good\n", 101 | " elif context[start_idx-1:end_idx-1] == gold_text:\n", 102 | " return start_idx-1, end_idx-1 # When the gold label is off by one character\n", 103 | " elif context[start_idx-2:end_idx-2] == gold_text:\n", 104 | " return start_idx-2, end_idx-2 # When the gold label is off by two character\n", 105 | " else:\n", 106 | " raise ValueError()\n", 107 | "\n", 108 | "# Tokenize our training dataset\n", 109 | "def convert_to_features(example):\n", 110 | " # Tokenize contexts and questions (as pairs of inputs)\n", 111 | " encodings = tokenizer.encode_plus(example['question'], example['context'], pad_to_max_length=True, max_length=512, truncation=True)\n", 112 | " context_encodings = tokenizer.encode_plus(example['context'])\n", 113 | " \n", 114 | "\n", 115 | " # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methodes.\n", 116 | " # this will give us the position of answer span in the context text\n", 117 | " start_idx, end_idx = get_correct_alignement(example['context'], example['answers'])\n", 118 | " start_positions_context = context_encodings.char_to_token(start_idx)\n", 119 | " end_positions_context = context_encodings.char_to_token(end_idx-1)\n", 120 | "\n", 121 | " # here we will compute the start and end position of the answer in the whole example\n", 122 | " # as the example is encoded like this question context\n", 123 | " # and we know the postion of the answer in the context\n", 124 | " # we can just find out the index of the sep token and then add that to 
position + 1 (+1 because there are two sep tokens)\n", 125 | " # this will give us the position of the answer span in whole example \n", 126 | " sep_idx = encodings['input_ids'].index(tokenizer.sep_token_id)\n", 127 | " start_positions = start_positions_context + sep_idx + 1\n", 128 | " end_positions = end_positions_context + sep_idx + 1\n", 129 | "\n", 130 | " if end_positions > 512:\n", 131 | " start_positions, end_positions = 0, 0\n", 132 | "\n", 133 | " encodings.update({'start_positions': start_positions,\n", 134 | " 'end_positions': end_positions,\n", 135 | " 'attention_mask': encodings['attention_mask']})\n", 136 | " return encodings" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "application/vnd.jupyter.widget-view+json": { 147 | "model_id": "ca4e9622c3254a46b42130e280594080", 148 | "version_major": 2, 149 | "version_minor": 0 150 | }, 151 | "text/plain": [ 152 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2043.0, style=ProgressStyle(description…" 153 | ] 154 | }, 155 | "metadata": {}, 156 | "output_type": "display_data" 157 | }, 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "\n" 163 | ] 164 | }, 165 | { 166 | "data": { 167 | "application/vnd.jupyter.widget-view+json": { 168 | "model_id": "36d24978dfd84650b485d2ebbddaa6c3", 169 | "version_major": 2, 170 | "version_minor": 0 171 | }, 172 | "text/plain": [ 173 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=986.0, style=ProgressStyle(description_…" 174 | ] 175 | }, 176 | "metadata": {}, 177 | "output_type": "display_data" 178 | }, 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "\n", 184 | "Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.75 MiB, post-processed: Unknown size, total: 119.27 MiB) to /.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41...\n" 185 | ] 186 | }, 187 | { 188 | "data": { 189 | "application/vnd.jupyter.widget-view+json": { 190 | "model_id": "f94edb441c714bce84e7bb0a574c6823", 191 | "version_major": 2, 192 | "version_minor": 0 193 | }, 194 | "text/plain": [ 195 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=8116577.0, style=ProgressStyle(descript…" 196 | ] 197 | }, 198 | "metadata": {}, 199 | "output_type": "display_data" 200 | }, 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "\n" 206 | ] 207 | }, 208 | { 209 | "data": { 210 | "application/vnd.jupyter.widget-view+json": { 211 | "model_id": "351a3e7b4fc241f1a4e983f2a14767a0", 212 | "version_major": 2, 213 | "version_minor": 0 214 | }, 215 | "text/plain": [ 216 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1054280.0, style=ProgressStyle(descript…" 217 | ] 218 | }, 219 | "metadata": {}, 220 | "output_type": "display_data" 221 | }, 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "\n" 227 | ] 228 | }, 229 | { 230 | "data": { 231 | "application/vnd.jupyter.widget-view+json": { 232 | "model_id": "5102dc040b554a53b433aafa90d8f75d", 233 | "version_major": 2, 234 | "version_minor": 0 235 | }, 236 | "text/plain": [ 237 | "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))" 238 | ] 239 | }, 240 | "metadata": {}, 241 | "output_type": "display_data" 242 | }, 243 | { 244 | "name": 
"stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "\r" 248 | ] 249 | }, 250 | { 251 | "data": { 252 | "application/vnd.jupyter.widget-view+json": { 253 | "model_id": "bf4ce91a4e0c4b379f49a431155bb052", 254 | "version_major": 2, 255 | "version_minor": 0 256 | }, 257 | "text/plain": [ 258 | "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))" 259 | ] 260 | }, 261 | "metadata": {}, 262 | "output_type": "display_data" 263 | }, 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "Dataset squad downloaded and prepared to /.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41. Subsequent calls will reuse this data.\n" 269 | ] 270 | }, 271 | { 272 | "name": "stderr", 273 | "output_type": "stream", 274 | "text": [ 275 | "Reusing dataset squad (/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)\n", 276 | "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:1773: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", 277 | " FutureWarning,\n" 278 | ] 279 | }, 280 | { 281 | "data": { 282 | "application/vnd.jupyter.widget-view+json": { 283 | "model_id": "cdb0696df27646d2a7293feb6235a180", 284 | "version_major": 2, 285 | "version_minor": 0 286 | }, 287 | "text/plain": [ 288 | "HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))" 289 | ] 290 | }, 291 | "metadata": {}, 292 | "output_type": "display_data" 293 | }, 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "\n" 299 | ] 300 | }, 301 | { 302 | "data": { 303 | "application/vnd.jupyter.widget-view+json": { 304 | "model_id": "cefbdc9d6e184b26940367884b1ffcc4", 305 | "version_major": 2, 306 | "version_minor": 0 307 | }, 308 | "text/plain": [ 309 | "HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))" 310 | ] 311 | }, 312 | "metadata": {}, 313 | "output_type": "display_data" 314 | }, 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "\n" 320 | ] 321 | } 322 | ], 323 | "source": [ 324 | "# load train and validation split of squad\n", 325 | "train_dataset = nlp.load_dataset('squad', split='train')\n", 326 | "valid_dataset = nlp.load_dataset('squad', split='validation')\n", 327 | "\n", 328 | "# Temp. 
226 |  {
227 |   "cell_type": "code",
228 |   "execution_count": 5,
229 |   "metadata": {},
230 |   "outputs": [
231 |    {
232 |     "data": {
233 |      "text/plain": [
234 |       "(3, 3)"
235 |      ]
236 |     },
237 |     "execution_count": 5,
238 |     "metadata": {},
239 |     "output_type": "execute_result"
240 |    }
241 |   ],
242 |   "source": [
243 |    "len(train_dataset), len(valid_dataset)"
244 |   ]
245 |  },
246 |  {
247 |   "cell_type": "code",
248 |   "execution_count": 7,
249 |   "metadata": {},
250 |   "outputs": [],
251 |   "source": [
252 |    "t = torch.load('train_data.pt')"
253 |   ]
254 |  },
255 |  {
256 |   "cell_type": "code",
257 |   "execution_count": 8,
258 |   "metadata": {},
259 |   "outputs": [],
260 |   "source": [
261 |    "# Write training script"
262 |   ]
263 |  },
264 |  {
265 |   "cell_type": "code",
266 |   "execution_count": 11,
267 |   "metadata": {},
268 |   "outputs": [],
269 |   "source": [
270 |    "import json\n",
271 |    "\n",
272 |    "args_dict = {\n",
273 |    "    \"n_gpu\": 1,\n",
274 |    "    \"model_name_or_path\": 'allenai/longformer-base-4096',\n",
275 |    "    \"max_len\": 512,\n",
276 |    "    \"output_dir\": './models',\n",
277 |    "    \"overwrite_output_dir\": True,\n",
278 |    "    \"per_gpu_train_batch_size\": 8,\n",
279 |    "    \"per_gpu_eval_batch_size\": 8,\n",
280 |    "    \"gradient_accumulation_steps\": 16,\n",
281 |    "    \"learning_rate\": 1e-4,\n",
282 |    "    \"num_train_epochs\": 3,\n",
283 |    "    \"do_train\": True\n",
284 |    "}"
285 |   ]
286 |  },
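287 |  {
288 |   "cell_type": "markdown",
289 |   "metadata": {},
290 |   "source": [
291 |    "**Editor's note (added):** one plausible way to feed `args_dict` into the Hugging Face `Trainer` API. The key-to-argument mapping below is an assumption, not the training script this notebook planned to write; keys such as `n_gpu`, `max_len` and `do_train` have no one-to-one `TrainingArguments` counterpart."
292 |   ]
293 |  },
294 |  {
295 |   "cell_type": "code",
296 |   "execution_count": null,
297 |   "metadata": {},
298 |   "outputs": [],
299 |   "source": [
300 |    "# Added sketch: map args_dict onto transformers' TrainingArguments.\n",
301 |    "from transformers import TrainingArguments\n",
302 |    "\n",
303 |    "training_args = TrainingArguments(\n",
304 |    "    output_dir=args_dict['output_dir'],\n",
305 |    "    overwrite_output_dir=args_dict['overwrite_output_dir'],\n",
306 |    "    per_device_train_batch_size=args_dict['per_gpu_train_batch_size'],\n",
307 |    "    per_device_eval_batch_size=args_dict['per_gpu_eval_batch_size'],\n",
308 |    "    gradient_accumulation_steps=args_dict['gradient_accumulation_steps'],\n",
309 |    "    learning_rate=args_dict['learning_rate'],\n",
310 |    "    num_train_epochs=args_dict['num_train_epochs'],\n",
311 |    ")"
312 |   ]
313 |  },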
314 |  {
315 |   "cell_type": "code",
316 |   "execution_count": 14,
317 |   "metadata": {},
318 |   "outputs": [],
319 |   "source": [
320 |    "## SQuAD evaluation script, modified slightly for this notebook\n",
321 |    "\n",
322 |    "from __future__ import print_function\n",
323 |    "from collections import Counter\n",
324 |    "import string\n",
325 |    "import re\n",
326 |    "\n",
327 |    "\n",
328 |    "def normalize_answer(s):\n",
329 |    "    \"\"\"Lower text and remove punctuation, articles and extra whitespace.\"\"\"\n",
330 |    "    def remove_articles(text):\n",
331 |    "        return re.sub(r'\\b(a|an|the)\\b', ' ', text)\n",
332 |    "\n",
333 |    "    def white_space_fix(text):\n",
334 |    "        return ' '.join(text.split())\n",
335 |    "\n",
336 |    "    def remove_punc(text):\n",
337 |    "        exclude = set(string.punctuation)\n",
338 |    "        return ''.join(ch for ch in text if ch not in exclude)\n",
339 |    "\n",
340 |    "    def lower(text):\n",
341 |    "        return text.lower()\n",
342 |    "\n",
343 |    "    return white_space_fix(remove_articles(remove_punc(lower(s))))\n",
344 |    "\n",
345 |    "\n",
346 |    "def f1_score(prediction, ground_truth):\n",
347 |    "    prediction_tokens = normalize_answer(prediction).split()\n",
348 |    "    ground_truth_tokens = normalize_answer(ground_truth).split()\n",
349 |    "    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)\n",
350 |    "    num_same = sum(common.values())\n",
351 |    "    if num_same == 0:\n",
352 |    "        return 0\n",
353 |    "    precision = 1.0 * num_same / len(prediction_tokens)\n",
354 |    "    recall = 1.0 * num_same / len(ground_truth_tokens)\n",
355 |    "    f1 = (2 * precision * recall) / (precision + recall)\n",
356 |    "    return f1\n",
357 |    "\n",
358 |    "\n",
359 |    "def exact_match_score(prediction, ground_truth):\n",
360 |    "    return normalize_answer(prediction) == normalize_answer(ground_truth)\n",
361 |    "\n",
362 |    "\n",
363 |    "def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):\n",
364 |    "    scores_for_ground_truths = []\n",
365 |    "    for ground_truth in ground_truths:\n",
366 |    "        score = metric_fn(prediction, ground_truth)\n",
367 |    "        scores_for_ground_truths.append(score)\n",
368 |    "    return max(scores_for_ground_truths)\n",
369 |    "\n",
370 |    "\n",
371 |    "def evaluate(gold_answers, predictions):\n",
372 |    "    f1 = exact_match = total = 0\n",
373 |    "\n",
374 |    "    for ground_truths, prediction in zip(gold_answers, predictions):\n",
375 |    "        total += 1\n",
376 |    "        exact_match += metric_max_over_ground_truths(\n",
377 |    "            exact_match_score, prediction, ground_truths)\n",
378 |    "        f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)\n",
379 |    "\n",
380 |    "    exact_match = 100.0 * exact_match / total\n",
381 |    "    f1 = 100.0 * f1 / total\n",
382 |    "\n",
383 |    "    return {'exact_match': exact_match, 'f1': f1}"
384 |   ]
385 |  },
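386 |  {
387 |   "cell_type": "markdown",
388 |   "metadata": {},
389 |   "source": [
390 |    "**Editor's note (added):** a toy call showing the interface of `evaluate`: one list of acceptable gold answers per example, and one predicted string per example. The strings are invented."
391 |   ]
392 |  },
393 |  {
394 |   "cell_type": "code",
395 |   "execution_count": null,
396 |   "metadata": {},
397 |   "outputs": [],
398 |   "source": [
399 |    "# Added sketch: exact match is case/punctuation-insensitive, F1 is token overlap.\n",
400 |    "gold = [['Ada Lovelace', 'Ada'], ['1912']]\n",
401 |    "pred = ['ada lovelace!', 'in 1912']\n",
402 |    "print(evaluate(gold, pred))  # {'exact_match': 50.0, 'f1': 83.33...}"
403 |   ]
404 |  },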
405 |  {
406 |   "cell_type": "code",
407 |   "execution_count": 15,
408 |   "metadata": {},
409 |   "outputs": [],
410 |   "source": [
411 |    "import torch\n",
412 |    "from transformers import LongformerTokenizerFast, LongformerForQuestionAnswering\n",
413 |    "from tqdm.auto import tqdm"
414 |   ]
415 |  },
416 |  {
417 |   "cell_type": "code",
418 |   "execution_count": 16,
419 |   "metadata": {},
420 |   "outputs": [
421 |    {
422 |     "data": {
423 |      "text/plain": [
424 |       "LongformerForQuestionAnswering(\n",
425 |       "  (longformer): LongformerModel(\n",
426 |       "    (embeddings): LongformerEmbeddings(\n",
427 |       "      (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
428 |       "      (position_embeddings): Embedding(4098, 768, padding_idx=1)\n",
429 |       "      (token_type_embeddings): Embedding(1, 768)\n",
430 |       "      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
431 |       "      (dropout): Dropout(p=0.1, inplace=False)\n",
432 |       "    )\n",
433 |       "    (encoder): LongformerEncoder(\n",
434 |       "      (layer): ModuleList(\n",
435 |       "        (0): LongformerLayer(\n",
436 |       "          (attention): LongformerAttention(\n",
437 |       "            (self): LongformerSelfAttention(\n",
438 |       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
439 |       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
440 |       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
441 |       "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
442 |       "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
443 |       "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
444 |       "            )\n",
445 |       "            (output): LongformerSelfOutput(\n",
446 |       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
447 |       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
448 |       "              (dropout): Dropout(p=0.1, inplace=False)\n",
449 |       "            )\n",
450 |       "          )\n",
451 |       "          (intermediate): LongformerIntermediate(\n",
452 |       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
453 |       "          )\n",
454 |       "          (output): LongformerOutput(\n",
455 |       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
456 |       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
457 |       "            (dropout): Dropout(p=0.1, inplace=False)\n",
458 |       "          )\n",
459 |       "        )\n",
460 |       "        (1)-(11): 11 more LongformerLayer blocks, identical to layer (0) [repeated output elided by editor]\n",
461 |       "      )\n",
462 |       "    )\n",
463 |       "  )\n",
464 |       "  (qa_outputs): Linear(in_features=768, out_features=2, bias=True)\n",
465 |       ")"
466 |      ]
467 |     },
468 |     "execution_count": 16,
469 |     "metadata": {},
470 |     "output_type": "execute_result"
471 |    }
472 |   ],
473 |   "source": [
474 |    "tokenizer = LongformerTokenizerFast.from_pretrained('models')\n",
475 |    "model = LongformerForQuestionAnswering.from_pretrained('models')\n",
476 |    "model = model.cuda()\n",
477 |    "model.eval()"
478 |   ]
479 |  },
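480 |  {
481 |   "cell_type": "markdown",
482 |   "metadata": {},
483 |   "source": [
484 |    "**Editor's note (added):** a minimal inference sketch with the fine-tuned model. The question/context pair is invented, argmax decoding is the simplest possible strategy, and the tuple unpacking assumes the transformers 3.x default of returning `(start_logits, end_logits)`."
485 |   ]
486 |  },
487 |  {
488 |   "cell_type": "code",
489 |   "execution_count": null,
490 |   "metadata": {},
491 |   "outputs": [],
492 |   "source": [
493 |    "# Added sketch: pick the argmax start/end tokens and decode the span.\n",
494 |    "question = 'Who wrote the book?'\n",
495 |    "context = 'The book was written by Ada Lovelace.'\n",
496 |    "inputs = {k: v.cuda() for k, v in tokenizer(question, context, return_tensors='pt').items()}\n",
497 |    "with torch.no_grad():\n",
498 |    "    start_logits, end_logits = model(**inputs)[:2]\n",
499 |    "start = start_logits.argmax(dim=-1).item()\n",
500 |    "end = end_logits.argmax(dim=-1).item()\n",
501 |    "print(tokenizer.decode(inputs['input_ids'][0][start:end + 1]))"
502 |   ]
503 |  }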
504 |  ],
505 |  "metadata": {
506 |   "kernelspec": {
507 |    "display_name": "Python 3",
508 |    "language": "python",
509 |    "name": "python3"
510 |   },
511 |   "language_info": {
512 |    "codemirror_mode": {
513 |     "name": "ipython",
514 |     "version": 3
515 |    },
516 |    "file_extension": ".py",
517 |    "mimetype": "text/x-python",
518 |    "name": "python",
519 |    "nbconvert_exporter": "python",
520 |    "pygments_lexer": "ipython3",
521 |    "version": "3.6.9"
522 |   }
523 |  },
524 |  "nbformat": 4,
525 |  "nbformat_minor": 4
526 | }
527 | 
--------------------------------------------------------------------------------