├── logs
│   └── .gitignore
├── scripts
│   ├── __init__.py
│   ├── run_long_lm.py
│   └── finetune_qa_models.py
├── src
│   └── __init__.py
├── report
│   ├── Master Thesis.pdf
│   └── Thesis Presentation.pdf
├── .env.template
├── LICENSE
├── Makefile
├── docker-compose.yaml
├── Pretraining_Details.md
├── Dockerfile
├── Finetuning_Details.md
├── .gitignore
├── requirements.txt
├── notebooks
│   ├── Longformer TriviaQA.ipynb
│   ├── Convert to Long.ipynb
│   └── Try Train Longformer SQuAD.ipynb
└── README.md

/logs/.gitignore:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | from . import tracking
2 | from . import lib
--------------------------------------------------------------------------------
/report/Master Thesis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarkusSagen/Master-Thesis-Multilingual-Longformer/HEAD/report/Master Thesis.pdf
--------------------------------------------------------------------------------
/.env.template:
--------------------------------------------------------------------------------
1 | PROJECT_NAME=xlm-l
2 | DATA_DIR=
3 | MODEL_DIR=
4 | GPU_IDS=0
5 | JUPYTER_PW=
6 | JUPYTER_PORT=8999
7 | PRIVATE_DEPS=none
--------------------------------------------------------------------------------
/report/Thesis Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarkusSagen/Master-Thesis-Multilingual-Longformer/HEAD/report/Thesis Presentation.pdf
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Markus Sagen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | include .env
2 |
3 | export USER_ID := $(shell id -u)
4 | export USER_NAME := $(shell whoami)
5 | export PROJECT_DIR := $(shell pwd)
6 | export COMPOSE_CMD := docker-compose -f docker-compose.yaml -p ${PROJECT_NAME}_${USER_NAME}
7 | export PKG_DIR := pkg
8 |
9 | # Enable running on machines with no GPU
10 | ifeq (${GPU_IDS}, none)
11 | export RUNTIME := runc
12 | else
13 | export RUNTIME := nvidia
14 | endif
15 |
16 | # Enable pulling in dependencies in private repos
17 | ifneq (${PRIVATE_DEPS}, none)
18 | clone_private_deps := for item in ${PRIVATE_DEPS}; do \
19 | 	git clone $$item ${PKG_DIR}/$$item; \
20 | 	echo $$item; \
21 | done
22 | else
23 | clone_private_deps := echo "Nothing to clone"
24 | endif
25 |
26 | .PHONY: build
27 | build:
28 | 	mkdir -p ${PKG_DIR}
29 | 	$(call clone_private_deps)
30 | 	$(COMPOSE_CMD) build
31 | 	rm -rf ${PKG_DIR}
32 |
33 | .PHONY: logs
34 | logs:
35 | 	${COMPOSE_CMD} logs
36 |
37 | .PHONY: up
38 | up:
39 | 	$(COMPOSE_CMD) up --detach
40 |
41 | .PHONY: down
42 | down:
43 | 	$(COMPOSE_CMD) down
44 |
45 | .PHONY: repl
46 | repl:
47 | 	${COMPOSE_CMD} exec repl python3 $(run)
48 |
49 | .PHONY: ipython
50 | ipython:
51 | 	${COMPOSE_CMD} exec repl ipython $(run)
52 |
53 |
54 | .PHONY: shell
55 | shell:
56 | 	${COMPOSE_CMD} exec repl bash
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '2.3'
2 | services:
3 |   jupyter:
4 |     image: ${PROJECT_NAME}
5 |     command: jupyter lab --ip=0.0.0.0 --no-browser --NotebookApp.token='${JUPYTER_PW}'
6 |     build:
7 |       context: .
8 |       dockerfile: Dockerfile
9 |       args:
10 |         - PKG_DIR=${PKG_DIR}
11 |         - PRIVATE_DEPS=${PRIVATE_DEPS}
12 |     shm_size: '16gb'
13 |     ports:
14 |       - ${JUPYTER_PORT}:8888
15 |     user: ${USER_ID}:${USER_ID}
16 |     runtime: ${RUNTIME}
17 |     network_mode: bridge
18 |     environment:
19 |       - NVIDIA_VISIBLE_DEVICES=${GPU_IDS}
20 |     volumes:
21 |       - ${DATA_DIR}:/workspace/data
22 |       - ${MODEL_DIR}:/workspace/models
23 |       - ${PROJECT_DIR}/src:/workspace/src
24 |       - ${PROJECT_DIR}/notebooks:/workspace/notebooks
25 |       - ${PROJECT_DIR}/logs:/workspace/logs
26 |
27 |   repl:
28 |     image: ${PROJECT_NAME}
29 |     tty: true
30 |     shm_size: '16gb'
31 |     user: ${USER_ID}:${USER_ID}
32 |     runtime: ${RUNTIME}
33 |     network_mode: bridge
34 |     environment:
35 |       - NVIDIA_VISIBLE_DEVICES=${GPU_IDS}
36 |     volumes:
37 |       - ${DATA_DIR}:/workspace/data
38 |       - ${MODEL_DIR}:/workspace/models
39 |       - ${PROJECT_DIR}/src:/workspace/src
40 |       - ${PROJECT_DIR}/scripts:/workspace/scripts
41 |       - ${PROJECT_DIR}/logs:/workspace/logs
--------------------------------------------------------------------------------
/Pretraining_Details.md:
--------------------------------------------------------------------------------
# Pre-Training Details

### Models
Converting transformer models is based on the [Longformer conversion script](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb). The script can be run for any pre-trained RoBERTa-based model and can be extended to be used with other pre-trained models.
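As a reference, a conversion and pre-training run can be launched roughly as follows. This is a sketch, not a verified command line: the flags match the argument definitions in `scripts/run_long_lm.py` (`--model_max_length` corresponds to the `MAX_POS` argument described below), while the model name, output locations, and data paths are illustrative and assume the `/workspace` layout from the Docker setup:

```bash
python3 scripts/run_long_lm.py \
    --model_name_or_path xlm-roberta-base \
    --model_name xlm-roberta-base-long-4096 \
    --output_dir /workspace/models/xlm-roberta-base-long-4096 \
    --logging_dir /workspace/logs/xlm-long \
    --model_max_length 4096 \
    --attention_window 512 \
    --train_file_path /workspace/data/wikitext-103-raw/wiki.train.raw \
    --val_file_path /workspace/data/wikitext-103-raw/wiki.valid.raw \
    --do_train
```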
We pre-trained both a monolingual RoBERTa and a multilingual XLM-R model using the Longformer pre-training scheme to extend the context length of the models. These models were trained on the same datasets with the same hyper-parameters, and with only one seed because of the long training time. Training with these parameters on a 48GB GPU takes ~5 days.

The argument `MAX_POS` indicates how many tokens the model should learn to attend to. The number of tokens it can learn to attend to must be of the form $2^x$ and be larger than $512$.

The `MODEL_NAME_OR_PATH` indicates the pre-trained model that the Longformer is extended from. The names of the models must be pre-trained model names available at [Huggingface](https://huggingface.co/models), such as `roberta-base`, `xlm-roberta-base` or similar. The pre-training scheme should in theory work for all encoder-type Transformers, such as BERT, RoBERTa, ALBERT, etc. However, we have only tested it for RoBERTa and XLM-R, so the training script may need to be changed if used for BERT.

We refer to these models that we have trained using the Longformer pre-training scheme as:

1. `RoBERTa-Long`
2. `XLM-Long`

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 |
2 | # https://hub.docker.com/r/huggingface/transformers-pytorch-gpu/dockerfile
3 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
4 |
5 | ARG PKG_DIR
6 | ARG PRIVATE_DEPS
7 |
8 | WORKDIR /workspace
9 |
10 | RUN apt update && \
11 |     apt install -y bash \
12 |     build-essential \
13 |     git \
14 |     wget \
15 |     curl \
16 |     ca-certificates \
17 |     python3 \
18 |     python3-pip && \
19 |     rm -rf /var/lib/apt/lists
20 |
21 | # RUN apt-get update && apt-get install -y git
22 |
23 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
24 |     python3 -m pip install --no-cache-dir \
25 |     mkl \
26 |     torch
27 |
28 | #RUN git clone https://github.com/NVIDIA/apex
29 | #RUN cd apex && \
30 | #    python3 setup.py install && \
31 | #    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
32 |
33 |
34 | # Install packages from private repositories
35 | COPY ${PKG_DIR}/ /pkg/
36 | RUN if [ "${PRIVATE_DEPS}" != "none" ]; then \
37 |     for pkg in /pkg/*/* ; \
38 |     do pip install -e $pkg ; \
39 |     done; \
40 |     fi
41 |
42 |
43 | # Fix permissions
44 | RUN chmod 0777 /workspace
45 | RUN mkdir /.local && chmod 0777 /.local
46 | RUN mkdir /.jupyter && chmod 0777 /.jupyter
47 | RUN mkdir /.cache && chmod 0777 /.cache
48 | # Workaround for transformers library permissions
49 | RUN mkdir /.config && chmod 0777 /.config
50 |
51 | # Install python packages
52 | ADD src ./src
53 | ADD requirements.txt .
54 | RUN pip install -r requirements.txt
--------------------------------------------------------------------------------
/Finetuning_Details.md:
--------------------------------------------------------------------------------
# Fine-Tuning Details


We fine-tune and evaluate on these datasets using several pre-trained models released by Huggingface and compare them with the long-context models (Longformer-type models) we have trained.

We have divided the models first based on the number of languages, then on the specific dataset, and finally on which model was fine-tuned. The datasets SQ3 and XQ3 are the long-context variants (with concatenated contexts) of the SQuAD and XQuAD datasets; a minimal sketch of this construction follows below.
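Concretely, each long-context example can be thought of as a gold context concatenated together with other contexts, with the character offset of the answer span shifted by the length of everything placed before the gold context. The following is an illustrative sketch (the function and field names are hypothetical, not the exact code in the training scripts):

```python
# Illustrative sketch of long-context QA example construction; the names
# below are hypothetical and the real scripts also tokenize and truncate
# the result to --max_length.
from typing import Dict, List


def concat_contexts(gold: Dict, distractors: List[str]) -> Dict:
    """Prepend distractor contexts to the gold context and shift the
    character offset of the answer span to match the new position."""
    offset = sum(len(c) + 1 for c in distractors)  # +1 per joining space
    return {
        "question": gold["question"],
        "context": " ".join(distractors + [gold["context"]]),
        "answer_text": gold["answer_text"],
        "answer_start": gold["answer_start"] + offset,
    }


example = concat_contexts(
    {"question": "Who wrote it?", "context": "Ada Lovelace wrote it.",
     "answer_text": "Ada Lovelace", "answer_start": 0},
    distractors=["An unrelated paragraph.", "Another paragraph."],
)
# The shifted offset still points at the answer inside the long context.
assert example["context"][example["answer_start"]:].startswith("Ada Lovelace")
```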
To better understand and evaluate how the performance was affected when creating a new dataset, we chose to fine-tune on the SQ3 and XQ3 datasets using either the regular attention window (512 tokens) or the longer context learned by the Longformer-trained models (4096 tokens, capped at 2048 in practice; see below). These datasets are denoted SQ3 (512) and SQ3 (2048) respectively for the English dataset, and XQ3 (512) and XQ3 (2048) for the multilingual datasets.

The long-context models are trained on a context longer than 2048 tokens, but we restricted the long-context datasets to this many tokens at a time, since the models otherwise did not fit in memory on a 48GB GPU.

#### Context lengths
Depending on the number of contexts one chooses to concatenate together, the maximum number of tokens the model can attend to also changes. The maximum we managed to run on a 48GB GPU was 3 concatenated contexts, which corresponded to an average of slightly below 2048 tokens per concatenated context. Therefore, for the concatenated long datasets, we set the hyper-parameters --nr\_concats=3 and --max\_length=2048. If you want to test out other values, we suggest the following pairings:

concats=1, max\_length=512
concats=3, max\_length=2048
concats=5, max\_length=4096


#### Seeds
Each model is trained with 5 different seeds. To replicate our experiments, re-run each code segment and replace the SEED with the following seeds:

- 42
- 1337
- 1729
- 165
- 758241

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | #  Usually these files are written by a python script from a template
32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | #   install all needed dependencies.
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | anyio==2.1.0 3 | argon2-cffi==20.1.0 4 | asn1crypto==0.24.0 5 | async-generator==1.10 6 | attrs==20.3.0 7 | Babel==2.9.0 8 | backcall==0.2.0 9 | bleach==3.3.0 10 | cached-property==1.5.2 11 | cachetools==4.2.1 12 | certifi==2020.12.5 13 | cffi==1.14.5 14 | chardet==4.0.0 15 | click==7.1.2 16 | cloudpickle==1.6.0 17 | colorama==0.4.4 18 | contextvars==2.4 19 | cryptography==2.1.4 20 | cycler==0.10.0 21 | Cython==0.29.21 22 | dask==2021.2.0 23 | dataclasses==0.8 24 | datasets==1.3.0 25 | decorator==4.4.2 26 | defusedxml==0.6.0 27 | dill==0.3.3 28 | distributed==2021.2.0 29 | dnspython==2.1.0 30 | docopt==0.6.2 31 | entrypoints==0.3 32 | filelock==3.0.12 33 | fsspec==0.8.5 34 | gitdb==4.0.5 35 | GitPython==3.1.13 36 | google-auth==1.27.0 37 | google-auth-oauthlib==0.4.2 38 | graphviz==0.16 39 | grpcio==1.35.0 40 | h5py==3.1.0 41 | HeapDict==1.0.1 42 | hiddenlayer==0.3 43 | huggingface-hub==0.0.2 44 | idna==2.10 45 | immutables==0.15 46 | importlib-metadata==3.4.0 47 | intel-openmp==2021.1.2 48 | ipykernel==5.5.0 49 | ipython==7.16.1 50 | ipython-genutils==0.2.0 51 | ipywidgets==7.6.3 52 | jedi==0.18.0 53 | Jinja2==2.11.3 54 | joblib==1.0.1 55 | json5==0.9.5 56 | jsonpickle==1.5.2 57 | jsonschema==3.2.0 58 | jupyter-client==6.1.11 59 | jupyter-core==4.7.1 60 | jupyter-server==1.4.0 61 | jupyterlab==3.0.8 62 | jupyterlab-pygments==0.1.2 63 | jupyterlab-server==2.3.0 64 | jupyterlab-widgets==1.0.0 65 | keyring==10.6.0 66 | keyrings.alt==3.0 67 | kiwisolver==1.3.1 68 | Markdown==3.3.3 69 | MarkupSafe==1.1.1 70 | matplotlib==3.3.4 71 | mistune==0.8.4 72 | mkl==2021.1.1 73 | msgpack==1.0.2 74 | multiprocess==0.70.11.1 75 | munch==2.5.0 76 | nbclassic==0.2.6 77 | nbclient==0.5.2 78 | nbconvert==6.0.7 79 | nbformat==5.1.2 80 | nest-asyncio==1.5.1 81 | notebook==6.2.0 82 | numpy==1.19.5 83 | oauthlib==3.1.0 84 | packaging==20.9 85 | pandas==1.1.5 86 | pandocfilters==1.4.3 87 | parso==0.8.1 88 | pexpect==4.8.0 89 | pickleshare==0.7.5 90 | Pillow==8.1.0 91 | pip==20.3.3 92 | prometheus-client==0.9.0 93 | prompt-toolkit==3.0.16 94 | protobuf==3.15.0 95 | psutil==5.8.0 96 | ptyprocess==0.7.0 97 | py-cpuinfo==7.0.0 98 | pyarrow==1.0.1 99 | pyasn1==0.4.8 100 | pyasn1-modules==0.2.8 101 | pycparser==2.20 102 | pycrypto==2.6.1 103 | Pygments==2.8.0 104 | pygobject==3.26.1 105 | pymongo==3.11.3 106 | pyparsing==2.4.7 107 | pyrsistent==0.17.3 108 | python-dateutil==2.8.1 109 | pytz==2021.1 110 | pyxdg==0.25 111 | PyYAML==5.4.1 112 | pyzmq==22.0.3 113 | regex==2020.11.13 114 | requests==2.25.1 115 | requests-oauthlib==1.3.0 116 | rsa==4.7.1 117 | sacred==0.8.2 118 | sacremoses==0.0.43 119 | scikit-learn==0.24.1 120 | scipy==1.5.4 121 | seaborn==0.11.1 122 | SecretStorage==2.3.1 
123 | Send2Trash==1.5.0
124 | sentencepiece==0.1.95
125 | setuptools==53.0.0
126 | six==1.11.0
127 | sklearn
128 | smmap==3.0.5
129 | sniffio==1.2.0
130 | sortedcontainers==2.3.0
131 | tbb==2021.1.1
132 | tblib==1.7.0
133 | tensorboard==2.4.1
134 | tensorboard-plugin-wit==1.8.0
135 | terminado==0.9.2
136 | testpath==0.4.4
137 | threadpoolctl==2.1.0
138 | tokenizers==0.9.2
139 | toolz==0.11.1
140 | torch==1.7.1
141 | torchsummary==1.5.1
142 | tornado==6.1
143 | tqdm==4.49.0
144 | traitlets==4.3.3
145 | transformers==3.4.0
146 | typing-extensions==3.7.4.3
147 | urllib3==1.26.3
148 | wcwidth==0.2.5
149 | webencodings==0.5.1
150 | Werkzeug==1.0.1
151 | wget==3.2
152 | wheel==0.30.0
153 | widgetsnbextension==3.5.1
154 | wrapt==1.12.1
155 | xxhash==2.0.0
156 | zict==2.0.0
157 | zipp==3.4.0
--------------------------------------------------------------------------------
/notebooks/Longformer TriviaQA.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 6,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "#import nlp\n",
10 |     "import torch\n",
11 |     "import datasets\n",
12 |     "\n",
13 |     "# ATTENTION. Rerunning this command removes the cached trivia qa dataset completely \n",
14 |     "#!rm -rf /.cache/"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 2,
20 |    "metadata": {},
21 |    "outputs": [
22 |     {
23 |      "name": "stdout",
24 |      "output_type": "stream",
25 |      "text": [
26 |       "trivia_qa  wikitext-103-raw\n",
27 |       "mkdir: cannot create directory '../data/trivia_qa': File exists\n"
28 |      ]
29 |     }
30 |    ],
31 |    "source": [
32 |     "# https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb#scrollTo=wyDYG4YDXFV7\n",
33 |     "!ls ../data\n",
34 |     "!mkdir ../data/trivia_qa"
35 |    ]
36 |   },
37 |   {
38 |    "cell_type": "code",
39 |    "execution_count": null,
40 |    "metadata": {},
41 |    "outputs": [],
42 |    "source": [
43 |     "%%time\n",
44 |     "validation_dataset = datasets.load_dataset(\"trivia_qa\", \"rc\", split=\"validation[:5%]\", cache_dir=\"/workspace/data/trivia_qa\")"
45 |    ]
46 |   },
47 |   {
48 |    "cell_type": "code",
49 |    "execution_count": 5,
50 |    "metadata": {},
51 |    "outputs": [
52 |     {
53 |      "name": "stdout",
54 |      "output_type": "stream",
55 |      "text": [
56 |       "absl-py==0.11.0\n",
57 |       "apex==0.1\n",
58 |       "argon2-cffi==20.1.0\n",
59 |       "asn1crypto==0.24.0\n",
60 |       "async-generator==1.10\n",
61 |       "attrs==20.3.0\n",
62 |       "backcall==0.2.0\n",
63 |       "bleach==3.2.1\n",
64 |       "cached-property==1.5.2\n",
65 |       "cachetools==4.1.1\n",
66 |       "certifi==2020.11.8\n",
67 |       "cffi==1.14.4\n",
68 |       "chardet==3.0.4\n",
69 |       "click==7.1.2\n",
70 |       "cloudpickle==1.6.0\n",
71 |       "colorama==0.4.4\n",
72 |       "contextvars==2.4\n",
73 |       "cryptography==2.1.4\n",
74 |       "cycler==0.10.0\n",
75 |       "Cython==0.29.21\n",
76 |       "dask==2.30.0\n",
77 |       "dataclasses==0.8\n",
78 |       "datasets==1.1.3\n",
79 |       "decorator==4.4.2\n",
80 |       "defusedxml==0.6.0\n",
81 |       "dill==0.3.3\n",
82 |       "distributed==2.30.1\n",
83 |       "dnspython==2.0.0\n",
84 |       "docopt==0.6.2\n",
85 |       "entrypoints==0.3\n",
86 |       "filelock==3.0.12\n",
87 |       "future==0.18.2\n",
88 |       "gitdb==4.0.5\n",
89 |       "GitPython==3.1.11\n",
90 |       "google-auth==1.23.0\n",
91 |       "google-auth-oauthlib==0.4.2\n",
92 |       "graphviz==0.15\n",
93 |       "grpcio==1.33.2\n",
94 |       "h5py==3.1.0\n",
95 |       "HeapDict==1.0.1\n",
96 |       "hiddenlayer==0.3\n",
97 |       "idna==2.6\n",
98 |       "immutables==0.14\n",
99 |       "importlib-metadata==3.1.0\n",
100 |       "intel-openmp==2020.0.133\n",
101 |       "ipykernel==5.3.4\n",
102 |       "ipython==7.16.1\n",
103 |       "ipython-genutils==0.2.0\n",
104 |       "ipywidgets==7.5.1\n",
105 |       "jedi==0.17.2\n",
106 |       "Jinja2==2.11.2\n",
107 |       "joblib==0.17.0\n",
108 |       "json5==0.9.5\n",
109 |       "jsonpickle==1.4.1\n",
110 |       "jsonschema==3.2.0\n",
111 |       "jupyter-client==6.1.7\n",
112 |       "jupyter-core==4.7.0\n",
113 |       "jupyterlab==2.2.9\n",
114 |       "jupyterlab-pygments==0.1.2\n",
115 |       "jupyterlab-server==1.2.0\n",
116 |       "keyring==10.6.0\n",
117 |       "keyrings.alt==3.0\n",
118 |       "kiwisolver==1.3.1\n",
119 |       "Markdown==3.3.3\n",
120 |       "MarkupSafe==1.1.1\n",
121 |       "matplotlib==3.3.3\n",
122 |       "mistune==0.8.4\n",
123 |       "mkl==2019.0\n",
124 |       "msgpack==1.0.0\n",
125 |       "multiprocess==0.70.11.1\n",
126 |       "munch==2.5.0\n",
127 |       "nbclient==0.5.1\n",
128 |       "nbconvert==6.0.7\n",
129 |       "nbformat==5.0.8\n",
130 |       "nest-asyncio==1.4.3\n",
131 |       "notebook==6.1.5\n",
132 |       "numpy==1.19.4\n",
133 |       "oauthlib==3.1.0\n",
134 |       "packaging==20.4\n",
135 |       "pandas==1.1.4\n",
136 |       "pandocfilters==1.4.3\n",
137 |       "parso==0.7.1\n",
138 |       "pexpect==4.8.0\n",
139 |       "pickleshare==0.7.5\n",
140 |       "Pillow==8.0.1\n",
141 |       "prometheus-client==0.9.0\n",
142 |       "prompt-toolkit==3.0.8\n",
143 |       "protobuf==3.14.0\n",
144 |       "psutil==5.7.3\n",
145 |       "ptyprocess==0.6.0\n",
146 |       "py-cpuinfo==7.0.0\n",
147 |       "pyarrow==2.0.0\n",
148 |       "pyasn1==0.4.8\n",
149 |       "pyasn1-modules==0.2.8\n",
150 |       "pycparser==2.20\n",
151 |       "pycrypto==2.6.1\n",
152 |       "Pygments==2.7.2\n",
153 |       "pygobject==3.26.1\n",
154 |       "pymongo==3.11.1\n",
155 |       "pyparsing==2.4.7\n",
156 |       "pyrsistent==0.17.3\n",
157 |       "python-dateutil==2.8.1\n",
158 |       "pytz==2020.4\n",
159 |       "pyxdg==0.25\n",
160 |       "PyYAML==5.3.1\n",
161 |       "pyzmq==20.0.0\n",
162 |       "regex==2020.11.13\n",
163 |       "requests==2.25.0\n",
164 |       "requests-oauthlib==1.3.0\n",
165 |       "rsa==4.6\n",
166 |       "sacred==0.8.1\n",
167 |       "sacremoses==0.0.43\n",
168 |       "scikit-learn==0.23.2\n",
169 |       "scipy==1.5.4\n",
170 |       "seaborn==0.11.0\n",
171 |       "SecretStorage==2.3.1\n",
172 |       "Send2Trash==1.5.0\n",
173 |       "sentencepiece==0.1.94\n",
174 |       "six==1.11.0\n",
175 |       "sklearn==0.0\n",
176 |       "smmap==3.0.4\n",
177 |       "sortedcontainers==2.3.0\n",
178 |       "tblib==1.7.0\n",
179 |       "tensorboard==2.4.0\n",
180 |       "tensorboard-plugin-wit==1.7.0\n",
181 |       "terminado==0.9.1\n",
182 |       "testpath==0.4.4\n",
183 |       "threadpoolctl==2.1.0\n",
184 |       "tokenizers==0.9.2\n",
185 |       "toolz==0.11.1\n",
186 |       "torch==1.7.0\n",
187 |       "torchsummary==1.5.1\n",
188 |       "tornado==6.1\n",
189 |       "tqdm==4.49.0\n",
190 |       "traitlets==4.3.3\n",
191 |       "transformers==3.4.0\n",
192 |       "typing-extensions==3.7.4.3\n",
193 |       "urllib3==1.26.2\n",
194 |       "wcwidth==0.2.5\n",
195 |       "webencodings==0.5.1\n",
196 |       "Werkzeug==1.0.1\n",
197 |       "wget==3.2\n",
198 |       "widgetsnbextension==3.5.1\n",
199 |       "wrapt==1.12.1\n",
200 |       "xxhash==2.0.0\n",
201 |       "zict==2.0.0\n",
202 |       "zipp==3.4.0\n"
203 |      ]
204 |     }
205 |    ],
206 |    "source": [
207 |     "!pip freeze\n"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": []
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "# define the mapping function\n",
224 |     "def format_dataset(example):\n",
225 |     "    # the context might be comprised of multiple contexts => we merge them here\n",
226 |     "    example[\"context\"] = \" \".join((\"\\n\".join(example[\"entity_pages\"][\"wiki_context\"])).split(\"\\n\"))\n",
227 |     "    example[\"targets\"] = example[\"answer\"][\"aliases\"]\n",
228 |     "    example[\"norm_target\"] = example[\"answer\"][\"normalized_value\"]\n",
229 |     "    return example\n",
230 |     "\n",
231 |     "# map the dataset and throw out all unnecessary columns\n",
232 |     "validation_dataset = validation_dataset.map(format_dataset, remove_columns=[\"search_results\", \"question_source\", \"entity_pages\", \"answer\", \"question_id\"])"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": null,
238 |    "metadata": {},
239 |    "outputs": [],
240 |    "source": [
241 |     "validation_dataset[8]"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": null,
247 |    "metadata": {},
248 |    "outputs": [],
249 |    "source": [
250 |     "validation_dataset = validation_dataset.filter(lambda x: len(x[\"context\"]) > 0)\n",
251 |     "# check out how many samples are left\n",
252 |     "validation_dataset"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "print(\"\\n\\nLength for each example\")\n",
262 |     "print(30 * \"=\")\n",
263 |     "\n",
264 |     "# length for each example\n",
265 |     "validation_dataset.map(lambda x, i: print(f\"Id: {i} - Question Length: {len(x['question'])} - context Length: {len(x['context'])}\"), with_indices=True)\n",
266 |     "print(30 * \"=\")\n",
267 |     "\n",
268 |     "print(\"\\n\")\n",
269 |     "print(\"Num examples shorter than 4 * 4096 characters: \")\n",
270 |     "# keep only examples shorter than 4 * 4096 characters\n",
271 |     "short_validation_dataset = validation_dataset.filter(lambda x: (len(x['question']) + len(x['context'])) < 4 * 4096)\n",
272 |     "short_validation_dataset"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": null,
278 |    "metadata": {},
279 |    "outputs": [],
280 |    "source": [
281 |     "# EVAL"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": null,
287 |    "metadata": {},
288 |    "outputs": [],
289 |    "source": [
290 |     "from transformers import LongformerTokenizerFast, LongformerForQuestionAnswering\n",
291 |     "\n",
292 |     "tokenizer = LongformerTokenizerFast.from_pretrained(\"allenai/longformer-large-4096-finetuned-triviaqa\")\n",
293 |     "\n",
294 |     "# download the 1.7 GB pretrained model. It might take ~1min\n",
295 |     "model = LongformerForQuestionAnswering.from_pretrained(\"allenai/longformer-large-4096-finetuned-triviaqa\")\n",
296 |     "model.to(\"cuda\")\n",
297 |     "\n",
298 |     "def evaluate(example):\n",
299 |     "    def get_answer(question, context):\n",
300 |     "        # encode question and context so that they are separated by a tokenizer.sep_token and cut at max_length\n",
301 |     "        encoding = tokenizer.encode_plus(question, context, return_tensors=\"pt\", max_length=4096, truncation=True)\n",
302 |     "        input_ids = encoding[\"input_ids\"].to(\"cuda\")\n",
303 |     "        attention_mask = encoding[\"attention_mask\"].to(\"cuda\")\n",
304 |     "\n",
305 |     "        # the forward method will automatically set global attention on question tokens\n",
306 |     "        # The scores for the possible start token and end token of the answer are retrieved\n",
307 |     "        # wrap the function in torch.no_grad() to save memory\n",
308 |     "        with torch.no_grad():\n",
309 |     "            start_scores, end_scores = model(input_ids=input_ids, attention_mask=attention_mask)\n",
310 |     "\n",
311 |     "        # Let's take the most likely token using `argmax` and retrieve the answer\n",
312 |     "        all_tokens = tokenizer.convert_ids_to_tokens(encoding[\"input_ids\"][0].tolist())\n",
313 |     "        answer_tokens = all_tokens[torch.argmax(start_scores): torch.argmax(end_scores)+1]\n",
314 |     "        answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))[1:].replace('\"', '')  # remove the prepended space token and unnecessary '\"'\n",
315 |     "\n",
316 |     "        return answer\n",
317 |     "\n",
318 |     "    # save the model's output here\n",
319 |     "    example[\"output\"] = get_answer(example[\"question\"], example[\"context\"])\n",
320 |     "\n",
321 |     "    # save if it's a match or not\n",
322 |     "    example[\"match\"] = (example[\"output\"] in example[\"targets\"]) or (example[\"output\"] == example[\"norm_target\"])\n",
323 |     "\n",
324 |     "    return example\n"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": null,
330 |    "metadata": {},
331 |    "outputs": [],
332 |    "source": [
333 |     "results_short = short_validation_dataset.map(evaluate)"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": null,
339 |    "metadata": {},
340 |    "outputs": [],
341 |    "source": [
342 |     "print(f\"\\nNum Correct examples: {sum(results_short['match'])}/{len(results_short)}\")\n",
343 |     "wrong_results = results_short.filter(lambda x: x['match'] is False)\n",
344 |     "print(f\"\\nWrong examples: \")\n",
345 |     "wrong_results.map(lambda x, i: print(f\"{i} - Output: {x['output']} - Target: {x['norm_target']}\"), with_indices=True)"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": null,
351 |    "metadata": {},
352 |    "outputs": [],
353 |    "source": [
354 |     "results = validation_dataset.map(evaluate)"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": null,
360 |    "metadata": {},
361 |    "outputs": [],
362 |    "source": [
363 |     "print(f\"Correct examples: {sum(results['match'])}/{len(results)}\")"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": null,
369 |    "metadata": {},
370 |    "outputs": [],
371 |    "source": []
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": null,
376 |    "metadata": {},
377 |    "outputs": [],
378 |    "source": []
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": null,
383 |    "metadata": {},
384 |    "outputs": [],
385 |    "source": []
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": null,
390 |    "metadata": {},
391 |    "outputs": [],
392 | "source": [] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "# TriviaQA json to SQUAD format dataloader" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 1, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "import json\n", 415 | "from pathlib import Path\n", 416 | "\n", 417 | "def read_squad_files(path: str):\n", 418 | " path = Path(path)\n", 419 | " with open(path, 'rb') as f:\n", 420 | " squad_dict = json.load(f)\n", 421 | " contexts = []\n", 422 | " questions = []\n", 423 | " answers = []\n", 424 | " for group in squad_dict['data']:\n", 425 | " for passage in group['paragraphs']:\n", 426 | " context = passage['context']\n", 427 | " for qa in passage['qas']:\n", 428 | " question = qa['question']\n", 429 | " for answer in qa['answers']:\n", 430 | " contexts.append(context)\n", 431 | " questions.append(question)\n", 432 | " answers.append(answer)\n", 433 | "\n", 434 | " return contexts, questions, answers\n", 435 | " \n", 436 | "\n", 437 | "train_contexts, train_questions, train_answers = read_squad_files('/workspace/data/trivia_squad/squad-wikipedia-train-4096.json')\n", 438 | "val_contexts, val_questions, val_answers = read_squad_files('/workspace/data/trivia_squad/squad-wikipedia-dev-4096.json')" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 2, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "## Add start and end tokens correctly" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 3, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "def add_end_idx(answers, contexts):\n", 457 | " for answer, context in zip(answers, contexts):\n", 458 | " gold_text = answer['text']\n", 459 | " start_idx = answer['answer_start']\n", 460 | " end_idx = start_idx + len(gold_text)\n", 461 | "\n", 462 | " # sometimes squad answers are off by a character or two – fix this\n", 463 | " if context[start_idx:end_idx].lower() == gold_text:\n", 464 | " answer['answer_end'] = end_idx\n", 465 | " elif context[start_idx-1:end_idx-1].lower() == gold_text:\n", 466 | " answer['answer_start'] = start_idx - 1\n", 467 | " answer['answer_end'] = end_idx - 1 # When the gold label is off by one character\n", 468 | " elif context[start_idx-2:end_idx-2].lower() == gold_text:\n", 469 | " answer['answer_start'] = start_idx - 2\n", 470 | " answer['answer_end'] = end_idx - 2 # When the gold label is off by two characters\n", 471 | "\n", 472 | "add_end_idx(train_answers, train_contexts)\n", 473 | "add_end_idx(val_answers, val_contexts)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 4, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "## Tokenize results" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "from transformers import RobertaTokenizerFast\n", 492 | "tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lowercase=True)\n", 493 | "\n", 494 | "train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": {}, 501 | "outputs": [], 502 | "source": [ 503 | "val_encodings = tokenizer(val_contexts, val_questions, 
truncation=True, padding=True)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "### convert start-end pos to token start/end pos" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "def add_token_positions(encodings, answers):\n", 522 | " start_positions = []\n", 523 | " end_positions = []\n", 524 | " for i in range(len(answers)):\n", 525 | " start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))\n", 526 | " end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))\n", 527 | " # if None, the answer passage has been truncated\n", 528 | " if start_positions[-1] is None:\n", 529 | " start_positions[-1] = tokenizer.model_max_length\n", 530 | " if end_positions[-1] is None:\n", 531 | " end_positions[-1] = tokenizer.model_max_length\n", 532 | " encodings.update({'start_positions': start_positions, 'end_positions': end_positions})\n", 533 | "\n", 534 | "add_token_positions(train_encodings, train_answers)\n", 535 | "add_token_positions(val_encodings, val_answers)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "### Dataloader" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "import torch\n", 554 | "from torch.utils.data import DataLoader, Dataset\n", 555 | "\n", 556 | "class SquadDataset(torch.utils.data.Dataset):\n", 557 | " def __init__(self, encodings):\n", 558 | " self.encodings = encodings\n", 559 | "\n", 560 | " def __getitem__(self, idx):\n", 561 | " return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n", 562 | "\n", 563 | " def __len__(self):\n", 564 | " return len(self.encodings.input_ids)\n", 565 | "\n", 566 | "train_dataset = SquadDataset(train_encodings)\n", 567 | "val_dataset = SquadDataset(val_encodings)\n", 568 | "\n" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "metadata": {}, 575 | "outputs": [], 576 | "source": [ 577 | "train_dataset = DataLoader(train_dataset, batch_size=16, shuffle=True)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [] 586 | } 587 | ], 588 | "metadata": { 589 | "kernelspec": { 590 | "display_name": "Python 3", 591 | "language": "python", 592 | "name": "python3" 593 | }, 594 | "language_info": { 595 | "codemirror_mode": { 596 | "name": "ipython", 597 | "version": 3 598 | }, 599 | "file_extension": ".py", 600 | "mimetype": "text/x-python", 601 | "name": "python", 602 | "nbconvert_exporter": "python", 603 | "pygments_lexer": "ipython3", 604 | "version": "3.6.9" 605 | } 606 | }, 607 | "nbformat": 4, 608 | "nbformat_minor": 4 609 | } 610 | -------------------------------------------------------------------------------- /scripts/run_long_lm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import datetime 5 | from dataclasses import dataclass, field 6 | import functools 7 | import logging 8 | import math 9 | import os 10 | import pickle 11 | import re 12 | import sys 13 | import time 14 | import threading 15 | from typing import Optional 16 | 17 | import torch 18 | from 
torch.utils.data.dataset import Dataset
19 | from torch.utils.tensorboard import SummaryWriter
20 | import tqdm
21 | from transformers import logging as hf_logging
22 | from transformers.modeling_longformer import LongformerSelfAttention
23 | from transformers import (
24 |     PreTrainedModel,
25 |     PreTrainedTokenizer,
26 |     AutoModelForMaskedLM,
27 |     RobertaForMaskedLM,
28 |     XLMRobertaForMaskedLM,
29 |     AutoTokenizer,
30 | )
31 |
32 | from transformers import (
33 |     HfArgumentParser,
34 |     DataCollatorForLanguageModeling,
35 |     Trainer,
36 |     TrainingArguments,
37 |     set_seed,
38 | )
39 |
40 |
41 | class color:
42 |     """Help print colors to terminal."""
43 |     PURPLE = "\033[95m"
44 |     CYAN = "\033[96m"
45 |     DARKCYAN = "\033[36m"
46 |     BLUE = "\033[94m"
47 |     GREEN = "\033[92m"
48 |     YELLOW = "\033[93m"
49 |     RED = "\033[91m"
50 |     BOLD = "\033[1m"
51 |     UNDERLINE = "\033[4m"
52 |     END = "\033[0m"
53 |
54 |
55 | def is_roberta_based_model(model_name: str) -> str:
56 |     """Validate that the model to pre-train has a RoBERTa architecture."""
57 |
58 |     r = re.compile('(.*)roberta(.*)')
59 |     matches = r.findall(model_name)
60 |     base_name = 'none'
61 |     if len(matches) > 0:
62 |         base_name = '-'.join(model_name.split('-')[:-1])
63 |
64 |     return base_name
65 |
66 |
67 | ##########################################
68 | #
69 | # Arguments
70 | #
71 | ##########################################
72 |
73 | """Helper function: Define argparser and args."""
74 | parser = argparse.ArgumentParser()
75 | parser.add_argument(
76 |     "--model_name",
77 |     default=None,
78 |     type=str,
79 |     help="Name to save the model as.",
80 | )
81 | parser.add_argument(
82 |     "--output_dir",
83 |     default=None,
84 |     type=str,
85 |     help="The output directory for the trained model.",
86 | )
87 | parser.add_argument(
88 |     "--model_type",
89 |     default=None,
90 |     type=str,
91 |     help="Model type selected in the list from Huggingface ex:"
92 |     " `bert, roberta, xlm-roberta, ...`",
93 | )
94 | parser.add_argument(
95 |     "--model_name_or_path",
96 |     default=None,
97 |     type=str,
98 |     required=True,
99 |     help="Path to pretrained model from huggingface.co/models. "
100 |     "Only tested on `xlm-roberta-base` and `roberta-base`.",
101 | )
102 | parser.add_argument(
103 |     "--logging_dir",
104 |     default=None,
105 |     type=str,
106 |     help="Where logs are stored.",
107 | )
108 | parser.add_argument(
109 |     "--model_max_length",
110 |     default=4096,
111 |     type=int,
112 |     choices=[
113 |         512,
114 |         1024,
115 |         2048,
116 |         4096,
117 |         8192,
118 |         16384,
119 |         32768,
120 |         65536,
121 |         131072,
122 |         262144,
123 |         524288,
124 |         1048576,
125 |     ],
126 |     help="The maximum position of the model",
127 | )
128 | parser.add_argument(
129 |     "--attention_window",
130 |     default=512,
131 |     type=int,
132 |     help="Size of attention window",
133 | )
134 | parser.add_argument(
135 |     "--evaluation_strategy",
136 |     default="no",
137 |     type=str,
138 |     help="How evaluation should be logged, 'steps', 'epochs', 'no'.",
139 | )
140 | parser.add_argument(
141 |     "--do_train",
142 |     action="store_true",
143 |     help="Whether to run training."
144 | )
145 | parser.add_argument(
146 |     "--do_eval",
147 |     action="store_true",
148 |     help="Whether to run eval on the dev set."
149 | ) 150 | parser.add_argument( 151 | "--evaluate_during_training", 152 | action="store_true", 153 | help="Run evaluation during training at each logging step.", 154 | ) 155 | parser.add_argument( 156 | "--per_device_train_batch_size", 157 | default=8, 158 | type=int, 159 | help="Batch size per GPU/CPU for training.", 160 | ) 161 | parser.add_argument( 162 | "--per_device_eval_batch_size", 163 | default=8, 164 | type=int, 165 | help="Batch size per GPU/CPU for evaluation.", 166 | ) 167 | parser.add_argument( 168 | "--learning_rate", 169 | default=5e-5, 170 | type=float, 171 | help="The initial learning rate for Adam.", 172 | ) 173 | parser.add_argument( 174 | "--gradient_accumulation_steps", 175 | type=int, 176 | default=1, 177 | help="Number of gradient updates to perform before updating the weights", 178 | ) 179 | parser.add_argument( 180 | "--weight_decay", 181 | default=0.0, 182 | type=float, 183 | help="Weight decay if we apply some." 184 | ) 185 | parser.add_argument( 186 | "--adam_epsilon", 187 | default=1e-8, 188 | type=float, 189 | help="Epsilon for Adam optimizer." 190 | ) 191 | parser.add_argument( 192 | "--max_grad_norm", default=1.0, type=float, help="Max gradient norm." 193 | ) 194 | parser.add_argument( 195 | "--num_train_epochs", 196 | default=3.0, 197 | type=float, 198 | help="Total number of training epochs to perform.", 199 | ) 200 | parser.add_argument( 201 | "--max_steps", 202 | default=-1, 203 | type=int, 204 | help="If > 0: set total number of training steps to perform. " 205 | "Override num_train_epochs.", 206 | ) 207 | parser.add_argument( 208 | "--warmup_steps", 209 | default=0, 210 | type=int, 211 | help="Linear warmup over warmup_steps." 212 | ) 213 | parser.add_argument( 214 | "--verbose_logging", 215 | action="store_true", 216 | help="If true, log all information when loading datasets.", 217 | ) 218 | parser.add_argument( 219 | "--cache_dir", 220 | default=None, 221 | help="Where do you want to store the pretrained models.", 222 | ) 223 | parser.add_argument( 224 | "--lang_id", 225 | default=0, 226 | type=int, 227 | help="language id of input for language-specific xlm models " 228 | "(see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)", 229 | ) 230 | parser.add_argument( 231 | "--logging_steps", 232 | type=int, 233 | default=500, 234 | help="Log every X updates steps." 
235 | )
236 | parser.add_argument(
237 |     "--save_steps",
238 |     type=int,
239 |     default=500,
240 |     help="Save checkpoint every X updates steps.",
241 | )
242 | parser.add_argument(
243 |     "--eval_all_checkpoints",
244 |     action="store_true",
245 |     help="Evaluate all checkpoints starting with the same prefix as model_name"
246 |     " and ending with the step number",
247 | )
248 | parser.add_argument(
249 |     "--overwrite_output_dir",
250 |     action="store_true",
251 |     help="Overwrite the content of the output directory",
252 | )
253 | parser.add_argument(
254 |     "--seed",
255 |     type=int,
256 |     default=42,
257 |     help="random seed for initialization"
258 | )
259 | parser.add_argument(
260 |     "--local_rank",
261 |     type=int,
262 |     default=-1,
263 |     help="local_rank for distributed training on gpus",
264 | )
265 | parser.add_argument(
266 |     "--fp16",
267 |     action="store_true",
268 |     help="Whether to use 16-bit (mixed) precision (through NVIDIA apex)",
269 | )
270 | parser.add_argument(
271 |     "--fp16_opt_level",
272 |     type=str,
273 |     default="O1",
274 |     help="For fp16: Apex AMP optimization level selected in"
275 |     "['O0', 'O1', 'O2', and 'O3'].",
276 | )
277 | parser.add_argument(
278 |     "--train_file_path",
279 |     type=str,
280 |     default="/workspace/data/wikitext-103/wiki.train.raw",
281 |     help="File path to language model training file",
282 | )
283 | parser.add_argument(
284 |     "--val_file_path",
285 |     type=str,
286 |     default="/workspace/data/wikitext-103/wiki.valid.raw",
287 |     help="File path to language model validation file",
288 | )
289 | parser.add_argument(
290 |     "--eval_steps",
291 |     type=int,
292 |     default=None,
293 |     help="Run evaluation every X update steps",
294 | )
295 |
296 | args = parser.parse_args()
297 |
298 | hf_logging.enable_default_handler()
299 | hf_logging.set_verbosity_info()
300 | hf_logging.enable_explicit_format()
301 |
302 | tb_writer = SummaryWriter(log_dir=args.logging_dir)
303 |
304 | logger = logging.getLogger("")
305 | logger.setLevel(logging.INFO)
306 | fh = logging.FileHandler(f"{args.logging_dir}.log")
307 | sh = logging.StreamHandler(sys.stdout)
308 | formatter = logging.Formatter(
309 |     "[%(asctime)s], %(levelname)s %(message)s",
310 |     datefmt="%a, %d %b %Y %H:%M:%S",
311 | )
312 | fh.setFormatter(formatter)
313 | sh.setFormatter(formatter)
314 | logger.addHandler(fh)
315 | logger.addHandler(sh)
316 | logger.info("\n --> Starting logger:\n" + "=" * 55 + "\n")
317 |
318 | logger.warning(
319 |     f"Process rank: {args.local_rank}, \
320 |     distributed training: {bool(args.local_rank != -1)}, \
321 |     16-bits training: {args.fp16}"
322 | )
323 |
324 |
325 | ##########################################
326 | #
327 | # Replace Huggingface - TextDataset
328 | #
329 | ##########################################
330 |
331 | # https://github.com/tqdm/tqdm/issues/458
332 | def provide_progress_bar(
333 |     function, estimated_time, tstep=0.2, tqdm_kwargs={}, args=[], kwargs={}
334 | ):
335 |     ret = [None]  # Mutable var so the function can store its return value
336 |
337 |     def myrunner(function, ret, *args, **kwargs):
338 |         ret[0] = function(*args, **kwargs)
339 |
340 |     thread = threading.Thread(
341 |         target=myrunner, args=(function, ret) + tuple(args), kwargs=kwargs
342 |     )
343 |     pbar = tqdm.tqdm(total=estimated_time, **tqdm_kwargs)
344 |
345 |     thread.start()
346 |     while thread.is_alive():
347 |         thread.join(timeout=tstep)
348 |         pbar.update(tstep)
349 |     pbar.close()
350 |     return ret[0]
351 |
352 |
353 | def progress_wrapped(estimated_time, tstep=0.2, tqdm_kwargs={}):
354 |     def real_decorator(function):
355 |         @functools.wraps(function)
356 |         def wrapper(*args, **kwargs):
357 |             return provide_progress_bar(
358 |                 function,
359 |                 estimated_time=estimated_time,
360 |                 tstep=tstep,
361 |                 tqdm_kwargs=tqdm_kwargs,
362 |                 args=args,
363 |                 kwargs=kwargs,
364 |             )
365 |
366 |         return wrapper
367 |     return real_decorator
368 |
369 |
370 | class TextDataset(Dataset):
371 |     # Ugly HACK on older transformers
372 |     # Use same code as Huggingface TextDataset
373 |     def __init__(
374 |         self,
375 |         tokenizer: PreTrainedTokenizer,
376 |         file_path: str,
377 |         block_size: int,
378 |         overwrite_cache=False,
379 |         cache_dir: Optional[str] = None,
380 |     ):
381 |         assert os.path.isfile(
382 |             file_path), f"Input file path {file_path} not found"
383 |         block_size = block_size - \
384 |             tokenizer.num_special_tokens_to_add(pair=False)
385 |
386 |         directory, filename = os.path.split(file_path)
387 |         cached_features_file = os.path.join(
388 |             cache_dir if cache_dir is not None else directory,
389 |             "cached_lm_{}_{}_{}".format(
390 |                 tokenizer.__class__.__name__,
391 |                 str(block_size),
392 |                 filename,
393 |             ),
394 |         )
395 |
396 |         # Make sure only the first process in distributed training processes the dataset,
397 |         # and the others will use the cache.
398 |         @progress_wrapped(estimated_time=200)
399 |         def tokenize_text(text):
400 |             return tokenizer.tokenize(text)
401 |
402 |         @progress_wrapped(estimated_time=300)
403 |         def convert_tokens_to_ids(tokenized_text):
404 |             return tokenizer.convert_tokens_to_ids(tokenized_text)
405 |
406 |         if os.path.exists(cached_features_file) and not overwrite_cache:
407 |             start = time.time()
408 |             with open(cached_features_file, "rb") as handle:
409 |                 self.examples = pickle.load(handle)
410 |             logger.info(
411 |                 f"Loading features from cached file {cached_features_file} [took %.3f s]",
412 |                 time.time() - start,
413 |             )
414 |
415 |         else:
416 |             logger.info(
417 |                 f"Creating features from dataset file at {directory}\n\n")
418 |
419 |             self.examples = []
420 |             with open(file_path, encoding="utf-8") as f:
421 |                 text = f.read()
422 |
423 |             # For large texts and models, this could take a long time
424 |             # Done in two steps, since each part can take between 5-10 min
425 |             start = time.time()
426 |             text = tokenize_text(text)
427 |             logger.info("Tokenizing text [took %.3f s]", time.time() - start)
428 |             start = time.time()
429 |             tokenized_text = convert_tokens_to_ids(text)
430 |             logger.info(
431 |                 "Converting text to id [took %.3f s]\n", time.time() - start)
432 |
433 |             start = time.time()
434 |             for i in range(
435 |                 0, len(tokenized_text) - block_size + 1, block_size
436 |             ):  # Truncate in block of block_size
437 |                 self.examples.append(
438 |                     tokenizer.build_inputs_with_special_tokens(
439 |                         tokenized_text[i: i + block_size]
440 |                     )
441 |                 )
442 |             logger.info(
443 |                 "Build tokenizer inputs by block_size length [took %.3f s]",
444 |                 time.time() - start,
445 |             )
446 |
447 |             start = time.time()
448 |             with open(cached_features_file, "wb") as handle:
449 |                 pickle.dump(self.examples, handle,
450 |                             protocol=pickle.HIGHEST_PROTOCOL)
451 |             logger.info(
452 |                 "Saving features into cached file %s [took %.3f s]",
453 |                 cached_features_file,
454 |                 time.time() - start,
455 |             )
456 |
457 |     def __len__(self):
458 |         return len(self.examples)
459 |
460 |     def __getitem__(self, i) -> torch.Tensor:
461 |         return torch.tensor(self.examples[i], dtype=torch.long)
462 |
463 |
464 | ###########################################################
465 | #
466 | # Longformer conversion
467 | #
468 | ###########################################################
469 |
470 | # TODO: Huggingface transformers v. >3.5.1 breaks this
471 | class LongModelSelfAttention(LongformerSelfAttention):
472 |     def forward(
473 |         self,
474 |         hidden_states,
475 |         attention_mask=None,
476 |         head_mask=None,
477 |         encoder_hidden_states=None,
478 |         encoder_attention_mask=None,
479 |         output_attentions=False,
480 |     ):
481 |         # Delegate to LongformerSelfAttention; only the attention mask is passed on.
482 |
483 |         return super().forward(
484 |             hidden_states,
485 |             attention_mask=attention_mask,
486 |         )
487 |
488 |
489 | # Load initial model
490 | MODEL: PreTrainedModel
491 |
492 | if is_roberta_based_model(args.model_name_or_path) == "xlm-roberta":
493 |     MODEL = XLMRobertaForMaskedLM
494 | elif is_roberta_based_model(args.model_name_or_path) == "roberta":
495 |     MODEL = RobertaForMaskedLM
496 | else:
497 |     raise NotImplementedError(
498 |         "Currently only supports roberta-based architectures.")
499 |
500 |
501 | class LongModelForMaskedLM(MODEL):
502 |     def __init__(self, config):
503 |         super().__init__(config)
504 |         print(f"\n{color.YELLOW}Converting models to Longformer is currently only tested for RoBERTa like architectures.{color.END}")
505 |         for i, layer in enumerate(self.roberta.encoder.layer):
506 |             layer.attention.self = LongModelSelfAttention(config, layer_id=i)
507 |
508 |
509 | def create_long_model(
510 |     save_model_to,
511 |     model,
512 |     tokenizer,
513 |     attention_window,
514 |     model_max_length
515 | ):
516 |
517 |     config = model.config
518 |     position_embeddings = model.roberta.embeddings.position_embeddings
519 |
520 |     tokenizer.model_max_length = model_max_length
521 |     tokenizer.init_kwargs['model_max_length'] = model_max_length
522 |     current_model_max_length, embed_size = position_embeddings.weight.shape
523 |
524 |     # NOTE: RoBERTa has positions 0,1 reserved
525 |     # embedding size is max position + 2
526 |     model_max_length += 2
527 |     config.max_position_embeddings = model_max_length
528 |     assert model_max_length > current_model_max_length, \
529 |         "New model max_length must be longer than current max_length"
530 |
531 |     # BUG for XLM: need to initialize with all zeros since the base model is too large
532 |     new_pos_embed = position_embeddings.weight.new_zeros(
533 |         model_max_length, embed_size
534 |     )
535 |
536 |     k = 2
537 |     step = current_model_max_length - 2
538 |     while k < model_max_length - 1:
539 |         new_pos_embed[k:(
540 |             k + step)] = position_embeddings.weight[2:]
541 |         k += step
542 |
543 |     # HACK for Huggingface transformers >=3.4.0 and < 4.0
544 |     # https://github.com/huggingface/transformers/issues/6465#issuecomment-719042969
545 |     position_embeddings.weight.data = new_pos_embed
546 |     model.roberta.embeddings.position_embeddings.num_embeddings = len(
547 |         new_pos_embed.data
548 |     )
549 |     num_model_embeddings = position_embeddings.num_embeddings
550 |     model.roberta.embeddings.position_ids = torch.arange(
551 |         0, num_model_embeddings
552 |     )[None]
553 |
554 |     # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
555 |     config.attention_window = [attention_window] * config.num_hidden_layers
556 |     for i, layer in enumerate(model.roberta.encoder.layer):
557 |         longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
558 |         longformer_self_attn.query = layer.attention.self.query
559 |         longformer_self_attn.key = layer.attention.self.key
560 |         longformer_self_attn.value = layer.attention.self.value
561 |
562 |         longformer_self_attn.query_global = layer.attention.self.query
563 |         longformer_self_attn.key_global = layer.attention.self.key
564 |
longformer_self_attn.value_global = layer.attention.self.value 565 | 566 | layer.attention.self = longformer_self_attn 567 | 568 | logger.info(f'saving model to {save_model_to}') 569 | model.save_pretrained(save_model_to) 570 | tokenizer.save_pretrained(save_model_to) 571 | return model, tokenizer 572 | 573 | 574 | def copy_proj_layers(model): 575 | for _, layer in enumerate(model.roberta.encoder.layer): 576 | layer.attention.self.query_global = layer.attention.self.query 577 | layer.attention.self.key_global = layer.attention.self.key 578 | layer.attention.self.value_global = layer.attention.self.value 579 | return model 580 | 581 | 582 | def pretrain_and_evaluate( 583 | training_args, data_args, model, tokenizer, eval_only, model_path 584 | ): 585 | val_dataset = TextDataset( 586 | tokenizer=tokenizer, 587 | file_path=data_args.val_file_path, 588 | block_size=tokenizer.max_len, 589 | ) 590 | if eval_only: 591 | train_dataset = val_dataset 592 | else: 593 | logger.info( 594 | f"Loading and tokenizing training data is usually slow: {data_args.train_file_path}" 595 | ) 596 | train_dataset = TextDataset( 597 | tokenizer=tokenizer, 598 | file_path=data_args.train_file_path, 599 | block_size=tokenizer.max_len, 600 | ) 601 | 602 | data_collator = DataCollatorForLanguageModeling( 603 | tokenizer=tokenizer, mlm=True, mlm_probability=0.15 604 | ) 605 | 606 | trainer = Trainer( 607 | model=model, 608 | args=training_args, 609 | data_collator=data_collator, 610 | train_dataset=train_dataset, 611 | eval_dataset=val_dataset, 612 | prediction_loss_only=True, 613 | ) 614 | 615 | eval_loss = trainer.evaluate() 616 | eval_loss = eval_loss["eval_loss"] 617 | print(f"Initial eval bpc: {color.GREEN}{eval_loss/math.log(2)}{color.END}") 618 | logger.info(f"Initial eval bpc: {eval_loss/math.log(2)}") 619 | 620 | if not eval_only: 621 | trainer.train(model_path=model_path) 622 | trainer.save_model() 623 | 624 | eval_loss = trainer.evaluate() 625 | eval_loss = eval_loss["eval_loss"] 626 | print( 627 | f"Eval bpc after pretraining: \ 628 | {color.GREEN}{eval_loss/math.log(2)}{color.END}" 629 | ) 630 | logger.info(f"Eval bpc after pretraining: {eval_loss/math.log(2)}") 631 | 632 | 633 | @dataclass 634 | class ModelArguments: 635 | """Huggingface parameters for the model training.""" 636 | 637 | model_name_or_path: str = field( 638 | default=None, 639 | metadata={ 640 | "help": "Name of pretrained model to load for model and tokenizer" 641 | ", based on huggingface.co/models, ex 'roberta-base'" 642 | }, 643 | ) 644 | model_name: str = field( 645 | default="roberta-base-long-4096-lm", 646 | metadata={"help": "Name to use when saving model."}, 647 | ) 648 | attention_window: int = field( 649 | default=512, 650 | metadata={"help": "Size of attention window"} 651 | ) 652 | model_max_length: int = field( 653 | default=4096, 654 | metadata={"help": "Maximum position"} 655 | ) 656 | cache_dir: Optional[str] = field( 657 | default=None, 658 | metadata={ 659 | "help": "Where do you want to store the pretrained models." 
660 |         },
661 |     )
662 |
663 |
664 | @dataclass
665 | class DataTrainingArguments:
666 |     """Training and validation data arguments."""
667 |
668 |     val_file_path: str = field(
669 |         default="/workspace/data/wikitext-103-raw/wiki.valid.raw",
670 |         metadata={"help": "File for evaluating a Language Model"},
671 |     )
672 |     train_file_path: str = field(
673 |         default="/workspace/data/wikitext-103-raw/wiki.train.raw",
674 |         metadata={"help": "File for training a Language Model"},
675 |     )
676 |
677 |
678 | def main():
679 |     ############################################
680 |     #
681 |     # Define model params
682 |     #
683 |     ############################################
684 |
685 |     parser = HfArgumentParser(
686 |         (ModelArguments, DataTrainingArguments, TrainingArguments)
687 |     )
688 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
689 |
690 |     set_seed(training_args.seed)
691 |
692 |     if (
693 |         os.path.exists(training_args.output_dir)
694 |         and os.listdir(training_args.output_dir)
695 |         and training_args.do_train
696 |         and not training_args.overwrite_output_dir
697 |     ):
698 |         raise ValueError(
699 |             f"Output directory ({training_args.output_dir}) \
700 |             already exists and is not empty.\
701 |             Use --overwrite_output_dir to overcome."
702 |         )
703 |
704 |     ###########################################
705 |     #
706 |     # RUN
707 |     #
708 |     ###########################################
709 |
710 |     start = time.time()
711 |     print("---------------------------------------------------------")
712 |     print(
713 |         f"\nLoading from Huggingface pretrained model: \
714 |         `{color.BOLD}{color.GREEN}\
715 |         {model_args.model_name_or_path}\
716 |         {color.END}{color.END}` \
717 |         with name: {model_args.model_name}\n"
718 |     )
719 |
720 |     model = AutoModelForMaskedLM.from_pretrained(
721 |         model_args.model_name_or_path,
722 |         cache_dir=model_args.cache_dir,
723 |     )
724 |     tokenizer = AutoTokenizer.from_pretrained(
725 |         model_args.model_name_or_path,
726 |         model_max_length=model_args.model_max_length,
727 |         cache_dir=model_args.cache_dir,
728 |         use_fast=True,
729 |     )
730 |
731 |     print(f"{color.RED}Creating Longformer model{color.END}")
732 |     model_path = training_args.output_dir
733 |     if not os.path.exists(model_path):
734 |         os.makedirs(model_path)
735 |
736 |     logger.info(
737 |         f"Converting {model_args.model_name_or_path} \
738 |         into {model_args.model_name}"
739 |     )
740 |     model, tokenizer = create_long_model(
741 |         save_model_to=model_path,
742 |         model=model,
743 |         tokenizer=tokenizer,
744 |         attention_window=model_args.attention_window,
745 |         model_max_length=model_args.model_max_length,
746 |     )
747 |
748 |     print(f"{color.RED}Loading Model{color.END}")
749 |     logger.info(f"Loading the model from {model_path}")
750 |     model = LongModelForMaskedLM.from_pretrained(model_path)
751 |     tokenizer = AutoTokenizer.from_pretrained(
752 |         model_path,
753 |         model_max_length=model_args.model_max_length,
754 |         use_fast=True
755 |     )
756 |
757 |     print(f"{color.RED}Evaluate{color.END}")
758 |     logger.info(
759 |         f"Pretraining \
760 |         {model_args.model_name_or_path}-{model_args.model_max_length}... "
761 |     )
762 |     pretrain_and_evaluate(
763 |         training_args,
764 |         data_args,
765 |         model,
766 |         tokenizer,
767 |         eval_only=False,
768 |         model_path=training_args.output_dir,
769 |     )
770 |
771 |     print(
772 |         f"{color.PURPLE}TIME elapsed{color.END}: {datetime.datetime.fromtimestamp(time.time()-start).strftime('%d days, %H:%M:%S')}"
773 |     )
774 |
775 |     logger.info(
776 |         "Copying local projection layers into global projection layers..."
777 |     )
778 |     model = copy_proj_layers(model)
779 |     logger.info(f"Saving model to {model_path}")
780 |     model.save_pretrained(model_path)
781 | 
782 |     print(f"{color.RED}Loading final model{color.END}")
783 | 
784 |     logger.info(f"Loading the model from {model_path}")
785 |     model = LongModelForMaskedLM.from_pretrained(model_path)
786 |     tokenizer = AutoTokenizer.from_pretrained(model_path)
787 | 
788 | 
789 | if __name__ == "__main__":
790 |     main()
791 | 
--------------------------------------------------------------------------------
/scripts/finetune_qa_models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | from __future__ import print_function
4 | import argparse
5 | from collections import Counter
6 | from dataclasses import dataclass, field
7 | import logging
8 | import os
9 | import re
10 | import string
11 | import sys
12 | from typing import Any, Dict, List, Optional, Union
13 | 
14 | import datasets
15 | import torch
16 | from torch.utils.tensorboard import SummaryWriter
17 | from tqdm.auto import tqdm
18 | from transformers import logging as hf_logging
19 | from transformers import (
20 |     AutoModelForQuestionAnswering,
21 |     AutoTokenizer,
22 |     DataCollator,
23 |     HfArgumentParser,
24 |     PreTrainedModel,
25 |     PreTrainedTokenizer,
26 |     set_seed,
27 |     Trainer,
28 |     TrainingArguments,
29 | )
30 | 
31 | 
32 | # helper
33 | class color:
34 |     PURPLE = "\033[95m"
35 |     CYAN = "\033[96m"
36 |     DARKCYAN = "\033[36m"
37 |     BLUE = "\033[94m"
38 |     GREEN = "\033[92m"
39 |     YELLOW = "\033[93m"
40 |     RED = "\033[91m"
41 |     BOLD = "\033[1m"
42 |     UNDERLINE = "\033[4m"
43 |     END = "\033[0m"
44 | 
45 | 
46 | @dataclass
47 | class QADataset:
48 |     """Collection of the languages to load as HF datasets
49 | 
50 |     args:
51 |         - langs: the language codes (or dataset names) to load,
52 |         - text_on_eval: the print statements when evaluating the datasets
53 |         - data: the tokenized datasets
54 |     """
55 |     langs: List[str]
56 |     text_on_eval: List[str]
57 |     data: Optional[List[Any]] = None
58 | 
59 | 
60 | SQUAD = QADataset(
61 |     ["squad"],
62 |     [
63 |         "SQuAD 1.1 validation dataset"
64 |     ]
65 | )
66 | 
67 | 
68 | # base xquad
69 | XQUAD = QADataset(
70 |     ["ar", "de", "el", "en", "es", "hi", "ru", "th", "tr", "vi", "zh", ],
71 |     [
72 |         "XQuAD Arabic validation",
73 |         "XQuAD German validation",
74 |         "XQuAD Greek validation",
75 |         "XQuAD English validation",
76 |         "XQuAD Spanish validation",
77 |         "XQuAD Hindi validation",
78 |         "XQuAD Russian validation",
79 |         "XQuAD Thai validation",
80 |         "XQuAD Turkish validation",
81 |         "XQuAD Vietnamese validation",
82 |         "XQuAD Chinese validation",
83 |     ]
84 | )
85 | 
86 | # base mlqa
87 | MLQA = QADataset(
88 |     ["ar", "de", "en", "es", "hi", "vi", "zh"],
89 |     [
90 |         # one label per entry in `langs` above, in the same order
91 |         "MLQA Arabic validation",
92 |         "MLQA German validation",
93 |         "MLQA English validation",
94 |         "MLQA Spanish validation",
95 |         "MLQA Hindi validation",
96 |         "MLQA Vietnamese validation",
97 |         "MLQA Chinese validation",
98 |     ]
99 | )
100 | 
101 | 
102 | def check_positive_concats(nr_concats):
103 |     """Helper function for argparse
104 |     Instructs how many contexts to concatenate together.
105 |     Default for longer contexts is three.
106 |     More can be used, but then it requires larger GPUs.
107 | 
108 |     *NOTE* this is only used when using the datasets:
109 |     - squad_long or
110 |     - xquad_long
111 |     """
112 |     try:
113 |         nr_concats_int = int(nr_concats)
114 |         if nr_concats_int <= 0:
115 |             raise argparse.ArgumentTypeError(
116 |                 f"--nr_concats expects a positive int as a value, \
117 |                 not {nr_concats}"
118 |             )
119 |     except ValueError as e:
120 |         # Re-raise as an argparse error; falling through here would hit
121 |         # an unbound `nr_concats_int` in the return statement below.
122 |         raise argparse.ArgumentTypeError(
123 |             f"--nr_concats expects an int as a value, not {nr_concats}") from e
124 |     return nr_concats_int
125 | 
126 | 
127 | parser = argparse.ArgumentParser()
128 | parser.add_argument(
129 |     "--nr_concats",
130 |     default=3,
131 |     type=check_positive_concats,
132 |     help="How many contexts to concatenate when using a `long` QA dataset.\n"
133 |     "3 is default and yields an average context length of 2048 tokens",
134 | )
135 | parser.add_argument(
136 |     "--model_name",
137 |     default=None,
138 |     type=str,
139 |     help="Name to save the model as.",
140 | )
141 | parser.add_argument(
142 |     "--output_dir",
143 |     default=None,
144 |     type=str,
145 |     help="The output directory for the model checkpoints and predictions.",
146 | )
147 | parser.add_argument(
148 |     "--model_type",
149 |     default=None,
150 |     type=str,
151 |     help="Model type selected from Huggingface ex: `roberta, xlm-roberta`",
152 | )
153 | parser.add_argument(
154 |     "--model_name_or_path",
155 |     default=None,
156 |     type=str,
157 |     required=True,
158 |     help="Path to pretrained model from huggingface.co/models. \n"
159 |     "Only tested on `xlm-roberta-base` and `roberta-base`.",
160 | )
161 | parser.add_argument(
162 |     "--datasets",
163 |     default=None,
164 |     type=str,
165 |     required=True,
166 |     help="Define one of Huggingface Datasets Question Answering Tasks.",
167 | )
168 | parser.add_argument(
169 |     "--train_file_path",
170 |     default=None,
171 |     type=str,
172 |     help="File path to where torch training file is stored (.pt files).",
173 | )
174 | parser.add_argument(
175 |     "--valid_file_path",
176 |     default=None,
177 |     type=str,
178 |     help="File path to where torch validation file is stored (.pt files).",
179 | )
180 | parser.add_argument(
181 |     "--data_dir",
182 |     default=None,
183 |     type=str,
184 |     help="Directory where the training and validation torch files are stored.",
185 | )
186 | parser.add_argument(
187 |     "--logging_dir",
188 |     default=None,
189 |     type=str,
190 |     help="The output directory where the logs are stored.",
191 | )
192 | parser.add_argument(
193 |     "--max_length",
194 |     default=512,
195 |     type=int,
196 |     choices=[
197 |         512,
198 |         1024,
199 |         2048,
200 |         4096,
201 |     ],
202 |     help="The maximum position of the model",
203 | )
204 | parser.add_argument(
205 |     "--attention_window",
206 |     default=512,
207 |     type=int,
208 |     help="Size of attention window",
209 | )
210 | parser.add_argument(
211 |     "--do_train",
212 |     action="store_true",
213 |     help="Whether to run training."
214 | )
215 | parser.add_argument(
216 |     "--do_eval",
217 |     action="store_true",
218 |     help="Whether to run eval on the dev set."
219 | )
220 | parser.add_argument(
221 |     "--evaluate_during_training",
222 |     action="store_true",
223 |     help="Run evaluation during training at each logging step.",
224 | )
225 | parser.add_argument(
226 |     "--per_device_train_batch_size",
227 |     default=8,
228 |     type=int,
229 |     help="Batch size per GPU/CPU for training.",
230 | )
231 | parser.add_argument(
232 |     "--per_device_eval_batch_size",
233 |     default=8,
234 |     type=int,
235 |     help="Batch size per GPU/CPU for evaluation.",
236 | )
237 | parser.add_argument(
238 |     "--learning_rate",
239 |     default=5e-5,
240 |     type=float,
241 |     help="The initial learning rate for Adam.",
242 | )
243 | parser.add_argument(
244 |     "--gradient_accumulation_steps",
245 |     type=int,
246 |     default=1,
247 |     help="Number of update steps to accumulate the gradient for before updating.",
248 | )
249 | parser.add_argument(
250 |     "--weight_decay",
251 |     default=0.0,
252 |     type=float,
253 |     help="Weight decay to apply, if any."
254 | )
255 | parser.add_argument(
256 |     "--adam_epsilon",
257 |     default=1e-8,
258 |     type=float,
259 |     help="Epsilon for Adam optimizer."
260 | )
261 | parser.add_argument(
262 |     "--max_grad_norm",
263 |     default=1.0,
264 |     type=float,
265 |     help="Max gradient norm."
266 | )
267 | parser.add_argument(
268 |     "--num_train_epochs",
269 |     default=3.0,
270 |     type=float,
271 |     help="Total number of training epochs to perform.",
272 | )
273 | parser.add_argument(
274 |     "--max_steps",
275 |     default=-1,
276 |     type=int,
277 |     help="If > 0: set total number of training steps to perform."
278 |     " Overrides num_train_epochs.",
279 | )
280 | parser.add_argument(
281 |     "--warmup_steps",
282 |     default=0,
283 |     type=int,
284 |     help="Linear warmup over warmup_steps."
285 | )
286 | parser.add_argument(
287 |     "--verbose_logging",
288 |     action="store_true",
289 |     help="If true, display all logging messages from huggingface libraries."
290 |     " A number of warnings are expected for a normal SQuAD evaluation.",
291 | )
292 | parser.add_argument(
293 |     "--lang_id",
294 |     default=0,
295 |     type=int,
296 |     help="language id of input for language-specific xlm models.",
297 | )
298 | parser.add_argument(
299 |     "--logging_steps", type=int, default=500, help="Log every X update steps."
300 | )
301 | parser.add_argument(
302 |     "--save_steps",
303 |     type=int,
304 |     default=500,
305 |     help="Save checkpoint every X update steps.",
306 | )
307 | parser.add_argument(
308 |     "--eval_all_checkpoints",
309 |     action="store_true",
310 |     help="Evaluate all checkpoints starting with the same prefix as model_name",
311 | )
312 | parser.add_argument(
313 |     "--overwrite_output_dir",
314 |     action="store_true",
315 |     help="Overwrite the content of the output directory",
316 | )
317 | parser.add_argument(
318 |     "--seed", type=int, default=42, help="random seed for initialization"
319 | )
320 | parser.add_argument(
321 |     "--local_rank",
322 |     type=int,
323 |     default=-1,
324 |     help="local_rank for distributed training on gpus",
325 | )
326 | parser.add_argument(
327 |     "--fp16",
328 |     action="store_true",
329 |     help="Whether to use 16-bit (mixed) precision (through NVIDIA apex).",
330 | )
331 | parser.add_argument(
332 |     "--fp16_opt_level",
333 |     type=str,
334 |     default="O1",
335 |     help="For fp16: Apex AMP optimization level selected in "
336 |     "['O0', 'O1', 'O2', and 'O3']."
337 | )
338 | parser.add_argument(
339 |     "--prediction_loss_only",
340 |     action="store_true",
341 |     help="If only prediction loss should be returned",
342 | )
343 | parser.add_argument(
344 |     "--eval_steps",
345 |     type=int,
346 |     default=500,
347 |     help="Run an evaluation every X update steps.",
348 | )
349 | parser.add_argument(
350 |     "--do_lowercase",
351 |     action="store_true",
352 |     help="Whether input should be lowercased when tokenizing",
353 | )
354 | # `--cache_dir` is referenced below but was never defined as an argument:
355 | parser.add_argument("--cache_dir", default=None, type=str)
356 | args = parser.parse_args()
357 | 
358 | hf_logging.enable_default_handler()
359 | hf_logging.set_verbosity_info()
360 | hf_logging.enable_explicit_format()
361 | 
362 | # Setup logging
363 | tb_writer = SummaryWriter(log_dir=args.logging_dir)
364 | 
365 | logger = logging.getLogger("")
366 | logger.setLevel(logging.INFO)
367 | 
368 | fh = logging.FileHandler(f"{args.logging_dir}.log")
369 | sh = logging.StreamHandler(sys.stdout)
370 | formatter = logging.Formatter(
371 |     "[%(asctime)s], %(levelname)s %(message)s",
372 |     datefmt="%a, %d %b %Y %H:%M:%S",
373 | )
374 | fh.setFormatter(formatter)
375 | sh.setFormatter(formatter)
376 | logger.addHandler(fh)
377 | logger.addHandler(sh)
378 | logger.info("\n --> Starting logger:\n" + "=" * 55 + "\n")
379 | 
380 | logger.warning(
381 |     f"Process rank: {args.local_rank}, \
382 |     distributed training: {bool(args.local_rank != -1)}, \
383 |     16-bits training: {args.fp16}"
384 | )
385 | 
386 | 
387 | logger.info("=" * 50)
388 | logger.info("=" + "\t" * 6 + " =")
389 | logger.info("=" + "\tInitialization" + "\t" * 4 + " =")
390 | logger.info("=" + "\t" * 6 + " =")
391 | logger.info("=" * 50 + "\n\n")
392 | 
393 | 
394 | tokenizer = AutoTokenizer.from_pretrained(
395 |     args.model_name_or_path,
396 |     cache_dir=args.cache_dir,
397 |     do_lowercase=args.do_lowercase,
398 |     pad_to_max_length=True,
399 |     max_length=args.max_length,
400 |     truncation=True,
401 |     use_fast=True,
402 | )
403 | model = AutoModelForQuestionAnswering.from_pretrained(
404 |     args.model_name_or_path,
405 |     cache_dir=args.cache_dir,
406 | )
407 | 
408 | 
409 | #########################################
410 | #                                       #
411 | #       SQuADs Evaluation metrics       #
412 | #                                       #
413 | #########################################
414 | 
415 | def normalize_answer(s: str) -> str:
416 |     """Lower text and remove punctuation, articles and extra whitespace."""
417 | 
418 |     def remove_articles(text):
419 |         return re.sub(r"\b(a|an|the)\b", " ", text)
420 | 
421 |     def white_space_fix(text):
422 |         return " ".join(text.split())
423 | 
424 |     def remove_punc(text):
425 |         exclude = set(string.punctuation)
426 |         return "".join(ch for ch in text if ch not in exclude)
427 | 
428 |     def lower(text):
429 |         return text.lower()
430 | 
431 |     return white_space_fix(remove_articles(remove_punc(lower(s))))
432 | 
433 | 
434 | def f1_score(prediction, ground_truth):
435 |     prediction_tokens = normalize_answer(prediction).split()
436 |     ground_truth_tokens = normalize_answer(ground_truth).split()
437 |     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
438 |     num_same = sum(common.values())
439 |     if num_same == 0:
440 |         return 0
441 |     precision = 1.0 * num_same / len(prediction_tokens)
442 |     recall = 1.0 * num_same / len(ground_truth_tokens)
443 |     f1 = (2 * precision * recall) / (precision + recall)
444 |     return f1
445 | 
446 | 
447 | def exact_match_score(prediction: str, ground_truth: str) -> bool:
448 |     return normalize_answer(prediction) == normalize_answer(ground_truth)
449 | 
450 | 
451 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
452 |     scores_for_ground_truths = []
453 |     for ground_truth in ground_truths:
454 |         score = metric_fn(prediction, ground_truth)
455 |         scores_for_ground_truths.append(score)
456 |     return max(scores_for_ground_truths)
457 | 
458 | 
459 | def evaluate(
460 |     gold_answers: List[List[str]],
461 |     predictions: List[str]
462 | ) -> Dict[str, float]:
463 | 
464 |     f1 = exact_match = total = 0
465 | 
466 |     for ground_truths, prediction in zip(gold_answers, predictions):
467 |         total += 1
468 |         exact_match += metric_max_over_ground_truths(
469 |             exact_match_score, prediction, ground_truths
470 |         )
471 |         f1 += metric_max_over_ground_truths(f1_score,
472 |                                             prediction, ground_truths)
473 | 
474 |     exact_match = 100.0 * exact_match / total
475 |     f1 = 100.0 * f1 / total
476 | 
477 |     return {"exact_match": exact_match, "f1": f1}
478 | 
479 | 
480 | ####################################################
481 | #
482 | # Evaluation
483 | #
484 | ####################################################
485 | 
486 | 
487 | def get_squad_evaluation(
488 |     valid_dataset: DataCollator,
489 |     model: PreTrainedModel,
490 |     tokenizer: PreTrainedTokenizer,
491 |     dataset_name: str,
492 |     batch_size: int
493 | ) -> None:
494 |     """
495 |     Makes predictions with the trained model and evaluates them
496 |     using the SQuAD evaluation metric.
497 |     valid_dataset is expected to have been converted to torch tensor format.
498 |     """
499 | 
500 |     logging.info(f"Generating predictions for dataset '{dataset_name}'")
501 |     dataloader = torch.utils.data.DataLoader(
502 |         valid_dataset, batch_size=batch_size)
503 | 
504 |     # predictions
505 |     predicted_answers = []
506 |     with torch.no_grad():
507 |         for batch in tqdm(dataloader):
508 |             start_scores, end_scores = model(
509 |                 input_ids=batch["input_ids"].cuda(),
510 |                 attention_mask=batch["attention_mask"].cuda(),
511 |             )
512 |             for i in range(start_scores.shape[0]):
513 |                 all_tokens = tokenizer.convert_ids_to_tokens(
514 |                     batch["input_ids"][i])
515 |                 answer = " ".join(
516 |                     all_tokens[
517 |                         torch.argmax(start_scores[i]):
518 |                         torch.argmax(end_scores[i]) + 1
519 |                     ]
520 |                 )
521 |                 ans_ids = tokenizer.convert_tokens_to_ids(answer.split())
522 |                 answer = tokenizer.decode(ans_ids)
523 |                 predicted_answers.append(answer)
524 | 
525 |     valid_dataset.reset_format()
526 |     predictions = []
527 |     references = []
528 |     for ref, pred_answer in zip(valid_dataset, predicted_answers):
529 |         actual_answer = ref["answers"]["text"]
530 |         predictions.append(pred_answer)
531 |         references.append(actual_answer)
532 | 
533 |     eval_score = evaluate(references, predictions)
534 |     logging.info(f"Results from prediction:\n{eval_score}\n" + "=" * 55 + "\n")
535 | 
536 | 
537 | #########################################
538 | #                                       #
539 | # Convert train and validation datasets #
540 | #                                       #
541 | #########################################
542 | 
543 | def get_correct_alignment(context: str, answer):
544 |     """Some original examples in SQuAD have indices wrong by 1 or 2 characters.
545 |     """
546 |     gold_text = answer["text"][0]
547 |     start_idx = answer["answer_start"][0]
548 |     end_idx = start_idx + len(gold_text)
549 |     if context[start_idx:end_idx] == gold_text:
550 |         return start_idx, end_idx
551 |     elif context[start_idx - 1: end_idx - 1] == gold_text:
552 |         return start_idx - 1, end_idx - 1
553 |     elif context[start_idx - 2: end_idx - 2] == gold_text:
554 |         return start_idx - 2, end_idx - 2
555 |     else:
556 |         raise ValueError()
557 | 
558 | 
559 | MAX_CONTEXT_LENGTH = 0
560 | 
561 | 
562 | def convert_to_features(example):
563 | 
564 |     encodings = tokenizer.encode_plus(
565 |         example["question"],
566 |         example["context"],
567 |         pad_to_max_length=True,
568 |         max_length=args.max_length,
569 |         truncation=True,
570 |     )
571 |     context_encodings = tokenizer.encode_plus(example["context"])
572 | 
573 |     start_idx, end_idx = get_correct_alignment(
574 |         example["context"], example["answers"])
575 |     start_positions_context = context_encodings.char_to_token(start_idx)
576 |     end_positions_context = context_encodings.char_to_token(end_idx - 1)
577 | 
578 |     # FIXME: UGLY HACK because of XLM-R tokenization, works fine with roberta
579 |     sep_idx = encodings["input_ids"].index(tokenizer.sep_token_id)
580 |     try:
581 |         start_positions = start_positions_context + sep_idx + 1
582 |         end_positions = end_positions_context + sep_idx + 1
583 | 
584 |         # if end_positions > 4096:
585 |         #    start_positions, end_positions = None, None
586 |     except TypeError:  # char_to_token returned None; answer not in the input
587 |         start_positions = None
588 |         end_positions = None
589 | 
590 |     encodings.update(
591 |         {
592 |             "start_positions": start_positions,
593 |             "end_positions": end_positions,
594 |             "attention_mask": encodings["attention_mask"],
595 |         }
596 |     )
597 |     return encodings
598 | 
599 | 
600 | def convert_dataset_to_torch_format(data):
601 |     data = data.map(convert_to_features).filter(
602 |         lambda example: (example["start_positions"] is not None)
603 |         and (example["end_positions"] is not None)
604 |     )
605 | 
606 |     # set the tensor type and the columns which the dataset should return
607 |     columns = ["input_ids", "attention_mask",
608 |                "start_positions", "end_positions"]
609 |     data.set_format(type="torch", columns=columns)
610 |     print(max(data["start_positions"]))
611 |     print(data.shape)
612 |     return data
613 | 
614 | 
615 | ##################
616 | #
617 | # Training
618 | #
619 | ##################
620 | 
621 | 
622 | class DummyDataCollator:
623 |     def __call__(self, batch):
624 | 
625 |         input_ids = torch.stack([example["input_ids"] for example in batch])
626 |         attention_mask = torch.stack(
627 |             [example["attention_mask"] for example in batch])
628 |         start_positions = torch.stack(
629 |             [example["start_positions"] for example in batch])
630 |         end_positions = torch.stack(
631 |             [example["end_positions"] for example in batch])
632 | 
633 |         return {
634 |             "input_ids": input_ids,
635 |             "start_positions": start_positions,
636 |             "end_positions": end_positions,
637 |             "attention_mask": attention_mask,
638 |         }
639 | 
640 | 
641 | @dataclass
642 | class ModelArguments:
643 | 
644 |     model_name_or_path: str = field(
645 |         metadata={
646 |             "help": "Path to pretrained model or model identifier"
647 |         }
648 |     )
649 |     tokenizer_name: Optional[str] = field(
650 |         default=None,
651 |         metadata={
652 |             "help": "Pretrained tokenizer name or path"
653 |         },
654 |     )
655 |     cache_dir: Optional[str] = field(
656 |         default=None,
657 |         metadata={
658 |             "help": "Where do you want to store the pretrained models"
659 |         },
660 |     )
661 |     do_lowercase: bool = field(
662 |         default=False,
"If tokenizer should make all to lowercase."}, 664 | ) 665 | max_seq_length: Optional[int] = field( 666 | default=384, 667 | metadata={"help": "TODO"}, 668 | ) 669 | doc_stride: Optional[int] = field( 670 | default=128, 671 | metadata={"help": "TODO"}, 672 | ) 673 | model_type: Optional[str] = field( 674 | default=None, 675 | metadata={"help": "TODO"}, 676 | ) 677 | 678 | 679 | @dataclass 680 | class DataTrainingArguments: 681 | 682 | datasets: str = field(metadata={"help": "The dataset name to load."}) 683 | data_dir: Optional[str] = field( 684 | default=None, 685 | metadata={ 686 | "help": "Path to the dataset containing train and eval datasets."}, 687 | ) 688 | train_file_path: Optional[str] = field( 689 | default="train_data.pt", 690 | metadata={"help": "Path for cached train dataset"}, 691 | ) 692 | valid_file_path: Optional[str] = field( 693 | default="valid_data.pt", 694 | metadata={"help": "Path for cached valid dataset"}, 695 | ) 696 | max_length: Optional[int] = field( 697 | default=512, 698 | metadata={"help": "Max input length for the source text"}, 699 | ) 700 | nr_concats: Optional[int] = field( 701 | default=3, 702 | metadata={"help": "Number of contexts to concatinate"}, 703 | ) 704 | 705 | 706 | def load_datasets( 707 | languages: QADataset, 708 | base_dataset: str = None, 709 | concatinate: bool = False, 710 | split: str = 'validation', 711 | ): 712 | """Loads a Huggingface dataset based on the `base` dataset 713 | (squad/xquad/mlqa).""" 714 | 715 | dataset: List[Any] = [] 716 | 717 | data: List 718 | dataset: str 719 | for lang in languages.langs: 720 | if base_dataset is not None: 721 | dataset = f"{base_dataset}.{lang}" 722 | if base_dataset == "mlqa": 723 | dataset = f"{dataset}.{lang}" 724 | 725 | data = datasets.load_dataset(base_dataset, dataset, split=split) 726 | else: 727 | data = datasets.load_dataset(lang, split=split) 728 | 729 | if concatinate: 730 | data = concatinate_squad_data(data, args.nr_concats) 731 | data = convert_dataset_to_torch_format(data) 732 | dataset.append(data) 733 | 734 | return dataset 735 | 736 | 737 | def concatinate_squad_data(d, span=3): 738 | """ 739 | Concatinate "SPAN" number of SQuAD questions together 740 | """ 741 | 742 | def get_span(index, span): 743 | """ 744 | Returns the value in a range for whole numbers 745 | 746 | Ex: index=4, span=5 747 | lower=0, upper=5 748 | 749 | index=5, span=5 750 | lower=0, upper=5 751 | 752 | index=8, span=5 753 | lower=5, upper=10 754 | """ 755 | lower_bound = (index) // span 756 | lower_bound = lower_bound * span 757 | upper_bound = lower_bound + span 758 | return lower_bound, upper_bound 759 | 760 | def set_start_pos(example, idx): 761 | """ 762 | Get correct new starting position when concatinating SQuAD datasets 763 | """ 764 | low, high = get_span(idx, span) 765 | 766 | # Get new starting position 767 | prev_start = 0 768 | if idx != low: 769 | prev_start = len("".join(data["context"][low:idx])) 770 | 771 | start_pos = data["answers"][idx]["answer_start"][0] 772 | if not isinstance(start_pos, int): 773 | start_pos = start_pos[0] 774 | new_start = [prev_start + start_pos] 775 | example["answers"]["answer_start"] = new_start 776 | return example 777 | 778 | def set_context(example, idx): 779 | """ 780 | Concatinate "SPAN" number of SQuAD samples 781 | """ 782 | low, high = get_span(idx, span) 783 | 784 | # Get new context 785 | example["context"] = "".join(data["context"][low:high]) 786 | return example 787 | 788 | # Filters out questions using the same context but different questions 789 | 
789 |     data = d.filter(
790 |         lambda example, idx: example["context"] != d["context"][idx - 1],
791 |         with_indices=True,
792 |     )
793 | 
794 |     data = data.map(
795 |         lambda example, idx: set_start_pos(example, idx),
796 |         with_indices=True
797 |     )
798 |     data = data.map(
799 |         lambda example, idx: set_context(example, idx),
800 |         with_indices=True
801 |     )
802 | 
803 |     return data
804 | 
805 | 
806 | #################################################################
807 | #
808 | # Main function
809 | #
810 | #################################################################
811 | 
812 | 
813 | def main():
814 | 
815 |     parser = HfArgumentParser(
816 |         (ModelArguments, DataTrainingArguments, TrainingArguments)
817 |     )
818 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
819 | 
820 |     # Needed for getting eval_loss for QA in transformer v. 3.0.2 and 4.0.0
821 |     training_args.label_names = ["start_positions", "end_positions"]
822 | 
823 |     if (
824 |         os.path.exists(training_args.output_dir)
825 |         and os.listdir(training_args.output_dir)
826 |         and training_args.do_train
827 |         and not training_args.overwrite_output_dir
828 |     ):
829 |         raise ValueError(
830 |             f"Output directory ({training_args.output_dir}) \
831 |             already exists and is not empty. \
832 |             Use --overwrite_output_dir to overcome."
833 |         )
834 | 
835 |     if data_args.data_dir is None:
836 |         data_args.data_dir = "."
837 | 
838 |     if data_args.train_file_path is None or data_args.valid_file_path is None:
839 |         data_args.train_file_path = f"{data_args.data_dir}/train_data.pt"
840 |         data_args.valid_file_path = f"{data_args.data_dir}/val_data.pt"
841 | 
842 |     logger.info(f"Model parameters set:\n{model_args}")
843 |     logging.info(f"Logging to file: {training_args.logging_dir}.log")
844 | 
845 |     set_seed(training_args.seed)
846 | 
847 |     tokenizer = AutoTokenizer.from_pretrained(
848 |         model_args.model_name_or_path,
849 |         cache_dir=model_args.cache_dir,
850 |         do_lowercase=args.do_lowercase,
851 |         pad_to_max_length=True,
852 |         max_length=args.max_length,
853 |         truncation=True,
854 |         use_fast=True,
855 |     )
856 | 
857 |     model = AutoModelForQuestionAnswering.from_pretrained(
858 |         model_args.model_name_or_path,
859 |         cache_dir=model_args.cache_dir,
860 |     )
861 | 
862 |     if data_args.datasets == "xquad":
863 |         XQUAD.data = load_datasets(XQUAD, base_dataset="xquad")
864 | 
865 |     if data_args.datasets == "mlqa":
866 |         MLQA.data = load_datasets(MLQA, base_dataset="mlqa")
867 | 
868 |     if data_args.datasets == "tydiqa":
869 |         raise ValueError("Not yet implemented")
870 | 
871 |     if data_args.datasets == "xquad_long":
872 |         XQUAD.data = load_datasets(XQUAD, "xquad", concatenate=True)
873 | 
874 |     if data_args.datasets in ["squad_long", "xquad_long"]:
875 |         train_dataset = load_datasets(
876 |             SQUAD, split="train", concatenate=True)[0]
877 |         valid_dataset = load_datasets(SQUAD, concatenate=True)[0]
878 |         SQUAD.data = valid_dataset
879 | 
880 |     if data_args.datasets in ["xquad", "mlqa", "squad"]:
881 |         train_dataset = load_datasets(
882 |             SQUAD, split="train", concatenate=True)[0]
883 |         valid_dataset = load_datasets(SQUAD, concatenate=True)[0]
884 |         SQUAD.data = valid_dataset
885 | 
886 |     torch.save(train_dataset, data_args.train_file_path)
887 |     torch.save(valid_dataset, data_args.valid_file_path)
888 | 
889 |     train_dataset = torch.load(data_args.train_file_path)
890 |     valid_dataset = torch.load(data_args.valid_file_path)
891 | 
892 |     ####################################
893 |     #
894 |     # Train the model
895 |     #
896 |     ####################################
897 | 
898 |     # Build the trainer unconditionally: it is also needed for --do_eval.
899 | 
900 |     trainer = Trainer(
901 |         model=model,
902 |         args=training_args,
903 |         train_dataset=train_dataset,
904 |         eval_dataset=valid_dataset,
905 |         data_collator=DummyDataCollator(),
906 |         prediction_loss_only=True,
907 |     )
908 | 
909 |     if training_args.do_train:
910 |         trainer.train(
911 |             model_path=model_args.model_name_or_path
912 |             if os.path.isdir(model_args.model_name_or_path)
913 |             else None
914 |         )
915 |         trainer.save_model()
916 |         if trainer.is_world_process_zero():
917 |             tokenizer.save_pretrained(training_args.output_dir)
918 | 
919 |     results = {}
920 |     if training_args.do_eval and training_args.local_rank in [-1, 0]:
921 |         logger.info("*** Evaluation ***")
922 | 
923 |         eval_output = trainer.evaluate()
924 |         output_eval_file = os.path.join(
925 |             training_args.output_dir, "eval_results.txt"
926 |         )
927 |         print("\n==========================================\n")
928 |         print("Eval output: ", eval_output)
929 |         print("\n==========================================\n")
930 | 
931 |         with open(output_eval_file, "w") as writer:
932 |             logger.info("***** Eval results *****")
933 |             for key in sorted(eval_output.keys()):
934 |                 logger.info("  %s = %s", key, str(eval_output[key]))
935 |                 writer.write("%s = %s\n" % (key, str(eval_output[key])))
936 |                 print(key, str(eval_output[key]))
937 | 
938 |         results.update(eval_output)
939 | 
940 |     logging.info("=" * 45)
941 |     logging.info("Results from evaluation:")
942 |     logging.info(results)
943 |     logging.info("\n")
944 | 
945 |     logging.info("=" * 45)
946 | 
947 |     ####################################
948 |     #
949 |     # Evaluate the trained model
950 |     #
951 |     ####################################
952 | 
953 |     if training_args.do_train:
954 |         tokenizer = AutoTokenizer.from_pretrained(
955 |             training_args.output_dir,
956 |             use_fast=True,
957 |             do_lowercase=args.do_lowercase
958 |         )
959 |         model = AutoModelForQuestionAnswering.from_pretrained(
960 |             training_args.output_dir,
961 |         )
962 |     else:
963 |         try:
964 |             model_path = training_args.output_dir
965 |             tokenizer = AutoTokenizer.from_pretrained(
966 |                 training_args.output_dir,
967 |                 use_fast=True,
968 |                 do_lowercase=args.do_lowercase
969 |             )
970 |             model = AutoModelForQuestionAnswering.from_pretrained(
971 |                 training_args.output_dir,
972 |             )
973 |         except OSError:  # no checkpoint in output_dir; fall back to the base model
974 |             model_path = model_args.model_name_or_path
975 |             tokenizer = AutoTokenizer.from_pretrained(
976 |                 model_path, use_fast=True, do_lowercase=args.do_lowercase
977 |             )
978 |             model = AutoModelForQuestionAnswering.from_pretrained(
979 |                 model_path
980 |             )
981 | 
982 |     model = model.cuda()
983 |     model.eval()
984 | 
985 |     get_squad_evaluation(
986 |         SQUAD.data,
987 |         model,
988 |         tokenizer,
989 |         SQUAD.text_on_eval[0],
990 |         training_args.per_device_eval_batch_size,
991 |     )
992 |     if data_args.datasets == "xquad" or data_args.datasets == "xquad_long":
993 |         for i, _ in enumerate(XQUAD.langs):
994 |             get_squad_evaluation(
995 |                 XQUAD.data[i],
996 |                 model,
997 |                 tokenizer,
998 |                 XQUAD.text_on_eval[i],
999 |                 training_args.per_device_eval_batch_size,
1000 |             )
1001 |     elif data_args.datasets == "mlqa":
1002 |         for i, _ in enumerate(MLQA.langs):
1003 |             get_squad_evaluation(
1004 |                 MLQA.data[i],
1005 |                 model,
1006 |                 tokenizer,
1007 |                 MLQA.text_on_eval[i],
1008 |                 training_args.per_device_eval_batch_size,
1009 |             )
1010 | 
1011 |     elif data_args.datasets == "trivia_qa":
1012 |         pass
1013 | 
1014 |     else:
1015 |         print("Not a valid eval dataset...\n Exiting")
1016 | 
1017 | 
1018 | if __name__ == "__main__":
1019 |     main()
1020 | 
--------------------------------------------------------------------------------
/notebooks/Convert to Long.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import logging\n",
10 |     "import os\n",
11 |     "import math\n",
12 |     "from dataclasses import dataclass, field\n",
13 |     "\n",
14 |     "import copy # for deep copy\n",
15 |     "\n",
16 |     "import torch\n",
17 |     "from torch import nn\n",
18 |     "from transformers import RobertaForMaskedLM, RobertaTokenizerFast, TextDataset, DataCollatorForLanguageModeling, Trainer\n",
19 |     "from transformers import TrainingArguments, HfArgumentParser\n",
20 |     "from transformers.modeling_longformer import LongformerSelfAttention\n",
21 |     "\n",
22 |     "logger = logging.getLogger(__name__)\n",
23 |     "logging.basicConfig(level=logging.INFO)"
24 |    ]
25 |   },
26 |   {
27 |    "cell_type": "code",
28 |    "execution_count": 22,
29 |    "metadata": {},
30 |    "outputs": [],
31 |    "source": [
32 |     "class RobertaLongSelfAttention(LongformerSelfAttention): \n",
33 |     "    def forward(\n",
34 |     "        self,\n",
35 |     "        hidden_states, attention_mask=None, is_index_masked=None, is_index_global_attn=None, is_global_attn=None\n",
36 |     "    ):\n",
37 |     "        return super().forward(hidden_states, attention_mask=attention_mask)\n",
38 |     "\n",
39 |     "class RobertaLongForMaskedLM(RobertaForMaskedLM):\n",
40 |     "    def __init__(self, config):\n",
41 |     "        super().__init__(config)\n",
42 |     "        for i, layer in enumerate(self.roberta.encoder.layer):\n",
43 |     "            # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`\n",
44 |     "            layer.attention.self = RobertaLongSelfAttention(config, layer_id=i)"
45 |    ]
46 |   },
47 |   {
48 |    "cell_type": "code",
49 |    "execution_count": 23,
50 |    "metadata": {},
51 |    "outputs": [],
52 |    "source": [
53 |     "def create_long_model(save_model_to, attention_window, max_pos):\n",
54 |     "    model = RobertaForMaskedLM.from_pretrained('roberta-base')\n",
55 |     "    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=max_pos)\n",
56 |     "    config = model.config\n",
57 |     "\n",
58 |     "    # extend position embeddings\n",
59 |     "    tokenizer.model_max_length = max_pos\n",
60 |     "    tokenizer.init_kwargs['model_max_length'] = max_pos\n",
61 |     "    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape\n",
62 |     "    max_pos += 2 # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2\n",
63 |     "    config.max_position_embeddings = max_pos\n",
64 |     "    assert max_pos > current_max_pos\n",
65 |     "    # allocate a larger position embedding matrix\n",
66 |     "    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(max_pos, embed_size)\n",
67 |     "    # copy position embeddings over and over to initialize the new position embeddings\n",
68 |     "    k = 2\n",
69 |     "    step = current_max_pos - 2\n",
70 |     "    while k < max_pos - 1:\n",
71 |     "        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]\n",
72 |     "        k += step\n",
73 |     "    \n",
74 |     "    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed\n",
75 |     "    model.roberta.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)\n",
76 |     "    \"\"\"\n",
77 |     "    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed # add after this line\n",
78 |     "    model.roberta.embeddings.position_embeddings.num_embeddings = len(new_pos_embed.data)\n",
79 |     "    # first, check that model.roberta.embeddings.position_embeddings.weight.data.shape is correct — has to be 4096 (default) or your desired length\n",
80 |     "    model.roberta.embeddings.position_ids = torch.arange(0, model.roberta.embeddings.position_embeddings.num_embeddings)[None]\n",
81 |     "    \"\"\"\n",
82 |     "    \n",
83 |     "    \n",
84 |     "    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`\n",
85 |     "    config.attention_window = [attention_window] * config.num_hidden_layers\n",
86 |     "    for i, layer in enumerate(model.roberta.encoder.layer):\n",
87 |     "        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)\n",
88 |     "        longformer_self_attn.query = copy.deepcopy(layer.attention.self.query)\n",
89 |     "        longformer_self_attn.key = copy.deepcopy(layer.attention.self.key)\n",
90 |     "        longformer_self_attn.value = copy.deepcopy(layer.attention.self.value)\n",
91 |     "\n",
92 |     "        longformer_self_attn.query_global = copy.deepcopy(layer.attention.self.query)\n",
93 |     "        longformer_self_attn.key_global = copy.deepcopy(layer.attention.self.key)\n",
94 |     "        longformer_self_attn.value_global = copy.deepcopy(layer.attention.self.value)\n",
95 |     "\n",
96 |     "        \"\"\"\n",
97 |     "        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)\n",
98 |     "        longformer_self_attn.query = layer.attention.self.query\n",
99 |     "        longformer_self_attn.key = layer.attention.self.key\n",
100 |     "        longformer_self_attn.value = layer.attention.self.value\n",
101 |     "\n",
102 |     "        longformer_self_attn.query_global = layer.attention.self.query\n",
103 |     "        longformer_self_attn.key_global = layer.attention.self.key\n",
104 |     "        longformer_self_attn.value_global = layer.attention.self.value\n",
105 |     "        \"\"\"\n",
106 |     "\n",
107 |     "        layer.attention.self = longformer_self_attn\n",
108 |     "\n",
109 |     "    logger.info(f'saving model to {save_model_to}')\n",
110 |     "    model.save_pretrained(save_model_to)\n",
111 |     "    tokenizer.save_pretrained(save_model_to)\n",
112 |     "    return model, tokenizer"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 24,
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "def copy_proj_layers(model):\n",
122 |     "    for i, layer in enumerate(model.roberta.encoder.layer):\n",
123 |     "        layer.attention.self.query_global = layer.attention.self.query\n",
124 |     "        layer.attention.self.key_global = layer.attention.self.key\n",
125 |     "        layer.attention.self.value_global = layer.attention.self.value\n",
126 |     "    return model"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 25,
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):\n",
136 |     "    val_dataset = TextDataset(tokenizer=tokenizer,\n",
137 |     "                              file_path=args.val_datapath,\n",
138 |     "                              block_size=tokenizer.max_len)\n",
139 |     "    if eval_only:\n",
140 |     "        train_dataset = val_dataset\n",
141 |     "    else:\n",
142 |     "        logger.info(f'Loading and tokenizing training data is usually slow: {args.train_datapath}')\n",
143 |     "        train_dataset = TextDataset(tokenizer=tokenizer,\n",
144 |     "                                    file_path=args.train_datapath,\n",
145 |     "                                    block_size=tokenizer.max_len)\n",
146 |     "\n",
147 |     "    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)\n",
148 |     "    trainer = Trainer(model=model, args=args, data_collator=data_collator,\n",
149 |     "                      train_dataset=train_dataset, eval_dataset=val_dataset, prediction_loss_only=True)\n",
150 |     "\n",
151 |     "    eval_loss = trainer.evaluate()\n",
152 |     "    eval_loss = eval_loss['eval_loss']\n",
153 |     "    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')\n",
154 |     "    \n",
155 |     "    if not eval_only:\n",
156 |     "        trainer.train(model_path=model_path)\n",
157 |     "        trainer.save_model()\n",
158 |     "\n",
159 |     "    eval_loss = trainer.evaluate()\n",
160 |     "    eval_loss = eval_loss['eval_loss']\n",
161 |     "    logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": 26,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "@dataclass\n",
171 |     "class ModelArgs:\n",
172 |     "    attention_window: int = field(default=512, metadata={\"help\": \"Size of attention window\"})\n",
173 |     "    max_pos: int = field(default=4096, metadata={\"help\": \"Maximum position\"})\n",
174 |     "\n",
175 |     "parser = HfArgumentParser((TrainingArguments, ModelArgs,))\n",
176 |     "\n",
177 |     "\n",
178 |     "training_args, model_args = parser.parse_args_into_dataclasses(look_for_args_file=False, args=[\n",
179 |     "    '--output_dir', 'tmp',\n",
180 |     "    '--warmup_steps', '500',\n",
181 |     "    '--learning_rate', '0.00003',\n",
182 |     "    '--weight_decay', '0.01',\n",
183 |     "    '--adam_epsilon', '1e-6',\n",
184 |     "    '--max_steps', '3000',\n",
185 |     "    '--logging_steps', '500',\n",
186 |     "    '--save_steps', '500',\n",
187 |     "    '--max_grad_norm', '5.0',\n",
188 |     "    '--per_gpu_eval_batch_size', '8',\n",
189 |     "    '--per_gpu_train_batch_size', '2', # 32GB gpu with fp32\n",
190 |     "    '--gradient_accumulation_steps', '32',\n",
191 |     "    '--evaluate_during_training',\n",
192 |     "    '--do_train',\n",
193 |     "    '--do_eval',\n",
194 |     "])\n",
195 |     "training_args.val_datapath = '/workspace/data/wikitext-103-raw/wiki.valid.raw'\n",
196 |     "training_args.train_datapath = '/workspace/data/wikitext-103-raw/wiki.train.raw'\n",
197 |     "\n",
198 |     "# Choose GPU\n",
199 |     "import os\n",
200 |     "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": 27,
206 |    "metadata": {},
207 |    "outputs": [
208 |     {
209 |      "name": "stderr",
210 |      "output_type": "stream",
211 |      "text": [
212 |       "Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']\n",
213 |       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
214 |       "INFO:__main__:Evaluating roberta-base (seqlen: 512) for refernece ...\n",
215 |       "INFO:filelock:Lock 140125418510600 acquired on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_510_wiki.valid.raw.lock\n",
216 |       "INFO:filelock:Lock 140125418510600 released on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_510_wiki.valid.raw.lock\n",
217 |       "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.\n"
218 |      ]
219 |     },
220 |     {
221 |      "data": {
222 |       "text/html": [
223 |        "\n",
224 |        "
21 | **Please Note**:
22 | Running the following project is quite computationally expensive. Pre-training requires a Docker container with at least 90GB of RAM allocated and a CUDA-enabled GPU with 48GB of memory!
23 | 
24 | For fine-tuning on QA tasks, 32GB of RAM is sufficient, and a smaller GPU can be used when fine-tuning on regular or multilingual SQuAD. However, the datasets created with a longer context require at least 32GB of RAM.
25 | 
26 | 
31 | The following applications and libraries need to be installed in order to run the application:
32 | - [Docker](https://docs.docker.com/get-docker/)
33 | - [Docker Compose](https://docs.docker.com/compose/install/)
34 | - Miniconda or Anaconda with Python3
35 | - make (terminal command)
36 | - wget (terminal command)
37 | - unzip (terminal command)
38 | - tmux (terminal command)
39 | - CUDA-enabled GPU (check that it is set up correctly by entering `nvidia-smi` in your terminal)
40 | - [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html) installed and linked to your Docker container (Needed if encountering error: ```ERROR: for XXX_markussagen_repl1 Cannot create container for service repl: Unknown runtime specified nvidia```)
41 | 
42 |
139 |
140 |
142 |
143 | ##### RoBERTa
144 |
145 | export SEED=42
146 | export MAX_LENGTH=4096
147 | export MODEL_DIR=/workspace/models
148 | export MODEL_NAME_OR_PATH=roberta-base
149 | export MODEL_NAME=$MODEL_NAME_OR_PATH-long
150 | export DATA_DIR=/workspace/data
151 | export LOG_DIR=/workspace/logs
152 |
153 | make repl run="scripts/run_long_lm.py \
154 | --model_name_or_path $MODEL_NAME_OR_PATH \
155 | --model_name $MODEL_NAME \
156 | --output_dir $MODEL_DIR/$MODEL_NAME \
157 | --logging_dir $LOG_DIR/$MODEL_NAME \
158 | --val_file_path $DATA_DIR/wiki.valid.raw \
159 | --train_file_path $DATA_DIR/wiki.train.raw \
160 | --seed $SEED \
161 | --model_max_length $MAX_LENGTH \
162 | --adam_epsilon 1e-8 \
163 | --warmup_steps 500 \
164 | --learning_rate 3e-5 \
165 | --weight_decay 0.01 \
166 | --max_steps 6000 \
167 | --evaluate_during_training \
168 | --logging_steps 50 \
169 | --eval_steps 50 \
170 | --save_steps 500 \
171 | --max_grad_norm 1.0 \
172 | --per_device_eval_batch_size 2 \
173 | --per_device_train_batch_size 1 \
174 | --gradient_accumulation_steps 64 \
175 | --overwrite_output_dir \
176 | --fp16 \
177 | --do_train \
178 | --do_eval
179 | "
180 |
181 |
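Note that the "eval bpc" printed by `scripts/run_long_lm.py` is simply the masked-LM cross-entropy loss (in nats) divided by ln(2), i.e. converted to bits. A minimal sketch of the conversion (the loss value below is hypothetical):

import math

eval_loss = 1.85  # hypothetical value returned by trainer.evaluate()["eval_loss"]
print(f"eval bpc: {eval_loss / math.log(2):.3f}")  # -> eval bpc: 2.669
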
182 | ##### XLM-R
183 |
184 | export SEED=42
185 | export MAX_LENGTH=4096
186 | export MODEL_DIR=/workspace/models
187 | export MODEL_NAME_OR_PATH=xlm-roberta-base
188 | export MODEL_NAME=$MODEL_NAME_OR_PATH-long
189 | export DATA_DIR=/workspace/data
190 | export LOG_DIR=/workspace/logs
191 |
192 | make repl run="scripts/run_long_lm.py \
193 | --model_name_or_path $MODEL_NAME_OR_PATH \
194 | --model_name $MODEL_NAME \
195 | --output_dir $MODEL_DIR/$MODEL_NAME \
196 | --logging_dir $LOG_DIR/$MODEL_NAME \
197 | --val_file_path $DATA_DIR/wiki.valid.raw \
198 | --train_file_path $DATA_DIR/wiki.train.raw \
199 | --seed $SEED \
200 | --model_max_length $MAX_LENGTH \
201 | --adam_epsilon 1e-8 \
202 | --warmup_steps 500 \
203 | --learning_rate 3e-5 \
204 | --weight_decay 0.01 \
205 | --max_steps 6000 \
206 | --evaluate_during_training \
207 | --logging_steps 50 \
208 | --eval_steps 50 \
209 | --save_steps 500 \
210 | --max_grad_norm 1.0 \
211 | --per_device_eval_batch_size 2 \
212 | --per_device_train_batch_size 1 \
213 | --gradient_accumulation_steps 64 \
214 | --overwrite_output_dir \
215 | --fp16 \
216 | --do_train \
217 | --do_eval
218 | "
219 |
220 | Wikitext-103
141 |
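Before moving on to fine-tuning, the converted checkpoint can be sanity-checked. A minimal sketch, assuming the `roberta-base-long` output directory from the commands above:

from transformers import AutoConfig, AutoTokenizer

model_path = "/workspace/models/roberta-base-long"  # output dir from the commands above
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# RoBERTa reserves positions 0 and 1, so this should be 4096 + 2
print(config.max_position_embeddings)
print(tokenizer.model_max_length)  # should be 4096
print(config.attention_window)     # one window per layer, e.g. [512, 512, ...]
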
273 |
274 | The fine-tuning runs are grouped by dataset, language, and context length, and each model is then evaluated on every group (see the evaluation sketch below).
275 |
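All runs are scored with the standard SQuAD metrics (exact match and token-level F1) implemented in `scripts/finetune_qa_models.py`. A toy invocation, assuming the metric functions from that script are in scope (the answers below are made up for illustration):

# Each question may have several acceptable gold answers; predictions hold
# exactly one answer string per question.
gold_answers = [["Uppsala University"], ["in 1477", "1477"]]
predictions = ["uppsala university", "1477"]

# normalize_answer() lowercases and strips punctuation/articles, so this
# returns {'exact_match': 100.0, 'f1': 100.0} for the toy data above.
print(evaluate(gold_answers, predictions))
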
276 | ### English
277 |
278 |
280 |
281 | ##### RoBERTa
282 |
283 | export SEED=42
284 | export DATASET=squad
285 | export MODEL_DIR=/workspace/models
286 | export MODEL_NAME_OR_PATH=roberta-base
287 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
288 | export LOG_DIR=/workspace/logs
289 | export DATA_DIR=/workspace/data
290 | # Debugging
291 | CUDA_LAUNCH_BLOCKING=1
292 | # model args
293 | make repl run="scripts/finetune_qa_models.py \
294 | --model_name_or_path $MODEL_NAME_OR_PATH \
295 | --output_dir $MODEL_DIR/$MODEL_NAME \
296 | --logging_dir $LOG_DIR/$MODEL_NAME \
297 | --dataset $DATASET \
298 | --data_dir $DATA_DIR \
299 | --seed $SEED \
300 | --num_train_epochs 3 \
301 | --learning_rate 3e-5 \
302 | --logging_steps 50 \
303 | --eval_steps 50 \
304 | --save_steps 1000 \
305 | --per_device_train_batch_size 4 \
306 | --per_device_eval_batch_size 32 \
307 | --gradient_accumulation_steps 8 \
308 | --overwrite_output_dir \
309 | --evaluate_during_training \
310 | --fp16 \
311 | --do_train \
312 | --do_eval \
313 | --do_lowercase \
314 | --max_length 512 \
315 | "
316 |
317 |
318 | ##### Longformer
319 |
320 | export SEED=42
321 | export DATASET=squad
322 | export MODEL_DIR=/workspace/models
323 | export MODEL_NAME_OR_PATH=allenai/longformer-base-4096
324 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
325 | export LOG_DIR=/workspace/logs
326 | export DATA_DIR=/workspace/data
327 | # Debugging
328 | CUDA_LAUNCH_BLOCKING=1
329 | # model args
330 | make repl run="scripts/finetune_qa_models.py \
331 | --model_name_or_path $MODEL_NAME_OR_PATH \
332 | --output_dir $MODEL_DIR/$MODEL_NAME \
333 | --logging_dir $LOG_DIR/$MODEL_NAME \
334 | --dataset $DATASET \
335 | --data_dir $DATA_DIR \
336 | --seed $SEED \
337 | --num_train_epochs 3 \
338 | --learning_rate 3e-5 \
339 | --logging_steps 50 \
340 | --eval_steps 50 \
341 | --save_steps 1000 \
342 | --per_device_train_batch_size 4 \
343 | --per_device_eval_batch_size 32 \
344 | --gradient_accumulation_steps 8 \
345 | --overwrite_output_dir \
346 | --evaluate_during_training \
347 | --fp16 \
348 | --do_train \
349 | --do_eval \
350 | --do_lowercase \
351 | --max_length 512 \
352 | "
353 |
354 |
355 | ##### RoBERTa-Long
356 |
357 | export SEED=42
358 | export DATASET=squad
359 | export MODEL_DIR=/workspace/models
360 | export MODEL_NAME_OR_PATH=$MODEL_DIR/roberta-base-long
361 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
362 | export LOG_DIR=/workspace/logs
363 | export DATA_DIR=/workspace/data
364 | # Debugging
365 | CUDA_LAUNCH_BLOCKING=1
366 | # model args
367 | make repl run="scripts/finetune_qa_models.py \
368 | --model_name_or_path $MODEL_NAME_OR_PATH \
369 | --output_dir $MODEL_DIR/$MODEL_NAME \
370 | --logging_dir $LOG_DIR/$MODEL_NAME \
371 | --dataset $DATASET \
372 | --data_dir $DATA_DIR \
373 | --seed $SEED \
374 | --num_train_epochs 3 \
375 | --learning_rate 3e-5 \
376 | --logging_steps 50 \
377 | --eval_steps 50 \
378 | --save_steps 1000 \
379 | --per_device_train_batch_size 4 \
380 | --per_device_eval_batch_size 32 \
381 | --gradient_accumulation_steps 8 \
382 | --overwrite_output_dir \
383 | --evaluate_during_training \
384 | --fp16 \
385 | --do_train \
386 | --do_eval \
387 | --do_lowercase \
388 | --max_length 512 \
389 | "
390 |
391 |
392 | ##### XLM-R
393 |
394 | export SEED=42
395 | export DATASET=squad
396 | export MODEL_DIR=/workspace/models
397 | export MODEL_NAME_OR_PATH=xlm-roberta-base
398 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
399 | export LOG_DIR=/workspace/logs
400 | export DATA_DIR=/workspace/data
401 | # Debugging
402 | CUDA_LAUNCH_BLOCKING=1
403 | # model args
404 | make repl run="scripts/finetune_qa_models.py \
405 | --model_name_or_path $MODEL_NAME_OR_PATH \
406 | --output_dir $MODEL_DIR/$MODEL_NAME \
407 | --logging_dir $LOG_DIR/$MODEL_NAME \
408 | --dataset $DATASET \
409 | --data_dir $DATA_DIR \
410 | --seed $SEED \
411 | --num_train_epochs 3 \
412 | --learning_rate 3e-5 \
413 | --logging_steps 50 \
414 | --eval_steps 50 \
415 | --save_steps 1000 \
416 | --per_device_train_batch_size 4 \
417 | --per_device_eval_batch_size 32 \
418 | --gradient_accumulation_steps 8 \
419 | --overwrite_output_dir \
420 | --evaluate_during_training \
421 | --fp16 \
422 | --do_train \
423 | --do_eval \
424 | --do_lowercase \
425 | --max_length 512 \
426 | "
427 |
428 |
429 | ##### XLM-Long
430 |
431 | export SEED=42
432 | export DATASET=squad
433 | export MODEL_DIR=/workspace/models
434 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
435 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
436 | export LOG_DIR=/workspace/logs
437 | export DATA_DIR=/workspace/data
438 | # Debugging
439 | CUDA_LAUNCH_BLOCKING=1
440 | # model args
441 | make repl run="scripts/finetune_qa_models.py \
442 | --model_name_or_path $MODEL_NAME_OR_PATH \
443 | --output_dir $MODEL_DIR/$MODEL_NAME \
444 | --logging_dir $LOG_DIR/$MODEL_NAME \
445 | --dataset $DATASET \
446 | --data_dir $DATA_DIR \
447 | --seed $SEED \
448 | --num_train_epochs 3 \
449 | --learning_rate 3e-5 \
450 | --logging_steps 50 \
451 | --eval_steps 50 \
452 | --save_steps 1000 \
453 | --per_device_train_batch_size 4 \
454 | --per_device_eval_batch_size 32 \
455 | --gradient_accumulation_steps 8 \
456 | --overwrite_output_dir \
457 | --evaluate_during_training \
458 | --fp16 \
459 | --do_train \
460 | --do_eval \
461 | --do_lowercase \
462 | --max_length 512 \
463 | "
464 |
465 |
466 |
471 |
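The `squad_long` (and `xquad_long`) datasets are built on the fly: `--nr_concats` consecutive SQuAD contexts are joined into one long context, and the answer start positions are shifted accordingly. The windowing helper inside `concatenate_squad_data` in `scripts/finetune_qa_models.py` reduces to the following (shown here as a standalone sketch):

def get_span(index, span):
    # Maps an example index to the [low, high) window of examples whose
    # contexts are concatenated into one long context.
    low = (index // span) * span
    return low, low + span

# With --nr_concats 3: examples 0-2 share one context, 3-5 the next, ...
print(get_span(4, 3))  # -> (3, 6)
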
472 | ##### RoBERTa
473 |
474 | export SEED=42
475 | export MAX_LENGTH=512
476 | export NR_CONCATS=1
477 | export DATASET=squad_long
478 | export MODEL_DIR=/workspace/models
479 | export MODEL_NAME_OR_PATH=roberta-base
480 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
481 | export LOG_DIR=/workspace/logs
482 | export DATA_DIR=/workspace/data
483 | # Debugging
484 | CUDA_LAUNCH_BLOCKING=1
485 | # model args
486 | make repl run="scripts/finetune_qa_models.py \
487 | --model_name_or_path $MODEL_NAME_OR_PATH \
488 | --output_dir $MODEL_DIR/$MODEL_NAME \
489 | --logging_dir $LOG_DIR/$MODEL_NAME \
490 | --dataset $DATASET \
491 | --data_dir $DATA_DIR \
492 | --seed $SEED \
493 | --num_train_epochs 3 \
494 | --learning_rate 3e-5 \
495 | --logging_steps 50 \
496 | --eval_steps 50 \
497 | --save_steps 1000 \
498 | --per_device_train_batch_size 4 \
499 | --per_device_eval_batch_size 32 \
500 | --gradient_accumulation_steps 8 \
501 | --overwrite_output_dir \
502 | --evaluate_during_training \
503 | --fp16 \
504 | --do_train \
505 | --do_eval \
506 | --do_lowercase \
507 | --nr_concats $NR_CONCATS \
508 | --max_length $MAX_LENGTH \
509 | "
510 |
511 | ##### Longformer
512 |
513 | export SEED=42
514 | export MAX_LENGTH=512
515 | export NR_CONCATS=1
516 | export DATASET=squad_long
517 | export MODEL_DIR=/workspace/models
518 | export MODEL_NAME_OR_PATH=allenai/longformer-base-4096
519 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
520 | export LOG_DIR=/workspace/logs
521 | export DATA_DIR=/workspace/data
522 | # Debugging
523 | CUDA_LAUNCH_BLOCKING=1
524 | # model args
525 | make repl run="scripts/finetune_qa_models.py \
526 | --model_name_or_path $MODEL_NAME_OR_PATH \
527 | --output_dir $MODEL_DIR/$MODEL_NAME \
528 | --logging_dir $LOG_DIR/$MODEL_NAME \
529 | --dataset $DATASET \
530 | --data_dir $DATA_DIR \
531 | --seed $SEED \
532 | --num_train_epochs 3 \
533 | --learning_rate 3e-5 \
534 | --logging_steps 50 \
535 | --eval_steps 50 \
536 | --save_steps 1000 \
537 | --per_device_train_batch_size 4 \
538 | --per_device_eval_batch_size 32 \
539 | --gradient_accumulation_steps 8 \
540 | --overwrite_output_dir \
541 | --evaluate_during_training \
542 | --fp16 \
543 | --do_train \
544 | --do_eval \
545 | --do_lowercase \
546 | --nr_concats $NR_CONCATS \
547 | --max_length $MAX_LENGTH \
548 | "
549 |
550 |
551 | ##### RoBERTa-Long
552 |
553 | export SEED=42
554 | export MAX_LENGTH=512
555 | export NR_CONCATS=1
556 | export DATASET=squad_long
557 | export MODEL_DIR=/workspace/models
558 | export MODEL_NAME_OR_PATH=$MODEL_DIR/roberta-base-long
559 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
560 | export LOG_DIR=/workspace/logs
561 | export DATA_DIR=/workspace/data
562 | # Debugging
563 | CUDA_LAUNCH_BLOCKING=1
564 | # model args
565 | make repl run="scripts/finetune_qa_models.py \
566 | --model_name_or_path $MODEL_NAME_OR_PATH \
567 | --output_dir $MODEL_DIR/$MODEL_NAME \
568 | --logging_dir $LOG_DIR/$MODEL_NAME \
569 | --dataset $DATASET \
570 | --data_dir $DATA_DIR \
571 | --seed $SEED \
572 | --num_train_epochs 3 \
573 | --learning_rate 3e-5 \
574 | --logging_steps 50 \
575 | --eval_steps 50 \
576 | --save_steps 1000 \
577 | --per_device_train_batch_size 4 \
578 | --per_device_eval_batch_size 32 \
579 | --gradient_accumulation_steps 8 \
580 | --overwrite_output_dir \
581 | --evaluate_during_training \
582 | --fp16 \
583 | --do_train \
584 | --do_eval \
585 | --do_lowercase \
586 | --nr_concats $NR_CONCATS \
587 | --max_length $MAX_LENGTH \
588 | "
589 |
590 | ##### XLM-R
591 |
592 | export SEED=42
593 | export MAX_LENGTH=512
594 | export NR_CONCATS=1
595 | export DATASET=squad_long
596 | export MODEL_DIR=/workspace/models
597 | export MODEL_NAME_OR_PATH=xlm-roberta-base
598 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
599 | export LOG_DIR=/workspace/logs
600 | export DATA_DIR=/workspace/data
601 | # Debugging
602 | CUDA_LAUNCH_BLOCKING=1
603 | # model args
604 | make repl run="scripts/finetune_qa_models.py \
605 | --model_name_or_path $MODEL_NAME_OR_PATH \
606 | --output_dir $MODEL_DIR/$MODEL_NAME \
607 | --logging_dir $LOG_DIR/$MODEL_NAME \
608 | --dataset $DATASET \
609 | --data_dir $DATA_DIR \
610 | --seed $SEED \
611 | --num_train_epochs 3 \
612 | --learning_rate 3e-5 \
613 | --logging_steps 50 \
614 | --eval_steps 50 \
615 | --save_steps 1000 \
616 | --per_device_train_batch_size 4 \
617 | --per_device_eval_batch_size 32 \
618 | --gradient_accumulation_steps 8 \
619 | --overwrite_output_dir \
620 | --evaluate_during_training \
621 | --fp16 \
622 | --do_train \
623 | --do_eval \
624 | --do_lowercase \
625 | --nr_concats $NR_CONCATS \
626 | --max_length $MAX_LENGTH \
627 | "
628 |
629 | ##### XLM-Long
630 |
631 | export SEED=42
632 | export MAX_LENGTH=512
633 | export NR_CONCATS=1
634 | export DATASET=squad_long
635 | export MODEL_DIR=/workspace/models
636 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
637 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
638 | export LOG_DIR=/workspace/logs
639 | export DATA_DIR=/workspace/data
640 | # Debugging
641 | CUDA_LAUNCH_BLOCKING=1
642 | # model args
643 | make repl run="scripts/finetune_qa_models.py \
644 | --model_name_or_path $MODEL_NAME_OR_PATH \
645 | --output_dir $MODEL_DIR/$MODEL_NAME \
646 | --logging_dir $LOG_DIR/$MODEL_NAME \
647 | --dataset $DATASET \
648 | --data_dir $DATA_DIR \
649 | --seed $SEED \
650 | --num_train_epochs 3 \
651 | --learning_rate 3e-5 \
652 | --logging_steps 50 \
653 | --eval_steps 50 \
654 | --save_steps 1000 \
655 | --per_device_train_batch_size 4 \
656 | --per_device_eval_batch_size 32 \
657 | --gradient_accumulation_steps 8 \
658 | --overwrite_output_dir \
659 | --evaluate_during_training \
660 | --fp16 \
661 | --do_train \
662 | --do_eval \
663 | --do_lowercase \
664 | --nr_concats $NR_CONCATS \
665 | --max_length $MAX_LENGTH \
666 | "
667 |
668 |
669 |
674 |
675 | ##### Longformer
676 |
677 | export SEED=42
678 | export MAX_LENGTH=2048
679 | export NR_CONCATS=3
680 | export DATASET=squad_long
681 | export MODEL_DIR=/workspace/models
682 | export MODEL_NAME_OR_PATH=allenai/longformer-base-4096
683 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
684 | export LOG_DIR=/workspace/logs
685 | export DATA_DIR=/workspace/data
686 | # Debugging
687 | CUDA_LAUNCH_BLOCKING=1
688 | # model args
689 | make repl run="scripts/finetune_qa_models.py \
690 | --model_name_or_path $MODEL_NAME_OR_PATH \
691 | --output_dir $MODEL_DIR/$MODEL_NAME \
692 | --logging_dir $LOG_DIR/$MODEL_NAME \
693 | --dataset $DATASET \
694 | --data_dir $DATA_DIR \
695 | --seed $SEED \
696 | --num_train_epochs 3 \
697 | --learning_rate 3e-5 \
698 | --logging_steps 50 \
699 | --eval_steps 50 \
700 | --save_steps 1000 \
701 | --per_device_train_batch_size 1 \
702 | --per_device_eval_batch_size 32 \
703 | --gradient_accumulation_steps 32 \
704 | --overwrite_output_dir \
705 | --evaluate_during_training \
706 | --fp16 \
707 | --do_train \
708 | --do_eval \
709 | --do_lowercase \
710 | --nr_concats $NR_CONCATS \
711 | --max_length $MAX_LENGTH \
712 | "
713 |
714 |
715 | ##### RoBERTa-Long
716 |
717 | export SEED=42
718 | export MAX_LENGTH=2048
719 | export NR_CONCATS=3
720 | export DATASET=squad_long
721 | export MODEL_DIR=/workspace/models
722 | export MODEL_NAME_OR_PATH=$MODEL_DIR/roberta-base-long
723 | export MODEL_NAME=$(basename $MODEL_NAME_OR_PATH)-seed-$SEED-on-$DATASET
724 | export LOG_DIR=/workspace/logs
725 | export DATA_DIR=/workspace/data
726 | # Debugging
727 | export CUDA_LAUNCH_BLOCKING=1
728 | # model args
729 | make repl run="scripts/finetune_qa_models.py \
730 | --model_name_or_path $MODEL_NAME_OR_PATH \
731 | --output_dir $MODEL_DIR/$MODEL_NAME \
732 | --logging_dir $LOG_DIR/$MODEL_NAME \
733 | --dataset $DATASET \
734 | --data_dir $DATA_DIR \
735 | --seed $SEED \
736 | --num_train_epochs 3 \
737 | --learning_rate 3e-5 \
738 | --logging_steps 50 \
739 | --eval_steps 50 \
740 | --save_steps 1000 \
741 | --per_device_train_batch_size 1 \
742 | --per_device_eval_batch_size 32 \
743 | --gradient_accumulation_steps 32 \
744 | --overwrite_output_dir \
745 | --evaluate_during_training \
746 | --fp16 \
747 | --do_train \
748 | --do_eval \
749 | --do_lowercase \
750 | --nr_concats $NR_CONCATS \
751 | --max_length $MAX_LENGTH \
752 | "
753 |
754 |
755 | ##### XLM-Long
756 |
757 | export SEED=42
758 | export MAX_LENGTH=2048
759 | export NR_CONCATS=3
760 | export DATASET=squad_long
761 | export MODEL_DIR=/workspace/models
762 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
763 | export MODEL_NAME=$(basename $MODEL_NAME_OR_PATH)-seed-$SEED-on-$DATASET
764 | export LOG_DIR=/workspace/logs
765 | export DATA_DIR=/workspace/data
766 | # Debugging
767 | export CUDA_LAUNCH_BLOCKING=1
768 | # model args
769 | make repl run="scripts/finetune_qa_models.py \
770 | --model_name_or_path $MODEL_NAME_OR_PATH \
771 | --output_dir $MODEL_DIR/$MODEL_NAME \
772 | --logging_dir $LOG_DIR/$MODEL_NAME \
773 | --dataset $DATASET \
774 | --data_dir $DATA_DIR \
775 | --seed $SEED \
776 | --num_train_epochs 3 \
777 | --learning_rate 3e-5 \
778 | --logging_steps 50 \
779 | --eval_steps 50 \
780 | --save_steps 1000 \
781 | --per_device_train_batch_size 1 \
782 | --per_device_eval_batch_size 32 \
783 | --gradient_accumulation_steps 32 \
784 | --overwrite_output_dir \
785 | --evaluate_during_training \
786 | --fp16 \
787 | --do_train \
788 | --do_eval \
789 | --do_lowercase \
790 | --nr_concats $NR_CONCATS \
791 | --max_length $MAX_LENGTH \
792 | "
793 |
794 |
795 |
799 | ### TODO TriviaQA (4096)
800 |
805 | ### XQuAD
806 |
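XQuAD consists of professional translations of a subset of the SQuAD v1.1 development set into ten additional languages, so it is an evaluation benchmark; the `--do_train` runs below presumably fine-tune on the English data and evaluate across languages (the exact split handling lives in `scripts/finetune_qa_models.py`).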
807 | ##### RoBERTa
808 |
809 | export SEED=42
810 | export DATASET=xquad
811 | export MODEL_DIR=/workspace/models
812 | export MODEL_NAME_OR_PATH=roberta-base
813 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
814 | export LOG_DIR=/workspace/logs
815 | export DATA_DIR=/workspace/data
816 | # Debugging
817 | export CUDA_LAUNCH_BLOCKING=1
818 | # model args
819 | make repl run="scripts/finetune_qa_models.py \
820 | --model_name_or_path $MODEL_NAME_OR_PATH \
821 | --output_dir $MODEL_DIR/$MODEL_NAME \
822 | --logging_dir $LOG_DIR/$MODEL_NAME \
823 | --dataset $DATASET \
824 | --data_dir $DATA_DIR \
825 | --seed $SEED \
826 | --num_train_epochs 3 \
827 | --learning_rate 3e-5 \
828 | --logging_steps 50 \
829 | --eval_steps 50 \
830 | --save_steps 1000 \
831 | --per_device_train_batch_size 4 \
832 | --per_device_eval_batch_size 32 \
833 | --gradient_accumulation_steps 8 \
834 | --overwrite_output_dir \
835 | --evaluate_during_training \
836 | --fp16 \
837 | --do_train \
838 | --do_eval \
839 | --do_lowercase \
840 | --max_length 512 \
841 | "
842 |
843 | ##### XLM-R
844 |
845 | export SEED=42
846 | export DATASET=xquad
847 | export MODEL_DIR=/workspace/models
848 | export MODEL_NAME_OR_PATH=xlm-roberta-base
849 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
850 | export LOG_DIR=/workspace/logs
851 | export DATA_DIR=/workspace/data
852 | # Debugging
853 | export CUDA_LAUNCH_BLOCKING=1
854 | # model args
855 | make repl run="scripts/finetune_qa_models.py \
856 | --model_name_or_path $MODEL_NAME_OR_PATH \
857 | --output_dir $MODEL_DIR/$MODEL_NAME \
858 | --logging_dir $LOG_DIR/$MODEL_NAME \
859 | --dataset $DATASET \
860 | --data_dir $DATA_DIR \
861 | --seed $SEED \
862 | --num_train_epochs 3 \
863 | --learning_rate 3e-5 \
864 | --logging_steps 50 \
865 | --eval_steps 50 \
866 | --save_steps 1000 \
867 | --per_device_train_batch_size 4 \
868 | --per_device_eval_batch_size 32 \
869 | --gradient_accumulation_steps 8 \
870 | --overwrite_output_dir \
871 | --evaluate_during_training \
872 | --fp16 \
873 | --do_train \
874 | --do_eval \
875 | --do_lowercase \
876 | --max_length 512 \
877 | "
878 |
879 | ##### XLM-Long
880 |
881 | export SEED=42
882 | export DATASET=xquad
883 | export MODEL_DIR=/workspace/models
884 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
885 | export MODEL_NAME=$(basename $MODEL_NAME_OR_PATH)-seed-$SEED-on-$DATASET
886 | export LOG_DIR=/workspace/logs
887 | export DATA_DIR=/workspace/data
888 | # Debugging
889 | export CUDA_LAUNCH_BLOCKING=1
890 | # model args
891 | make repl run="scripts/finetune_qa_models.py \
892 | --model_name_or_path $MODEL_NAME_OR_PATH \
893 | --output_dir $MODEL_DIR/$MODEL_NAME \
894 | --logging_dir $LOG_DIR/$MODEL_NAME \
895 | --dataset $DATASET \
896 | --data_dir $DATA_DIR \
897 | --seed $SEED \
898 | --num_train_epochs 3 \
899 | --learning_rate 3e-5 \
900 | --logging_steps 50 \
901 | --eval_steps 50 \
902 | --save_steps 1000 \
903 | --per_device_train_batch_size 4 \
904 | --per_device_eval_batch_size 32 \
905 | --gradient_accumulation_steps 8 \
906 | --overwrite_output_dir \
907 | --evaluate_during_training \
908 | --fp16 \
909 | --do_train \
910 | --do_eval \
911 | --do_lowercase \
912 | --max_length 512 \
913 | "
914 |
915 |
916 |
920 | #### XQ3 (512)
921 |
922 | ##### XLM-R
923 |
924 | export SEED=42
925 | export MAX_LENGTH=512
926 | export NR_CONCATS=1
927 | export DATASET=xquad_long
928 | export MODEL_DIR=/workspace/models
929 | export MODEL_NAME_OR_PATH=xlm-roberta-base
930 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
931 | export LOG_DIR=/workspace/logs
932 | export DATA_DIR=/workspace/data
933 | # Debugging
934 | export CUDA_LAUNCH_BLOCKING=1
935 | # model args
936 | make repl run="scripts/finetune_qa_models.py \
937 | --model_name_or_path $MODEL_NAME_OR_PATH \
938 | --output_dir $MODEL_DIR/$MODEL_NAME \
939 | --logging_dir $LOG_DIR/$MODEL_NAME \
940 | --dataset $DATASET \
941 | --data_dir $DATA_DIR \
942 | --seed $SEED \
943 | --num_train_epochs 3 \
944 | --learning_rate 3e-5 \
945 | --logging_steps 50 \
946 | --eval_steps 50 \
947 | --save_steps 1000 \
948 | --per_device_train_batch_size 4 \
949 | --per_device_eval_batch_size 32 \
950 | --gradient_accumulation_steps 8 \
951 | --overwrite_output_dir \
952 | --evaluate_during_training \
953 | --fp16 \
954 | --do_train \
955 | --do_eval \
956 | --do_lowercase \
957 | --nr_concats $NR_CONCATS \
958 | --max_length $MAX_LENGTH \
959 | "
960 |
961 |
962 | ##### XLM-Long
963 |
964 | export SEED=42
965 | export MAX_LENGTH=512
966 | export NR_CONCATS=1
967 | export DATASET=xquad_long
968 | export MODEL_DIR=/workspace/models
969 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
970 | export MODEL_NAME=$(basename $MODEL_NAME_OR_PATH)-seed-$SEED-on-$DATASET
971 | export LOG_DIR=/workspace/logs
972 | export DATA_DIR=/workspace/data
973 | # Debugging
974 | export CUDA_LAUNCH_BLOCKING=1
975 | # model args
976 | make repl run="scripts/finetune_qa_models.py \
977 | --model_name_or_path $MODEL_NAME_OR_PATH \
978 | --output_dir $MODEL_DIR/$MODEL_NAME \
979 | --logging_dir $LOG_DIR/$MODEL_NAME \
980 | --dataset $DATASET \
981 | --data_dir $DATA_DIR \
982 | --seed $SEED \
983 | --num_train_epochs 3 \
984 | --learning_rate 3e-5 \
985 | --logging_steps 50 \
986 | --eval_steps 50 \
987 | --save_steps 1000 \
988 | --per_device_train_batch_size 4 \
989 | --per_device_eval_batch_size 32 \
990 | --gradient_accumulation_steps 8 \
991 | --overwrite_output_dir \
992 | --evaluate_during_training \
993 | --fp16 \
994 | --do_train \
995 | --do_eval \
996 | --do_lowercase \
997 | --nr_concats $NR_CONCATS \
998 | --max_length $MAX_LENGTH \
999 | "
1000 |
1001 |
1002 |
1006 | #### XQ3 (4096)
1007 |
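Only XLM-Long is listed here: the stock `xlm-roberta-base` checkpoint has 512 position embeddings, so 2048-token inputs require the converted long-context model.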
1008 | ##### XLM-Long
1009 |
1010 | export SEED=42
1011 | export MAX_LENGTH=2048
1012 | export NR_CONCATS=3
1013 | export DATASET=xquad_long
1014 | export MODEL_DIR=/workspace/models
1015 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
1016 | export MODEL_NAME=$(basename $MODEL_NAME_OR_PATH)-seed-$SEED-on-$DATASET
1017 | export LOG_DIR=/workspace/logs
1018 | export DATA_DIR=/workspace/data
1019 | # Debugging
1020 | export CUDA_LAUNCH_BLOCKING=1
1021 | # model args
1022 | make repl run="scripts/finetune_qa_models.py \
1023 | --model_name_or_path $MODEL_NAME_OR_PATH \
1024 | --output_dir $MODEL_DIR/$MODEL_NAME \
1025 | --logging_dir $LOG_DIR/$MODEL_NAME \
1026 | --dataset $DATASET \
1027 | --data_dir $DATA_DIR \
1028 | --seed $SEED \
1029 | --num_train_epochs 3 \
1030 | --learning_rate 3e-5 \
1031 | --logging_steps 50 \
1032 | --eval_steps 50 \
1033 | --save_steps 1000 \
1034 | --per_device_train_batch_size 1 \
1035 | --per_device_eval_batch_size 32 \
1036 | --gradient_accumulation_steps 32 \
1037 | --overwrite_output_dir \
1038 | --evaluate_during_training \
1039 | --fp16 \
1040 | --do_train \
1041 | --do_eval \
1042 | --do_lowercase \
1043 | --nr_concats $NR_CONCATS \
1044 | --max_length $MAX_LENGTH \
1045 | "
1046 |
1047 |
1048 |
1053 | ### MLQA
1054 |
1055 | ##### XLM-R
1056 |
1057 | export SEED=42
1058 | export DATASET=mlqa
1059 | export MODEL_DIR=/workspace/models
1060 | export MODEL_NAME_OR_PATH=xlm-roberta-base
1061 | export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
1062 | export LOG_DIR=/workspace/logs
1063 | export DATA_DIR=/workspace/data
1064 | # Debugging
1065 | export CUDA_LAUNCH_BLOCKING=1
1066 | # model args
1067 | make repl run="scripts/finetune_qa_models.py \
1068 | --model_name_or_path $MODEL_NAME_OR_PATH \
1069 | --output_dir $MODEL_DIR/$MODEL_NAME \
1070 | --logging_dir $LOG_DIR/$MODEL_NAME \
1071 | --dataset $DATASET \
1072 | --data_dir $DATA_DIR \
1073 | --seed $SEED \
1074 | --num_train_epochs 3 \
1075 | --learning_rate 3e-5 \
1076 | --logging_steps 50 \
1077 | --eval_steps 50 \
1078 | --save_steps 1000 \
1079 | --per_device_train_batch_size 4 \
1080 | --per_device_eval_batch_size 32 \
1081 | --gradient_accumulation_steps 8 \
1082 | --overwrite_output_dir \
1083 | --evaluate_during_training \
1084 | --fp16 \
1085 | --do_train \
1086 | --do_eval \
1087 | --do_lowercase \
1088 | --max_length 512 \
1089 | "
1090 |
1091 |
1092 | ##### XLM-Long
1093 |
1094 | export SEED=42
1095 | export DATASET=mlqa
1096 | export MODEL_DIR=/workspace/models
1097 | export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
1098 | export MODEL_NAME=$(basename $MODEL_NAME_OR_PATH)-seed-$SEED-on-$DATASET
1099 | export LOG_DIR=/workspace/logs
1100 | export DATA_DIR=/workspace/data
1101 | # Debugging
1102 | export CUDA_LAUNCH_BLOCKING=1
1103 | # model args
1104 | make repl run="scripts/finetune_qa_models.py \
1105 | --model_name_or_path $MODEL_NAME_OR_PATH \
1106 | --output_dir $MODEL_DIR/$MODEL_NAME \
1107 | --logging_dir $LOG_DIR/$MODEL_NAME \
1108 | --dataset $DATASET \
1109 | --data_dir $DATA_DIR \
1110 | --seed $SEED \
1111 | --num_train_epochs 3 \
1112 | --learning_rate 3e-5 \
1113 | --logging_steps 50 \
1114 | --eval_steps 50 \
1115 | --save_steps 1000 \
1116 | --per_device_train_batch_size 4 \
1117 | --per_device_eval_batch_size 32 \
1118 | --gradient_accumulation_steps 8 \
1119 | --overwrite_output_dir \
1120 | --evaluate_during_training \
1121 | --fp16 \
1122 | --do_train \
1123 | --do_eval \
1124 | --do_lowercase \
1125 | --max_length 512 \
1126 | "
1127 |
1128 |
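Once a run finishes, the checkpoint written to `--output_dir` can be loaded like any Hugging Face model. A minimal sketch, assuming the path produced by `$MODEL_DIR/$MODEL_NAME` in the MLQA run above (the question and context strings are placeholders):

```python
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Placeholder path, matching $MODEL_DIR/$MODEL_NAME from the MLQA run above.
model_path = "/workspace/models/xlm-roberta-base-long-seed-42-on-mlqa"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

qa = pipeline("question-answering", model=model, tokenizer=tokenizer)
print(qa(question="Where is Uppsala located?",
         context="Uppsala is a city in Sweden, north of Stockholm."))
```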
1129 |
1130 |