├── logs
    └── .gitignore
├── scripts
    ├── __init__.py
    ├── run_long_lm.py
    └── finetune_qa_models.py
├── src
    └── __init__.py
├── report
    ├── Master Thesis.pdf
    └── Thesis Presentation.pdf
├── .env.template
├── LICENSE
├── Makefile
├── docker-compose.yaml
├── Pretraining_Details.md
├── Dockerfile
├── Finetuning_Details.md
├── .gitignore
├── requirements.txt
├── notebooks
    ├── Longformer TriviaQA.ipynb
    ├── Convert to Long.ipynb
    └── Try Train Longformer SQuAD.ipynb
└── README.md


/logs/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | import .tracking
2 | import .lib
3 | 


--------------------------------------------------------------------------------
/report/Master Thesis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarkusSagen/Master-Thesis-Multilingual-Longformer/HEAD/report/Master Thesis.pdf


--------------------------------------------------------------------------------
/.env.template:
--------------------------------------------------------------------------------
1 | PROJECT_NAME=xlm-l
2 | DATA_DIR=
3 | MODEL_DIR=
4 | GPU_IDS=0
5 | JUPYTER_PW=
6 | JUPYTER_PORT=8999
7 | PRIVATE_DEPS=none
8 | 


--------------------------------------------------------------------------------
/report/Thesis Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarkusSagen/Master-Thesis-Multilingual-Longformer/HEAD/report/Thesis Presentation.pdf


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Markus Sagen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | include .env
 2 | 
 3 | export USER_ID      := $(shell id -u)
 4 | export USER_NAME    := $(shell whoami)
 5 | export PROJECT_DIR  := $(shell pwd)
 6 | export COMPOSE_CMD  := docker-compose -f docker-compose.yaml -p ${PROJECT_NAME}_${USER_NAME}
 7 | export PKG_DIR      := pkg
 8 | 
 9 | # Enable running e3k on machines with no GPU
10 | ifeq (${GPU_IDS}, none)
11 | 	export RUNTIME := runc
12 | else
13 | 	export RUNTIME := nvidia
14 | endif
15 | 
16 | # Enable pulling in dependencies in private repos
17 | ifneq (${PRIVATE_DEPS}, none)
18 | 	clone_private_deps := for item in ${PRIVATE_DEPS}; do \
19 |             git clone $$item ${PKG_DIR}/$$item; \
20 | 	    echo $$item; \
21 | 	done
22 | else
23 | 	clone_private_deps := echo "Nothing to clone"
24 | endif
25 | 
26 | .PHONY: build
27 | build:
28 | 	mkdir -p ${PKG_DIR}
29 | 	$(call clone_private_deps)
30 | 	$(COMPOSE_CMD) build
31 | 	rm -rf ${PKG_DIR}
32 | 
33 | .PHONY: logs
34 | logs:
35 | 	${COMPOSE_CMD} logs
36 | 
37 | .PHONY: up
38 | up:
39 | 	$(COMPOSE_CMD) up --detach 
40 | 
41 | .PHONY: down
42 | down:
43 | 	$(COMPOSE_CMD) down
44 | 
45 | .PHONY: repl
46 | repl:
47 | 	${COMPOSE_CMD} exec repl python3 $(run)
48 | 
49 | .PHONY: ipython
50 | ipython:
51 | 	${COMPOSE_CMD} exec repl ipython $(run)
52 | 
53 | 
54 | .PHONY: shell
55 | shell:
56 | 	${COMPOSE_CMD} exec repl bash
57 | 


--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | version: '2.3'
 2 | services:
 3 |   jupyter:
 4 |     image: ${PROJECT_NAME}
 5 |     command: jupyter lab --ip=0.0.0.0 --no-browser --NotebookApp.token='${JUPYTER_PW}'
 6 |     build:
 7 |       context: .
 8 |       dockerfile: Dockerfile
 9 |       args:
10 |         - PKG_DIR=${PKG_DIR}
11 |         - PRIVATE_DEPS=${PRIVATE_DEPS}
12 |     shm_size: '16gb'
13 |     ports:
14 |       - ${JUPYTER_PORT}:8888
15 |     user: ${USER_ID}:${USER_ID}
16 |     runtime: ${RUNTIME}
17 |     network_mode: bridge
18 |     environment:
19 |       - NVIDIA_VISIBLE_DEVICES=${GPU_IDS}
20 |     volumes:
21 |       - ${DATA_DIR}:/workspace/data
22 |       - ${MODEL_DIR}:/workspace/models
23 |       - ${PROJECT_DIR}/src:/workspace/src
24 |       - ${PROJECT_DIR}/notebooks:/workspace/notebooks
25 |       - ${PROJECT_DIR}/logs:/workspace/logs
26 | 
27 |   repl:
28 |     image: ${PROJECT_NAME}
29 |     tty: true
30 |     shm_size: '16gb'
31 |     user: ${USER_ID}:${USER_ID}
32 |     runtime: ${RUNTIME}
33 |     network_mode: bridge
34 |     environment:
35 |       - NVIDIA_VISIBLE_DEVICES=${GPU_IDS}
36 |     volumes:
37 |       - ${DATA_DIR}:/workspace/data
38 |       - ${MODEL_DIR}:/workspace/models
39 |       - ${PROJECT_DIR}/src:/workspace/src
40 |       - ${PROJECT_DIR}/scripts:/workspace/scripts
41 |       - ${PROJECT_DIR}/logs:/workspace/logs
42 | 


--------------------------------------------------------------------------------
/Pretraining_Details.md:
--------------------------------------------------------------------------------
 1 | # Pre-Training Details 
 2 | 
 3 | ### Models
 4 | Converting transformer models are based on the [Longformer conversion script](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb). The script can be run for any pre-trained RoBERTa based model and can be extended to be used with other pre-traned models.  
 5 | 
 6 | Training with these parameters on a 48GB GPU takes ~5 days   
 7 | We pre-trained both a monolingual RoBERTa and multilingual XLM-R model using the Longformer pre-training scheme to extend the context's of the models. These models were trained on the same datasets and same hyper-parameters and only trained with one seed because of the long training time.   
 8 | 
 9 | The arguments `MAX_POS` indicate how many tokens the model should learn to attend. The number of tokens it can learn to attend to must be of the form $2^x$ and be larger than $512$.   
10 | 
11 | The `MODEL_NAME_OR_PATH` indicated the pre-trained model that the Longformer can be extended from. The names of the models must be pre-trained model names available at [Huggingface](https://huggingface.co/models), such as `roberta-base`, `xlm-roberta-base` or similar. The pre-training scheme should in theory work for all encoder-type Transformers, such as BERT, RoBERTa, Alberta, etc. However, we have only tested it for RoBERTa and XLM-R, so the training script may need to be changed if used for BERT.   
12 | 
13 | We refer to these models that we have trained using the Longformer pre-training scheme as:   
14 | 
15 | 1. `RoBERTa-Long`   
16 | 2. `XLM-Long`   
17 |    
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | 
 2 | # https://hub.docker.com/r/huggingface/transformers-pytorch-gpu/dockerfile
 3 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
 4 | 
 5 | ARG PKG_DIR
 6 | ARG PRIVATE_DEPS
 7 | 
 8 | WORKDIR /workspace
 9 | 
10 | RUN apt update && \
11 |     apt install -y bash \
12 |                    build-essential \
13 |                    git \
14 |                    wget \
15 |                    curl \
16 |                    ca-certificates \
17 |                    python3 \
18 |                    python3-pip && \
19 |     rm -rf /var/lib/apt/lists
20 | 
21 | # RUN apt-get update && apt-get install -y git
22 | 
23 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
24 |     python3 -m pip install --no-cache-dir \
25 |     mkl \
26 |     torch
27 | 
28 | #RUN git clone https://github.com/NVIDIA/apex
29 | #RUN cd apex && \
30 | #    python3 setup.py install && \
31 | #    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
32 | 
33 | 
34 | # Install packages from private repositories
35 | COPY ${PKG_DIR}/ /pkg/
36 | RUN if [ "${PRIVATE_DEPS}" != "none" ]; then \
37 | 	for pkg in /pkg/*/* ; \
38 | 	do pip install -e $pkg ; \
39 | 	done; \
40 | 	fi
41 | 
42 | 
43 | # Fix permissions
44 | RUN chmod 0777 /workspace
45 | RUN mkdir /.local && chmod 0777 /.local
46 | RUN mkdir /.jupyter && chmod 0777 /.jupyter
47 | RUN mkdir /.cache && chmod 0777 /.cache
48 | # Workaround for transformers library permissions
49 | RUN mkdir /.config && chmod 0777 /.config
50 | 
51 | # Install python packages
52 | ADD src ./src
53 | ADD requirements.txt .
54 | RUN pip install -r requirements.txt
55 | 


--------------------------------------------------------------------------------
/Finetuning_Details.md:
--------------------------------------------------------------------------------
 1 | # Fine-Tuning Details   
 2 | 
 3 | 
 4 | We fine-tune and evaluate on these datasets using several pre-trained models released by Huggingface and compare it with the long-context models (Longformer type models) we have trained.   
 5 | 
 6 | We have divided the models firstly based the number of languages, then on the specific dataset and finally which model was fine-tuned. The datasets SQ3 and XQ3 are the long context variants (with concatenated context) of the SQuAD and XQuAD datasets. And to better understand and evaluate how the performance was effected when creating a new dataset, we chose fine-tune on the SQ3 and XQ3 dataset using either the regular attention window (512 tokens) or the attention window learned by the Longformer trained models (4096 tokens). These datasets were denoted SQ3 (512) and SQ3 (2048) respectively for the English dataset and XQ3 (512) and XQ3 (2048) for the multilingual datasets.   
 7 | 
 8 | The long context models are trained on a longer context than 2048, but we restricted the long context datasets to this many tokens at time, since the models did not manage to fit in memory on a 48GB GPU otherwise.   
 9 | 
10 | #### Context lengths   
11 | Depending on the number of contexts one choses to concatinate together, the maximum number of tokens the model can attend to also changes. The maximum number of contexts and tokens we managed to run on a 48GB GPU was 3 concatinated context, and corresponded to that the average number of tokens for each context were slightly below 2048. Therefore, for the concatinated long datasets, we set the hyper-parameters --nr\_concats=3 and --max\_length=2048. If you want to test out other values, we suggest the following pairings:   
12 | 
13 | concats=1, max\_length=512   
14 | concats=3, max\_length=2048   
15 | concats=5, max\_length=4098   
16 | 
17 | 
18 | #### Seeds
19 | Each model is trained with 5 different SEEDS. To replicate our experiments, re-run each code segment and replace the SEED with the following seeds:
20 | 
21 | - 42
22 | - 1337
23 | - 1729
24 | - 165
25 | - 758241
26 | 
27 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | absl-py==0.11.0
  2 | anyio==2.1.0
  3 | argon2-cffi==20.1.0
  4 | asn1crypto==0.24.0
  5 | async-generator==1.10
  6 | attrs==20.3.0
  7 | Babel==2.9.0
  8 | backcall==0.2.0
  9 | bleach==3.3.0
 10 | cached-property==1.5.2
 11 | cachetools==4.2.1
 12 | certifi==2020.12.5
 13 | cffi==1.14.5
 14 | chardet==4.0.0
 15 | click==7.1.2
 16 | cloudpickle==1.6.0
 17 | colorama==0.4.4
 18 | contextvars==2.4
 19 | cryptography==2.1.4
 20 | cycler==0.10.0
 21 | Cython==0.29.21
 22 | dask==2021.2.0
 23 | dataclasses==0.8
 24 | datasets==1.3.0
 25 | decorator==4.4.2
 26 | defusedxml==0.6.0
 27 | dill==0.3.3
 28 | distributed==2021.2.0
 29 | dnspython==2.1.0
 30 | docopt==0.6.2
 31 | entrypoints==0.3
 32 | filelock==3.0.12
 33 | fsspec==0.8.5
 34 | gitdb==4.0.5
 35 | GitPython==3.1.13
 36 | google-auth==1.27.0
 37 | google-auth-oauthlib==0.4.2
 38 | graphviz==0.16
 39 | grpcio==1.35.0
 40 | h5py==3.1.0
 41 | HeapDict==1.0.1
 42 | hiddenlayer==0.3
 43 | huggingface-hub==0.0.2
 44 | idna==2.10
 45 | immutables==0.15
 46 | importlib-metadata==3.4.0
 47 | intel-openmp==2021.1.2
 48 | ipykernel==5.5.0
 49 | ipython==7.16.1
 50 | ipython-genutils==0.2.0
 51 | ipywidgets==7.6.3
 52 | jedi==0.18.0
 53 | Jinja2==2.11.3
 54 | joblib==1.0.1
 55 | json5==0.9.5
 56 | jsonpickle==1.5.2
 57 | jsonschema==3.2.0
 58 | jupyter-client==6.1.11
 59 | jupyter-core==4.7.1
 60 | jupyter-server==1.4.0
 61 | jupyterlab==3.0.8
 62 | jupyterlab-pygments==0.1.2
 63 | jupyterlab-server==2.3.0
 64 | jupyterlab-widgets==1.0.0
 65 | keyring==10.6.0
 66 | keyrings.alt==3.0
 67 | kiwisolver==1.3.1
 68 | Markdown==3.3.3
 69 | MarkupSafe==1.1.1
 70 | matplotlib==3.3.4
 71 | mistune==0.8.4
 72 | mkl==2021.1.1
 73 | msgpack==1.0.2
 74 | multiprocess==0.70.11.1
 75 | munch==2.5.0
 76 | nbclassic==0.2.6
 77 | nbclient==0.5.2
 78 | nbconvert==6.0.7
 79 | nbformat==5.1.2
 80 | nest-asyncio==1.5.1
 81 | notebook==6.2.0
 82 | numpy==1.19.5
 83 | oauthlib==3.1.0
 84 | packaging==20.9
 85 | pandas==1.1.5
 86 | pandocfilters==1.4.3
 87 | parso==0.8.1
 88 | pexpect==4.8.0
 89 | pickleshare==0.7.5
 90 | Pillow==8.1.0
 91 | pip==20.3.3
 92 | prometheus-client==0.9.0
 93 | prompt-toolkit==3.0.16
 94 | protobuf==3.15.0
 95 | psutil==5.8.0
 96 | ptyprocess==0.7.0
 97 | py-cpuinfo==7.0.0
 98 | pyarrow==1.0.1
 99 | pyasn1==0.4.8
100 | pyasn1-modules==0.2.8
101 | pycparser==2.20
102 | pycrypto==2.6.1
103 | Pygments==2.8.0
104 | pygobject==3.26.1
105 | pymongo==3.11.3
106 | pyparsing==2.4.7
107 | pyrsistent==0.17.3
108 | python-dateutil==2.8.1
109 | pytz==2021.1
110 | pyxdg==0.25
111 | PyYAML==5.4.1
112 | pyzmq==22.0.3
113 | regex==2020.11.13
114 | requests==2.25.1
115 | requests-oauthlib==1.3.0
116 | rsa==4.7.1
117 | sacred==0.8.2
118 | sacremoses==0.0.43
119 | scikit-learn==0.24.1
120 | scipy==1.5.4
121 | seaborn==0.11.1
122 | SecretStorage==2.3.1
123 | Send2Trash==1.5.0
124 | sentencepiece==0.1.95
125 | setuptools==53.0.0
126 | six==1.11.0
127 | sklearn
128 | smmap==3.0.5
129 | sniffio==1.2.0
130 | sortedcontainers==2.3.0
131 | tbb==2021.1.1
132 | tblib==1.7.0
133 | tensorboard==2.4.1
134 | tensorboard-plugin-wit==1.8.0
135 | terminado==0.9.2
136 | testpath==0.4.4
137 | threadpoolctl==2.1.0
138 | tokenizers==0.9.2
139 | toolz==0.11.1
140 | torch==1.7.1
141 | torchsummary==1.5.1
142 | tornado==6.1
143 | tqdm==4.49.0
144 | traitlets==4.3.3
145 | transformers==3.4.0
146 | typing-extensions==3.7.4.3
147 | urllib3==1.26.3
148 | wcwidth==0.2.5
149 | webencodings==0.5.1
150 | Werkzeug==1.0.1
151 | wget==3.2
152 | wheel==0.30.0
153 | widgetsnbextension==3.5.1
154 | wrapt==1.12.1
155 | xxhash==2.0.0
156 | zict==2.0.0
157 | zipp==3.4.0
158 | 


--------------------------------------------------------------------------------
/notebooks/Longformer TriviaQA.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 6,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "#import nlp\n",
 10 |     "import torch\n",
 11 |     "import datasets\n",
 12 |     "\n",
 13 |     "# ATTENTION. Rerunning this command remove the cached trivia qa dataset completely \n",
 14 |     "#!rm -rf /.cache/"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {},
 21 |    "outputs": [
 22 |     {
 23 |      "name": "stdout",
 24 |      "output_type": "stream",
 25 |      "text": [
 26 |       "trivia_qa  wikitext-103-raw\n",
 27 |       "mkdir: cannot create directory '../data/trivia_qa': File exists\n"
 28 |      ]
 29 |     }
 30 |    ],
 31 |    "source": [
 32 |     "# https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb#scrollTo=wyDYG4YDXFV7\n",
 33 |     "!ls ../data\n",
 34 |     "!mkdir ../data/trivia_qa"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "%%time\n",
 44 |     "validation_dataset = datasets.load_dataset(\"trivia_qa\", \"rc\", split=\"validation[:5%]\", cache_dir=\"/workspace/data/trivia_qa\")"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 5,
 50 |    "metadata": {},
 51 |    "outputs": [
 52 |     {
 53 |      "name": "stdout",
 54 |      "output_type": "stream",
 55 |      "text": [
 56 |       "absl-py==0.11.0\n",
 57 |       "apex==0.1\n",
 58 |       "argon2-cffi==20.1.0\n",
 59 |       "asn1crypto==0.24.0\n",
 60 |       "async-generator==1.10\n",
 61 |       "attrs==20.3.0\n",
 62 |       "backcall==0.2.0\n",
 63 |       "bleach==3.2.1\n",
 64 |       "cached-property==1.5.2\n",
 65 |       "cachetools==4.1.1\n",
 66 |       "certifi==2020.11.8\n",
 67 |       "cffi==1.14.4\n",
 68 |       "chardet==3.0.4\n",
 69 |       "click==7.1.2\n",
 70 |       "cloudpickle==1.6.0\n",
 71 |       "colorama==0.4.4\n",
 72 |       "contextvars==2.4\n",
 73 |       "cryptography==2.1.4\n",
 74 |       "cycler==0.10.0\n",
 75 |       "Cython==0.29.21\n",
 76 |       "dask==2.30.0\n",
 77 |       "dataclasses==0.8\n",
 78 |       "datasets==1.1.3\n",
 79 |       "decorator==4.4.2\n",
 80 |       "defusedxml==0.6.0\n",
 81 |       "dill==0.3.3\n",
 82 |       "distributed==2.30.1\n",
 83 |       "dnspython==2.0.0\n",
 84 |       "docopt==0.6.2\n",
 85 |       "entrypoints==0.3\n",
 86 |       "filelock==3.0.12\n",
 87 |       "future==0.18.2\n",
 88 |       "gitdb==4.0.5\n",
 89 |       "GitPython==3.1.11\n",
 90 |       "google-auth==1.23.0\n",
 91 |       "google-auth-oauthlib==0.4.2\n",
 92 |       "graphviz==0.15\n",
 93 |       "grpcio==1.33.2\n",
 94 |       "h5py==3.1.0\n",
 95 |       "HeapDict==1.0.1\n",
 96 |       "hiddenlayer==0.3\n",
 97 |       "idna==2.6\n",
 98 |       "immutables==0.14\n",
 99 |       "importlib-metadata==3.1.0\n",
100 |       "intel-openmp==2020.0.133\n",
101 |       "ipykernel==5.3.4\n",
102 |       "ipython==7.16.1\n",
103 |       "ipython-genutils==0.2.0\n",
104 |       "ipywidgets==7.5.1\n",
105 |       "jedi==0.17.2\n",
106 |       "Jinja2==2.11.2\n",
107 |       "joblib==0.17.0\n",
108 |       "json5==0.9.5\n",
109 |       "jsonpickle==1.4.1\n",
110 |       "jsonschema==3.2.0\n",
111 |       "jupyter-client==6.1.7\n",
112 |       "jupyter-core==4.7.0\n",
113 |       "jupyterlab==2.2.9\n",
114 |       "jupyterlab-pygments==0.1.2\n",
115 |       "jupyterlab-server==1.2.0\n",
116 |       "keyring==10.6.0\n",
117 |       "keyrings.alt==3.0\n",
118 |       "kiwisolver==1.3.1\n",
119 |       "Markdown==3.3.3\n",
120 |       "MarkupSafe==1.1.1\n",
121 |       "matplotlib==3.3.3\n",
122 |       "mistune==0.8.4\n",
123 |       "mkl==2019.0\n",
124 |       "msgpack==1.0.0\n",
125 |       "multiprocess==0.70.11.1\n",
126 |       "munch==2.5.0\n",
127 |       "nbclient==0.5.1\n",
128 |       "nbconvert==6.0.7\n",
129 |       "nbformat==5.0.8\n",
130 |       "nest-asyncio==1.4.3\n",
131 |       "notebook==6.1.5\n",
132 |       "numpy==1.19.4\n",
133 |       "oauthlib==3.1.0\n",
134 |       "packaging==20.4\n",
135 |       "pandas==1.1.4\n",
136 |       "pandocfilters==1.4.3\n",
137 |       "parso==0.7.1\n",
138 |       "pexpect==4.8.0\n",
139 |       "pickleshare==0.7.5\n",
140 |       "Pillow==8.0.1\n",
141 |       "prometheus-client==0.9.0\n",
142 |       "prompt-toolkit==3.0.8\n",
143 |       "protobuf==3.14.0\n",
144 |       "psutil==5.7.3\n",
145 |       "ptyprocess==0.6.0\n",
146 |       "py-cpuinfo==7.0.0\n",
147 |       "pyarrow==2.0.0\n",
148 |       "pyasn1==0.4.8\n",
149 |       "pyasn1-modules==0.2.8\n",
150 |       "pycparser==2.20\n",
151 |       "pycrypto==2.6.1\n",
152 |       "Pygments==2.7.2\n",
153 |       "pygobject==3.26.1\n",
154 |       "pymongo==3.11.1\n",
155 |       "pyparsing==2.4.7\n",
156 |       "pyrsistent==0.17.3\n",
157 |       "python-dateutil==2.8.1\n",
158 |       "pytz==2020.4\n",
159 |       "pyxdg==0.25\n",
160 |       "PyYAML==5.3.1\n",
161 |       "pyzmq==20.0.0\n",
162 |       "regex==2020.11.13\n",
163 |       "requests==2.25.0\n",
164 |       "requests-oauthlib==1.3.0\n",
165 |       "rsa==4.6\n",
166 |       "sacred==0.8.1\n",
167 |       "sacremoses==0.0.43\n",
168 |       "scikit-learn==0.23.2\n",
169 |       "scipy==1.5.4\n",
170 |       "seaborn==0.11.0\n",
171 |       "SecretStorage==2.3.1\n",
172 |       "Send2Trash==1.5.0\n",
173 |       "sentencepiece==0.1.94\n",
174 |       "six==1.11.0\n",
175 |       "sklearn==0.0\n",
176 |       "smmap==3.0.4\n",
177 |       "sortedcontainers==2.3.0\n",
178 |       "tblib==1.7.0\n",
179 |       "tensorboard==2.4.0\n",
180 |       "tensorboard-plugin-wit==1.7.0\n",
181 |       "terminado==0.9.1\n",
182 |       "testpath==0.4.4\n",
183 |       "threadpoolctl==2.1.0\n",
184 |       "tokenizers==0.9.2\n",
185 |       "toolz==0.11.1\n",
186 |       "torch==1.7.0\n",
187 |       "torchsummary==1.5.1\n",
188 |       "tornado==6.1\n",
189 |       "tqdm==4.49.0\n",
190 |       "traitlets==4.3.3\n",
191 |       "transformers==3.4.0\n",
192 |       "typing-extensions==3.7.4.3\n",
193 |       "urllib3==1.26.2\n",
194 |       "wcwidth==0.2.5\n",
195 |       "webencodings==0.5.1\n",
196 |       "Werkzeug==1.0.1\n",
197 |       "wget==3.2\n",
198 |       "widgetsnbextension==3.5.1\n",
199 |       "wrapt==1.12.1\n",
200 |       "xxhash==2.0.0\n",
201 |       "zict==2.0.0\n",
202 |       "zipp==3.4.0\n"
203 |      ]
204 |     }
205 |    ],
206 |    "source": [
207 |     "!pip freeze\n"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": []
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "# define the mapping function\n",
224 |     "def format_dataset(example):\n",
225 |     "    # the context might be comprised of multiple contexts => me merge them here\n",
226 |     "    example[\"context\"] = \" \".join((\"\\n\".join(example[\"entity_pages\"][\"wiki_context\"])).split(\"\\n\"))\n",
227 |     "    example[\"targets\"] = example[\"answer\"][\"aliases\"]\n",
228 |     "    example[\"norm_target\"] = example[\"answer\"][\"normalized_value\"]\n",
229 |     "    return example\n",
230 |     "\n",
231 |     "# map the dataset and throw out all unnecessary columns\n",
232 |     "validation_dataset = validation_dataset.map(format_dataset, remove_columns=[\"search_results\", \"question_source\", \"entity_pages\", \"answer\", \"question_id\"])"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": null,
238 |    "metadata": {},
239 |    "outputs": [],
240 |    "source": [
241 |     "validation_dataset[8]"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": null,
247 |    "metadata": {},
248 |    "outputs": [],
249 |    "source": [
250 |     "validation_dataset = validation_dataset.filter(lambda x: len(x[\"context\"]) > 0)\n",
251 |     "# check out how many samples are left\n",
252 |     "validation_dataset"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "print(\"\\n\\nLength for each example\")\n",
262 |     "print(30 * \"=\")\n",
263 |     "\n",
264 |     "# length for each example\n",
265 |     "validation_dataset.map(lambda x, i: print(f\"Id: {i} - Question Length: {len(x['question'])} - context Length: {len(x['context'])}\"), with_indices=True)\n",
266 |     "print(30 * \"=\")\n",
267 |     "\n",
268 |     "print(\"\\n\")\n",
269 |     "print(\"Num examples larger than 4 * 4096 characters: \")\n",
270 |     "# filter out examples smaller than 4 * 4096\n",
271 |     "short_validation_dataset = validation_dataset.filter(lambda x: (len(x['question']) + len(x['context'])) < 4 * 4096)\n",
272 |     "short_validation_dataset"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": null,
278 |    "metadata": {},
279 |    "outputs": [],
280 |    "source": [
281 |     "# EVAL"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": null,
287 |    "metadata": {},
288 |    "outputs": [],
289 |    "source": [
290 |     "from transformers import LongformerTokenizerFast, LongformerForQuestionAnswering\n",
291 |     "\n",
292 |     "tokenizer = LongformerTokenizerFast.from_pretrained(\"allenai/longformer-large-4096-finetuned-triviaqa\")\n",
293 |     "\n",
294 |     "# download the 1.7 GB pretrained model. It might take ~1min\n",
295 |     "model = LongformerForQuestionAnswering.from_pretrained(\"allenai/longformer-large-4096-finetuned-triviaqa\")\n",
296 |     "model.to(\"cuda\")\n",
297 |     "\n",
298 |     "def evaluate(example):\n",
299 |     "    def get_answer(question, context):\n",
300 |     "        # encode question and context so that they are seperated by a tokenizer.sep_token and cut at max_length\n",
301 |     "        encoding = tokenizer.encode_plus(question, context, return_tensors=\"pt\", max_length=4096, truncation=True)\n",
302 |     "        input_ids = encoding[\"input_ids\"].to(\"cuda\")\n",
303 |     "        attention_mask = encoding[\"attention_mask\"].to(\"cuda\")\n",
304 |     "\n",
305 |     "        # the forward method will automatically set global attention on question tokens\n",
306 |     "        # The scores for the possible start token and end token of the answer are retrived\n",
307 |     "        # wrap the function in torch.no_grad() to save memory\n",
308 |     "        with torch.no_grad():\n",
309 |     "            start_scores, end_scores = model(input_ids=input_ids, attention_mask=attention_mask)\n",
310 |     "\n",
311 |     "        # Let's take the most likely token using `argmax` and retrieve the answer\n",
312 |     "        all_tokens = tokenizer.convert_ids_to_tokens(encoding[\"input_ids\"][0].tolist())\n",
313 |     "        answer_tokens = all_tokens[torch.argmax(start_scores): torch.argmax(end_scores)+1]\n",
314 |     "        answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))[1:].replace('\"', '')  # remove space prepending space token and remove unnecessary '\"'\n",
315 |     "        \n",
316 |     "        return answer\n",
317 |     "\n",
318 |     "    # save the model's outut here\n",
319 |     "    example[\"output\"] = get_answer(example[\"question\"], example[\"context\"])\n",
320 |     "\n",
321 |     "    # save if it's a match or not\n",
322 |     "    example[\"match\"] = (example[\"output\"] in example[\"targets\"]) or (example[\"output\"] == example[\"norm_target\"])\n",
323 |     "\n",
324 |     "    return example\n"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": null,
330 |    "metadata": {},
331 |    "outputs": [],
332 |    "source": [
333 |     "results_short = short_validation_dataset.map(evaluate)"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": null,
339 |    "metadata": {},
340 |    "outputs": [],
341 |    "source": [
342 |     "print(f\"\\nNum Correct examples: {sum(results_short['match'])}/{len(results_short)}\")\n",
343 |     "wrong_results = results_short.filter(lambda x: x['match'] is False)\n",
344 |     "print(f\"\\nWrong examples: \")\n",
345 |     "wrong_results.map(lambda x, i: print(f\"{i} - Output: {x['output']} - Target: {x['norm_target']}\"), with_indices=True)"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": null,
351 |    "metadata": {},
352 |    "outputs": [],
353 |    "source": [
354 |     "results = validation_dataset.map(evaluate)"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": null,
360 |    "metadata": {},
361 |    "outputs": [],
362 |    "source": [
363 |     "print(f\"Correct examples: {sum(results['match'])}/{len(results)}\")"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": null,
369 |    "metadata": {},
370 |    "outputs": [],
371 |    "source": []
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": null,
376 |    "metadata": {},
377 |    "outputs": [],
378 |    "source": []
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": null,
383 |    "metadata": {},
384 |    "outputs": [],
385 |    "source": []
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": null,
390 |    "metadata": {},
391 |    "outputs": [],
392 |    "source": []
393 |   },
394 |   {
395 |    "cell_type": "code",
396 |    "execution_count": null,
397 |    "metadata": {},
398 |    "outputs": [],
399 |    "source": []
400 |   },
401 |   {
402 |    "cell_type": "markdown",
403 |    "metadata": {},
404 |    "source": [
405 |     "# TriviaQA json to SQUAD format dataloader"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": 1,
411 |    "metadata": {},
412 |    "outputs": [],
413 |    "source": [
414 |     "import json\n",
415 |     "from pathlib import Path\n",
416 |     "\n",
417 |     "def read_squad_files(path: str):\n",
418 |     "    path = Path(path)\n",
419 |     "    with open(path, 'rb') as f:\n",
420 |     "        squad_dict = json.load(f)\n",
421 |     "    contexts = []\n",
422 |     "    questions = []\n",
423 |     "    answers = []\n",
424 |     "    for group in squad_dict['data']:\n",
425 |     "        for passage in group['paragraphs']:\n",
426 |     "            context = passage['context']\n",
427 |     "            for qa in passage['qas']:\n",
428 |     "                question = qa['question']\n",
429 |     "                for answer in qa['answers']:\n",
430 |     "                    contexts.append(context)\n",
431 |     "                    questions.append(question)\n",
432 |     "                    answers.append(answer)\n",
433 |     "\n",
434 |     "    return contexts, questions, answers\n",
435 |     "    \n",
436 |     "\n",
437 |     "train_contexts, train_questions, train_answers = read_squad_files('/workspace/data/trivia_squad/squad-wikipedia-train-4096.json')\n",
438 |     "val_contexts, val_questions, val_answers = read_squad_files('/workspace/data/trivia_squad/squad-wikipedia-dev-4096.json')"
439 |    ]
440 |   },
441 |   {
442 |    "cell_type": "code",
443 |    "execution_count": 2,
444 |    "metadata": {},
445 |    "outputs": [],
446 |    "source": [
447 |     "## Add start and end tokens correctly"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "code",
452 |    "execution_count": 3,
453 |    "metadata": {},
454 |    "outputs": [],
455 |    "source": [
456 |     "def add_end_idx(answers, contexts):\n",
457 |     "    for answer, context in zip(answers, contexts):\n",
458 |     "        gold_text = answer['text']\n",
459 |     "        start_idx = answer['answer_start']\n",
460 |     "        end_idx = start_idx + len(gold_text)\n",
461 |     "\n",
462 |     "        # sometimes squad answers are off by a character or two – fix this\n",
463 |     "        if context[start_idx:end_idx].lower() == gold_text:\n",
464 |     "            answer['answer_end'] = end_idx\n",
465 |     "        elif context[start_idx-1:end_idx-1].lower() == gold_text:\n",
466 |     "            answer['answer_start'] = start_idx - 1\n",
467 |     "            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character\n",
468 |     "        elif context[start_idx-2:end_idx-2].lower() == gold_text:\n",
469 |     "            answer['answer_start'] = start_idx - 2\n",
470 |     "            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters\n",
471 |     "\n",
472 |     "add_end_idx(train_answers, train_contexts)\n",
473 |     "add_end_idx(val_answers, val_contexts)"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": 4,
479 |    "metadata": {},
480 |    "outputs": [],
481 |    "source": [
482 |     "## Tokenize results"
483 |    ]
484 |   },
485 |   {
486 |    "cell_type": "code",
487 |    "execution_count": null,
488 |    "metadata": {},
489 |    "outputs": [],
490 |    "source": [
491 |     "from transformers import RobertaTokenizerFast\n",
492 |     "tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lowercase=True)\n",
493 |     "\n",
494 |     "train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)"
495 |    ]
496 |   },
497 |   {
498 |    "cell_type": "code",
499 |    "execution_count": null,
500 |    "metadata": {},
501 |    "outputs": [],
502 |    "source": [
503 |     "val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)"
504 |    ]
505 |   },
506 |   {
507 |    "cell_type": "code",
508 |    "execution_count": null,
509 |    "metadata": {},
510 |    "outputs": [],
511 |    "source": [
512 |     "### convert start-end pos to token start/end pos"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": null,
518 |    "metadata": {},
519 |    "outputs": [],
520 |    "source": [
521 |     "def add_token_positions(encodings, answers):\n",
522 |     "    start_positions = []\n",
523 |     "    end_positions = []\n",
524 |     "    for i in range(len(answers)):\n",
525 |     "        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))\n",
526 |     "        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))\n",
527 |     "        # if None, the answer passage has been truncated\n",
528 |     "        if start_positions[-1] is None:\n",
529 |     "            start_positions[-1] = tokenizer.model_max_length\n",
530 |     "        if end_positions[-1] is None:\n",
531 |     "            end_positions[-1] = tokenizer.model_max_length\n",
532 |     "    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})\n",
533 |     "\n",
534 |     "add_token_positions(train_encodings, train_answers)\n",
535 |     "add_token_positions(val_encodings, val_answers)"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "code",
540 |    "execution_count": null,
541 |    "metadata": {},
542 |    "outputs": [],
543 |    "source": [
544 |     "### Dataloader"
545 |    ]
546 |   },
547 |   {
548 |    "cell_type": "code",
549 |    "execution_count": null,
550 |    "metadata": {},
551 |    "outputs": [],
552 |    "source": [
553 |     "import torch\n",
554 |     "from torch.utils.data import DataLoader, Dataset\n",
555 |     "\n",
556 |     "class SquadDataset(torch.utils.data.Dataset):\n",
557 |     "    def __init__(self, encodings):\n",
558 |     "        self.encodings = encodings\n",
559 |     "\n",
560 |     "    def __getitem__(self, idx):\n",
561 |     "        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
562 |     "\n",
563 |     "    def __len__(self):\n",
564 |     "        return len(self.encodings.input_ids)\n",
565 |     "\n",
566 |     "train_dataset = SquadDataset(train_encodings)\n",
567 |     "val_dataset = SquadDataset(val_encodings)\n",
568 |     "\n"
569 |    ]
570 |   },
571 |   {
572 |    "cell_type": "code",
573 |    "execution_count": null,
574 |    "metadata": {},
575 |    "outputs": [],
576 |    "source": [
577 |     "train_dataset = DataLoader(train_dataset, batch_size=16, shuffle=True)"
578 |    ]
579 |   },
580 |   {
581 |    "cell_type": "code",
582 |    "execution_count": null,
583 |    "metadata": {},
584 |    "outputs": [],
585 |    "source": []
586 |   }
587 |  ],
588 |  "metadata": {
589 |   "kernelspec": {
590 |    "display_name": "Python 3",
591 |    "language": "python",
592 |    "name": "python3"
593 |   },
594 |   "language_info": {
595 |    "codemirror_mode": {
596 |     "name": "ipython",
597 |     "version": 3
598 |    },
599 |    "file_extension": ".py",
600 |    "mimetype": "text/x-python",
601 |    "name": "python",
602 |    "nbconvert_exporter": "python",
603 |    "pygments_lexer": "ipython3",
604 |    "version": "3.6.9"
605 |   }
606 |  },
607 |  "nbformat": 4,
608 |  "nbformat_minor": 4
609 | }
610 | 


--------------------------------------------------------------------------------
/scripts/run_long_lm.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import argparse
  4 | import datetime
  5 | from dataclasses import dataclass, field
  6 | import functools
  7 | import logging
  8 | import math
  9 | import os
 10 | import pickle
 11 | import re
 12 | import sys
 13 | import time
 14 | import threading
 15 | from typing import Optional
 16 | 
 17 | import torch
 18 | from torch.utils.data.dataset import Dataset
 19 | from torch.utils.tensorboard import SummaryWriter
 20 | import tqdm
 21 | from transformers import logging as hf_logging
 22 | from transformers.modeling_longformer import LongformerSelfAttention
 23 | from transformers import (
 24 |     PreTrainedModel,
 25 |     PreTrainedTokenizer,
 26 |     AutoModelForMaskedLM,
 27 |     RobertaForMaskedLM,
 28 |     XLMRobertaForMaskedLM,
 29 |     AutoTokenizer,
 30 | )
 31 | 
 32 | from transformers import (
 33 |     HfArgumentParser,
 34 |     DataCollatorForLanguageModeling,
 35 |     Trainer,
 36 |     TrainingArguments,
 37 |     set_seed,
 38 | )
 39 | 
 40 | 
 41 | class color:
 42 |     """Help print colors to terminal."""
 43 |     PURPLE = "\033[95m"
 44 |     CYAN = "\033[96m"
 45 |     DARKCYAN = "\033[36m"
 46 |     BLUE = "\033[94m"
 47 |     GREEN = "\033[92m"
 48 |     YELLOW = "\033[93m"
 49 |     RED = "\033[91m"
 50 |     BOLD = "\033[1m"
 51 |     UNDERLINE = "\033[4m"
 52 |     END = "\033[0m"
 53 | 
 54 | 
 55 | def is_roberta_based_model(model_name: str) -> str:
 56 |     """Validate if the model to pre-train is of roberta architecture."""
 57 | 
 58 |     r = re.compile('(.*)roberta(.*)')
 59 |     matches = r.findall(model_name)
 60 |     base_name = 'none'
 61 |     if len(matches) > 0:
 62 |         base_name = '-'.join(model_name.split('-')[:-1])
 63 | 
 64 |     return base_name
 65 | 
 66 | 
 67 | ##########################################
 68 | #
 69 | # Arguments
 70 | #
 71 | ##########################################
 72 | 
 73 | """Helper function: Define argparser and args."""
 74 | parser = argparse.ArgumentParser()
 75 | parser.add_argument(
 76 |     "--model_name",
 77 |     default=None,
 78 |     type=str,
 79 |     help="Name to save the model as.",
 80 | )
 81 | parser.add_argument(
 82 |     "--output_dir",
 83 |     default=None,
 84 |     type=str,
 85 |     help="The output directory for the trained model.",
 86 | )
 87 | parser.add_argument(
 88 |     "--model_type",
 89 |     default=None,
 90 |     type=str,
 91 |     help="Model type selected in the list from Huggingface ex:"
 92 |     " `bert, roberta, xlm-roberta, ...`",
 93 | )
 94 | parser.add_argument(
 95 |     "--model_name_or_path",
 96 |     default=None,
 97 |     type=str,
 98 |     required=True,
 99 |     help="Path to pretrained model from huggingface.co/models. "
100 |     "Only tested on `xlm-roberta-base` and `roberta-base`.",
101 | )
102 | parser.add_argument(
103 |     "--logging_dir",
104 |     default=None,
105 |     type=str,
106 |     help="Where logs are stored.",
107 | )
108 | parser.add_argument(
109 |     "--model_max_length",
110 |     default=4096,
111 |     type=int,
112 |     choices=[
113 |         512,
114 |         1024,
115 |         2048,
116 |         4096,
117 |         8192,
118 |         16384,
119 |         32768,
120 |         65536,
121 |         131072,
122 |         262144,
123 |         524288,
124 |         1048576,
125 |     ],
126 |     help="The maxiumum position of the model",
127 | )
128 | parser.add_argument(
129 |     "--attention_window",
130 |     default=512,
131 |     type=int,
132 |     help="Size of attention window",
133 | )
134 | parser.add_argument(
135 |     "--evaluation_strategy",
136 |     default="no",
137 |     type=str,
138 |     help="How evaluation should be logged, 'steps', 'epochs', 'no'.",
139 | )
140 | parser.add_argument(
141 |     "--do_train",
142 |     action="store_true",
143 |     help="Whether to run training."
144 | )
145 | parser.add_argument(
146 |     "--do_eval",
147 |     action="store_true",
148 |     help="Whether to run eval on the dev set."
149 | )
150 | parser.add_argument(
151 |     "--evaluate_during_training",
152 |     action="store_true",
153 |     help="Run evaluation during training at each logging step.",
154 | )
155 | parser.add_argument(
156 |     "--per_device_train_batch_size",
157 |     default=8,
158 |     type=int,
159 |     help="Batch size per GPU/CPU for training.",
160 | )
161 | parser.add_argument(
162 |     "--per_device_eval_batch_size",
163 |     default=8,
164 |     type=int,
165 |     help="Batch size per GPU/CPU for evaluation.",
166 | )
167 | parser.add_argument(
168 |     "--learning_rate",
169 |     default=5e-5,
170 |     type=float,
171 |     help="The initial learning rate for Adam.",
172 | )
173 | parser.add_argument(
174 |     "--gradient_accumulation_steps",
175 |     type=int,
176 |     default=1,
177 |     help="Number of gradient updates to perform before updating the weights",
178 | )
179 | parser.add_argument(
180 |     "--weight_decay",
181 |     default=0.0,
182 |     type=float,
183 |     help="Weight decay if we apply some."
184 | )
185 | parser.add_argument(
186 |     "--adam_epsilon",
187 |     default=1e-8,
188 |     type=float,
189 |     help="Epsilon for Adam optimizer."
190 | )
191 | parser.add_argument(
192 |     "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
193 | )
194 | parser.add_argument(
195 |     "--num_train_epochs",
196 |     default=3.0,
197 |     type=float,
198 |     help="Total number of training epochs to perform.",
199 | )
200 | parser.add_argument(
201 |     "--max_steps",
202 |     default=-1,
203 |     type=int,
204 |     help="If > 0: set total number of training steps to perform. "
205 |     "Override num_train_epochs.",
206 | )
207 | parser.add_argument(
208 |     "--warmup_steps",
209 |     default=0,
210 |     type=int,
211 |     help="Linear warmup over warmup_steps."
212 | )
213 | parser.add_argument(
214 |     "--verbose_logging",
215 |     action="store_true",
216 |     help="If true, log all information when loading datasets.",
217 | )
218 | parser.add_argument(
219 |     "--cache_dir",
220 |     default=None,
221 |     help="Where do you want to store the pretrained models.",
222 | )
223 | parser.add_argument(
224 |     "--lang_id",
225 |     default=0,
226 |     type=int,
227 |     help="language id of input for language-specific xlm models "
228 |     "(see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)",
229 | )
230 | parser.add_argument(
231 |     "--logging_steps",
232 |     type=int,
233 |     default=500,
234 |     help="Log every X updates steps."
235 | )
236 | parser.add_argument(
237 |     "--save_steps",
238 |     type=int,
239 |     default=500,
240 |     help="Save checkpoint every X updates steps.",
241 | )
242 | parser.add_argument(
243 |     "--eval_all_checkpoints",
244 |     action="store_true",
245 |     help="Evaluate all checkpoints starting with the same prefix as model_name"
246 |     "ending and ending with step number",
247 | )
248 | parser.add_argument(
249 |     "--overwrite_output_dir",
250 |     action="store_true",
251 |     help="Overwrite the content of the output directory",
252 | )
253 | parser.add_argument(
254 |     "--seed",
255 |     type=int,
256 |     default=42,
257 |     help="random seed for initialization"
258 | )
259 | parser.add_argument(
260 |     "--local_rank",
261 |     type=int,
262 |     default=-1,
263 |     help="local_rank for distributed training on gpus",
264 | )
265 | parser.add_argument(
266 |     "--fp16",
267 |     action="store_true",
268 |     help="Whether to use 16-bit (mixed) precision (through NVIDIA apex)",
269 | )
270 | parser.add_argument(
271 |     "--fp16_opt_level",
272 |     type=str,
273 |     default="O1",
274 |     help="For fp16: Apex AMP optimization level selected in"
275 |     "['O0', 'O1', 'O2', and 'O3'].",
276 | )
277 | parser.add_argument(
278 |     "--train_file_path",
279 |     type=str,
280 |     default="/workspace/data/wikitext-103/wiki.train.raw",
281 |     help="File path to language model training file",
282 | )
283 | parser.add_argument(
284 |     "--val_file_path",
285 |     type=str,
286 |     default="/workspace/data/wikitext-103/wiki.valid.raw",
287 |     help="File path to language model training file",
288 | )
289 | parser.add_argument(
290 |     "--eval_steps",
291 |     type=int,
292 |     default=None,
293 |     help="File path to language model training file",
294 | )
295 | 
296 | args = parser.parse_args()
297 | 
298 | hf_logging.enable_default_handler()
299 | hf_logging.set_verbosity_info()
300 | hf_logging.enable_explicit_format()
301 | 
302 | tb_writer = SummaryWriter(log_dir=args.logging_dir)
303 | 
304 | logger = logging.getLogger("")
305 | logger.setLevel(logging.INFO)
306 | fh = logging.FileHandler(f"{args.logging_dir}.log")
307 | sh = logging.StreamHandler(sys.stdout)
308 | formatter = logging.Formatter(
309 |     "[%(asctime)s], %(levelname)s %(message)s",
310 |     datefmt="%a, %d %b %Y %H:%M:%S",
311 | )
312 | fh.setFormatter(formatter)
313 | sh.setFormatter(formatter)
314 | logger.addHandler(fh)
315 | logger.addHandler(sh)
316 | logger.info("\n --> Starting logger:\n" + "=" * 55 + "\n")
317 | 
318 | logger.warning(
319 |     f"Process rank: {args.local_rank}, \
320 |     distributed training: {bool(args.local_rank != -1)}, \
321 |     16-bits training: {args.fp16}"
322 | )
323 | 
324 | 
325 | ##########################################
326 | #
327 | # Replace Huggingface - TextDataset
328 | #
329 | ##########################################
330 | 
331 | # https://github.com/tqdm/tqdm/issues/458
332 | def provide_progress_bar(
333 |     function, estimated_time, tstep=0.2, tqdm_kwargs={}, args=[], kwargs={}
334 | ):
335 |     ret = [None]  # Mutable var so the function can store its return value
336 | 
337 |     def myrunner(function, ret, *args, **kwargs):
338 |         ret[0] = function(*args, **kwargs)
339 | 
340 |     thread = threading.Thread(
341 |         target=myrunner, args=(function, ret) + tuple(args), kwargs=kwargs
342 |     )
343 |     pbar = tqdm.tqdm(total=estimated_time, **tqdm_kwargs)
344 | 
345 |     thread.start()
346 |     while thread.is_alive():
347 |         thread.join(timeout=tstep)
348 |         pbar.update(tstep)
349 |     pbar.close()
350 |     return ret[0]
351 | 
352 | 
353 | def progress_wrapped(estimated_time, tstep=0.2, tqdm_kwargs={}):
354 |     def real_decorator(function):
355 |         @functools.wraps(function)
356 |         def wrapper(*args, **kwargs):
357 |             return provide_progress_bar(
358 |                 function,
359 |                 estimated_time=estimated_time,
360 |                 tstep=tstep,
361 |                 tqdm_kwargs=tqdm_kwargs,
362 |                 args=args,
363 |                 kwargs=kwargs,
364 |             )
365 | 
366 |         return wrapper
367 |     return real_decorator
368 | 
369 | 
370 | class TextDataset(Dataset):
371 |     # Ugly HACK on older transformers
372 |     # Use same code as Huggingface TextDataset
373 |     def __init__(
374 |         self,
375 |         tokenizer: PreTrainedTokenizer,
376 |         file_path: str,
377 |         block_size: int,
378 |         overwrite_cache=False,
379 |         cache_dir: Optional[str] = None,
380 |     ):
381 |         assert os.path.isfile(
382 |             file_path), f"Input file path {file_path} not found"
383 |         block_size = block_size - \
384 |             tokenizer.num_special_tokens_to_add(pair=False)
385 | 
386 |         directory, filename = os.path.split(file_path)
387 |         cached_features_file = os.path.join(
388 |             cache_dir if cache_dir is not None else directory,
389 |             "cached_lm_{}_{}_{}".format(
390 |                 tokenizer.__class__.__name__,
391 |                 str(block_size),
392 |                 filename,
393 |             ),
394 |         )
395 | 
396 |         # Make sure only the first process in distributed training processes the dataset,
397 |         # and the others will use the cache.
398 |         @progress_wrapped(estimated_time=200)
399 |         def tokenize_text(text):
400 |             return tokenizer.tokenize(text)
401 | 
402 |         @progress_wrapped(estimated_time=300)
403 |         def convert_tokens_to_ids(tokenized_text):
404 |             return tokenizer.convert_tokens_to_ids(tokenized_text)
405 | 
406 |         if os.path.exists(cached_features_file) and not overwrite_cache:
407 |             start = time.time()
408 |             with open(cached_features_file, "rb") as handle:
409 |                 self.examples = pickle.load(handle)
410 |             logger.info(
411 |                 f"Loading features from cached file {cached_features_file} [took %.3f s]",
412 |                 time.time() - start,
413 |             )
414 | 
415 |         else:
416 |             logger.info(
417 |                 f"Creating features from dataset file at {directory}\n\n")
418 | 
419 |             self.examples = []
420 |             with open(file_path, encoding="utf-8") as f:
421 |                 text = f.read()
422 | 
423 |             # For large texts and models, this could take a long time
424 |             # Done i two steps, since each part can take between 5-10 min
425 |             start = time.time()
426 |             text = tokenize_text(text)
427 |             logger.info("Tokenizing text [took %.3f s]", time.time() - start)
428 |             start = time.time()
429 |             tokenized_text = convert_tokens_to_ids(text)
430 |             logger.info(
431 |                 "Converting text to id [took %.3f s]\n", time.time() - start)
432 | 
433 |             start = time.time()
434 |             for i in range(
435 |                 0, len(tokenized_text) - block_size + 1, block_size
436 |             ):  # Truncate in block of block_size
437 |                 self.examples.append(
438 |                     tokenizer.build_inputs_with_special_tokens(
439 |                         tokenized_text[i: i + block_size]
440 |                     )
441 |                 )
442 |             logger.info(
443 |                 "Build tokenizer inputs by block_size length [took %.3f s]",
444 |                 time.time() - start,
445 |             )
446 | 
447 |             start = time.time()
448 |             with open(cached_features_file, "wb") as handle:
449 |                 pickle.dump(self.examples, handle,
450 |                             protocol=pickle.HIGHEST_PROTOCOL)
451 |             logger.info(
452 |                 "Saving features into cached file %s [took %.3f s]",
453 |                 cached_features_file,
454 |                 time.time() - start,
455 |             )
456 | 
457 |     def __len__(self):
458 |         return len(self.examples)
459 | 
460 |     def __getitem__(self, i) -> torch.Tensor:
461 |         return torch.tensor(self.examples[i], dtype=torch.long)
462 | 
463 | 
464 | ###########################################################
465 | #
466 | # Longformer conversion
467 | #
468 | ###########################################################
469 | 
470 | # TODO: Huggingface transformers v. >3.5.1 breaks this
471 | class LongModelSelfAttention(LongformerSelfAttention):
472 |     def forward(
473 |         self,
474 |         hidden_states,
475 |         attention_mask=None,
476 |         head_mask=None,
477 |         encoder_hidden_states=None,
478 |         encoder_attention_mask=None,
479 |         output_attentions=False,
480 |     ):
481 |         print()
482 | 
483 |         return super().forward(
484 |             hidden_states,
485 |             attention_mask=attention_mask,
486 |         )
487 | 
488 | 
489 | # Load initial model
490 | MODEL: PreTrainedModel
491 | 
492 | if is_roberta_based_model(args.model_name_or_path) == "xlm-roberta":
493 |     MODEL = XLMRobertaForMaskedLM
494 | elif is_roberta_based_model(args.model_name_or_path) == "roberta":
495 |     MODEL = RobertaForMaskedLM
496 | else:
497 |     raise NotImplementedError(
498 |         "Currently only supports roberta-based architectures.")
499 | 
500 | 
501 | class LongModelForMaskedLM:
502 |     def __init__(self, config):
503 |         super().__init__(config)
504 |         print(f"\n{color.YELLOW}Converting models to Longformer is currently only tested for RoBERTa like architectures.{color.END}")
505 |         for i, layer in enumerate(self.roberta.encoder.layer):
506 |             layer.attention.self = LongModelSelfAttention(config, layer_id=i)
507 | 
508 | 
509 | def create_long_model(
510 |         save_model_to,
511 |         model,
512 |         tokenizer,
513 |         attention_window,
514 |         model_max_length
515 | ):
516 | 
517 |     config = model.config
518 |     position_embeddings = model.roberta.embeddings.position_embeddings
519 | 
520 |     tokenizer.model_max_length = model_max_length
521 |     tokenizer.init_kwargs['model_max_length'] = model_max_length
522 |     current_model_max_length, embed_size = position_embeddings.weight.shape
523 | 
524 |     # NOTE: RoBERTa has positions 0,1 reserved
525 |     # embedding size is max position + 2
526 |     model_max_length += 2
527 |     config.max_position_embeddings = model_max_length
528 |     assert model_max_length > current_model_max_length, \
529 |         "New model max_length must be longer than current max_length"
530 | 
531 |     # BUG for XLM: Need to make all zeros sice too large base model
532 |     new_pos_embed = position_embeddings.weight.new_zeros(
533 |         model_max_length, embed_size
534 |     )
535 | 
536 |     k = 2
537 |     step = current_model_max_length - 2
538 |     while k < model_max_length - 1:
539 |         new_pos_embed[k:(
540 |             k + step)] = position_embeddings.weight[2:]
541 |         k += step
542 | 
543 |     # HACK for Huggingface transformers >=3.4.0 and < 4.0
544 |     # https://github.com/huggingface/transformers/issues/6465#issuecomment-719042969
545 |     position_embeddings.weight.data = new_pos_embed
546 |     model.roberta.embeddings.position_embeddings.num_embeddings = len(
547 |         new_pos_embed.data
548 |     )
549 |     num_model_embeddings = position_embeddings.num_embeddings
550 |     model.roberta.embeddings.position_ids = torch.arange(
551 |         0, num_model_embeddings
552 |     )[None]
553 | 
554 |     # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
555 |     config.attention_window = [attention_window] * config.num_hidden_layers
556 |     for i, layer in enumerate(model.roberta.encoder.layer):
557 |         longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
558 |         longformer_self_attn.query = layer.attention.self.query
559 |         longformer_self_attn.key = layer.attention.self.key
560 |         longformer_self_attn.value = layer.attention.self.value
561 | 
562 |         longformer_self_attn.query_global = layer.attention.self.query
563 |         longformer_self_attn.key_global = layer.attention.self.key
564 |         longformer_self_attn.value_global = layer.attention.self.value
565 | 
566 |         layer.attention.self = longformer_self_attn
567 | 
568 |     logger.info(f'saving model to {save_model_to}')
569 |     model.save_pretrained(save_model_to)
570 |     tokenizer.save_pretrained(save_model_to)
571 |     return model, tokenizer
572 | 
573 | 
574 | def copy_proj_layers(model):
575 |     for _, layer in enumerate(model.roberta.encoder.layer):
576 |         layer.attention.self.query_global = layer.attention.self.query
577 |         layer.attention.self.key_global = layer.attention.self.key
578 |         layer.attention.self.value_global = layer.attention.self.value
579 |     return model
580 | 
581 | 
582 | def pretrain_and_evaluate(
583 |     training_args, data_args, model, tokenizer, eval_only, model_path
584 | ):
585 |     val_dataset = TextDataset(
586 |         tokenizer=tokenizer,
587 |         file_path=data_args.val_file_path,
588 |         block_size=tokenizer.max_len,
589 |     )
590 |     if eval_only:
591 |         train_dataset = val_dataset
592 |     else:
593 |         logger.info(
594 |             f"Loading and tokenizing training data is usually slow: {data_args.train_file_path}"
595 |         )
596 |         train_dataset = TextDataset(
597 |             tokenizer=tokenizer,
598 |             file_path=data_args.train_file_path,
599 |             block_size=tokenizer.max_len,
600 |         )
601 | 
602 |     data_collator = DataCollatorForLanguageModeling(
603 |         tokenizer=tokenizer, mlm=True, mlm_probability=0.15
604 |     )
605 | 
606 |     trainer = Trainer(
607 |         model=model,
608 |         args=training_args,
609 |         data_collator=data_collator,
610 |         train_dataset=train_dataset,
611 |         eval_dataset=val_dataset,
612 |         prediction_loss_only=True,
613 |     )
614 | 
615 |     eval_loss = trainer.evaluate()
616 |     eval_loss = eval_loss["eval_loss"]
617 |     print(f"Initial eval bpc: {color.GREEN}{eval_loss/math.log(2)}{color.END}")
618 |     logger.info(f"Initial eval bpc: {eval_loss/math.log(2)}")
619 | 
620 |     if not eval_only:
621 |         trainer.train(model_path=model_path)
622 |         trainer.save_model()
623 | 
624 |         eval_loss = trainer.evaluate()
625 |         eval_loss = eval_loss["eval_loss"]
626 |         print(
627 |             f"Eval bpc after pretraining: \
628 |             {color.GREEN}{eval_loss/math.log(2)}{color.END}"
629 |         )
630 |         logger.info(f"Eval bpc after pretraining: {eval_loss/math.log(2)}")
631 | 
632 | 
633 | @dataclass
634 | class ModelArguments:
635 |     """Huggingface parameters for the model training."""
636 | 
637 |     model_name_or_path: str = field(
638 |         default=None,
639 |         metadata={
640 |             "help": "Name of pretrained model to load for model and tokenizer"
641 |             ", based on huggingface.co/models, ex 'roberta-base'"
642 |         },
643 |     )
644 |     model_name: str = field(
645 |         default="roberta-base-long-4096-lm",
646 |         metadata={"help": "Name to use when saving model."},
647 |     )
648 |     attention_window: int = field(
649 |         default=512,
650 |         metadata={"help": "Size of attention window"}
651 |     )
652 |     model_max_length: int = field(
653 |         default=4096,
654 |         metadata={"help": "Maximum position"}
655 |     )
656 |     cache_dir: Optional[str] = field(
657 |         default=None,
658 |         metadata={
659 |             "help": "Where do you want to store the pretrained models."
660 |         },
661 |     )
662 | 
663 | 
664 | @dataclass
665 | class DataTrainingArguments:
666 |     """Training and validation data arguments."""
667 | 
668 |     val_file_path: str = field(
669 |         default="/workspace/data/wikitext-103-raw/wiki.valid.raw",
670 |         metadata={"help": "File for training a Language Model"},
671 |     )
672 |     train_file_path: str = field(
673 |         default="/workspace/data/wikitext-103-raw/wiki.train.raw",
674 |         metadata={"help": "File for evaluating a Language Model"},
675 |     )
676 | 
677 | 
678 | def main():
679 |     ############################################
680 |     #
681 |     # Define model params
682 |     #
683 |     ############################################
684 | 
685 |     parser = HfArgumentParser(
686 |         (ModelArguments, DataTrainingArguments, TrainingArguments)
687 |     )
688 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
689 | 
690 |     set_seed(training_args.seed)
691 | 
692 |     if (
693 |         os.path.exists(training_args.output_dir)
694 |         and os.listdir(training_args.output_dir)
695 |         and training_args.do_train
696 |         and not training_args.overwrite_output_dir
697 |     ):
698 |         raise ValueError(
699 |             f"Output directory ({training_args.output_dir}) \
700 |             already exists and is not empty.\
701 |             Use --overwrite_output_dir to overcome."
702 |         )
703 | 
704 |     ###########################################
705 |     #
706 |     # RUN
707 |     #
708 |     ###########################################
709 | 
710 |     start = time.time()
711 |     print("---------------------------------------------------------")
712 |     print(
713 |         f"\nLoading from Huggingface pretrained model: \
714 |         `{color.BOLD}{color.GREEN}\
715 |         {model_args.model_name_or_path}\
716 |         {color.END}{color.END}` \
717 |         with name: {model_args.model_name}\n"
718 |     )
719 | 
720 |     model = AutoModelForMaskedLM.from_pretrained(
721 |         model_args.model_name_or_path,
722 |         cache_dir=model_args.cache_dir,
723 |     )
724 |     tokenizer = AutoTokenizer.from_pretrained(
725 |         model_args.model_name_or_path,
726 |         model_max_length=model_args.model_max_length,
727 |         cache_dir=model_args.cache_dir,
728 |         use_fast=True,
729 |     )
730 | 
731 |     print(f"{color.RED}Creating Longformer model{color.END}")
732 |     model_path = training_args.output_dir
733 |     if not os.path.exists(model_path):
734 |         os.makedirs(model_path)
735 | 
736 |     logger.info(
737 |         f"Converting {model_args.model_name_or_path} \
738 |         into {model_args.model_name}"
739 |     )
740 |     model, tokenizer = create_long_model(
741 |         save_model_to=model_path,
742 |         model=model,
743 |         tokenizer=tokenizer,
744 |         attention_window=model_args.attention_window,
745 |         model_max_length=model_args.model_max_length,
746 |     )
747 | 
748 |     print(f"{color.RED}Loading Model{color.END}")
749 |     logger.info(f"Loading the model from {model_path}")
750 |     model = LongModelForMaskedLM.from_pretrained(model_path)
751 |     tokenizer = AutoTokenizer.from_pretrained(
752 |         model_path,
753 |         model_max_length=model_args.model_max_length,
754 |         use_fast=True
755 |     )
756 | 
757 |     print(f"{color.RED}Evaluate{color.END}")
758 |     logger.info(
759 |         f"Pretraining \
760 |         {model_args.model_name_or_path}-{model_args.model_max_length}... "
761 |     )
762 |     pretrain_and_evaluate(
763 |         training_args,
764 |         data_args,
765 |         model,
766 |         tokenizer,
767 |         eval_only=False,
768 |         model_path=training_args.output_dir,
769 |     )
770 | 
771 |     print(
772 |         f"{color.PURPLE}TIME elapsed{color.END}: {datetime.datetime.fromtimestamp(time.time()-start).strftime('%d days, %H:%M:%S')}"
773 |     )
774 | 
775 |     logger.info(
776 |         "Copying local projection layers into global projection layers..."
777 |     )
778 |     model = copy_proj_layers(model)
779 |     logger.info(f"Saving model to {model_path}")
780 |     model.save_pretrained(model_path)
781 | 
782 |     print(f"{color.RED}Loading Done model{color.END}")
783 | 
784 |     logger.info(f"Loading the model from {model_path}")
785 |     model = LongModelForMaskedLM.from_pretrained(model_path)
786 |     tokenizer = AutoTokenizer.from_pretrained(model_path)
787 | 
788 | 
789 | if __name__ == "__main__":
790 |     main()
791 | 


--------------------------------------------------------------------------------
/scripts/finetune_qa_models.py:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/env python3
   2 | 
   3 | from __future__ import print_function
   4 | import argparse
   5 | from collections import Counter
   6 | from dataclasses import dataclass, field
   7 | import logging
   8 | import os
   9 | import re
  10 | import string
  11 | import sys
  12 | from typing import Any, Dict, List, Optional, Union
  13 | 
  14 | import datasets
  15 | import torch
  16 | from torch.utils.tensorboard import SummaryWriter
  17 | from tqdm.auto import tqdm
  18 | from transformers import logging as hf_logging
  19 | from transformers import (
  20 |     AutoModelForQuestionAnswering,
  21 |     AutoTokenizer,
  22 |     DataCollator,
  23 |     HfArgumentParser,
  24 |     PreTrainedModel,
  25 |     PreTrainedTokenizer,
  26 |     set_seed,
  27 |     Trainer,
  28 |     TrainingArguments,
  29 | )
  30 | 
  31 | 
  32 | # helper
  33 | class color:
  34 |     PURPLE = "\033[95m"
  35 |     CYAN = "\033[96m"
  36 |     DARKCYAN = "\033[36m"
  37 |     BLUE = "\033[94m"
  38 |     GREEN = "\033[92m"
  39 |     YELLOW = "\033[93m"
  40 |     RED = "\033[91m"
  41 |     BOLD = "\033[1m"
  42 |     UNDERLINE = "\033[4m"
  43 |     END = "\033[0m"
  44 | 
  45 | 
  46 | @dataclass
  47 | class QADataset:
  48 |     """Collection for the language to load in HF datasets
  49 | 
  50 |     args:
  51 |     - langs: includes the number of languages to load,
  52 |     - text_on_eval: the print statements when evaluating the datasets
  53 |     - data: the tokenized datasets
  54 |     """
  55 |     langs: List[str]
  56 |     text_on_eval: List[str]
  57 |     data: List[Any] = None
  58 | 
  59 | 
  60 | SQUAD = QADataset(
  61 |     ["squad"],
  62 |     [
  63 |         "SQuAD 1.1 validation dataset"
  64 |     ]
  65 | )
  66 | 
  67 | 
  68 | # base xquad
  69 | XQUAD = QADataset(
  70 |     ["ar", "de", "el", "en", "es", "hi", "ru", "th", "tr", "vi", "zh", ],
  71 |     [
  72 |         "XQuAD Arabic validation",
  73 |         "XQuAD German validation",
  74 |         "XQuAD Greek validation",
  75 |         "XQuAD English validation",
  76 |         "XQuAD Spanish validation",
  77 |         "XQuAD Hindi validation",
  78 |         "XQuAD Russian validation",
  79 |         "XQuAD Thai validation",
  80 |         "XQuAD Turkish validation",
  81 |         "XQuAD Vietnamese validation",
  82 |         "XQuAD Chinese validation",
  83 |     ]
  84 | )
  85 | 
  86 | # base mlqa
  87 | MLQA = QADataset(
  88 |     ["ar", "de", "en", "es", "hi", "vi", "zh"],
  89 |     [
  90 |         "SQuAD 1.1 validation dataset",
  91 |         "MLQA Arabic validation",
  92 |         "MLQA German validation",
  93 |         "MLQA English validation",
  94 |         "MLQA Spanish validation",
  95 |         "MLQA Hindi validation",
  96 |         "MLQA Vietnamese validation",
  97 |         "MLQA Chinese validation",
  98 |     ]
  99 | )
 100 | 
 101 | 
 102 | def check_positive_concats(nr_concats):
 103 |     """Helper funtion for argparse
 104 |     Instructs how many contexts to concatinate together.
 105 |     Defualt for longer contexts are three.
 106 |     More can be used, but then it requires larger GPUs.
 107 | 
 108 |     *NOTE* this is only used when using the datasets:
 109 |     - squad_long or
 110 |     - xquad_long
 111 |     """
 112 |     try:
 113 |         nr_concats_int = int(nr_concats)
 114 |         if nr_concats_int <= 0:
 115 |             raise argparse.ArgumentTypeError(
 116 |                 f"--nr_concats expects a positive int as a value, \
 117 |                 not {nr_concats}"
 118 |             )
 119 |     except ValueError as e:
 120 |         if hasattr(e, "message"):
 121 |             print(e.message)
 122 |         else:
 123 |             print(e)
 124 |     return nr_concats_int
 125 | 
 126 | 
 127 | parser = argparse.ArgumentParser()
 128 | parser.add_argument(
 129 |     "--nr_concats",
 130 |     default=3,
 131 |     type=check_positive_concats,
 132 |     help="How many context to concatinate when using a `long` QA dataset.\n"
 133 |     "3 is default and yields an average context lenght of 2048 tokens",
 134 | )
 135 | parser.add_argument(
 136 |     "--model_name",
 137 |     default=None,
 138 |     type=str,
 139 |     help="Name to save the model as.",
 140 | )
 141 | parser.add_argument(
 142 |     "--output_dir",
 143 |     default=None,
 144 |     type=str,
 145 |     help="The output directory for the model checkpoints and predictions.",
 146 | )
 147 | parser.add_argument(
 148 |     "--model_type",
 149 |     default=None,
 150 |     type=str,
 151 |     help="Model type selected from Huggingface ex: `roberta, xlm-roberta`",
 152 | )
 153 | parser.add_argument(
 154 |     "--model_name_or_path",
 155 |     default=None,
 156 |     type=str,
 157 |     required=True,
 158 |     help="Path to pretrained model from huggingface.co/models. \n"
 159 |     "Only tested on `xlm-roberta-base` and `roberta-base`.",
 160 | )
 161 | parser.add_argument(
 162 |     "--datasets",
 163 |     default=None,
 164 |     type=str,
 165 |     required=True,
 166 |     help="Define one of Huggingface Datasets Question Answering Tasks.",
 167 | )
 168 | parser.add_argument(
 169 |     "--train_file_path",
 170 |     default=None,
 171 |     type=str,
 172 |     help="File path to where torch training file is stored (.pt files).",
 173 | )
 174 | parser.add_argument(
 175 |     "--valid_file_path",
 176 |     default=None,
 177 |     type=str,
 178 |     help="File path to where torch validation file is stored (.pt files).",
 179 | )
 180 | parser.add_argument(
 181 |     "--data_dir",
 182 |     default=None,
 183 |     type=str,
 184 |     help="Directory to where to store training and validation torch files.",
 185 | )
 186 | parser.add_argument(
 187 |     "--logging_dir",
 188 |     default=None,
 189 |     type=str,
 190 |     help="The output directory where the the loggs are stored.",
 191 | )
 192 | parser.add_argument(
 193 |     "--max_length",
 194 |     default=512,
 195 |     type=int,
 196 |     choices=[
 197 |         512,
 198 |         1024,
 199 |         2048,
 200 |         4096,
 201 |     ],
 202 |     help="The maxiumum position of the model",
 203 | )
 204 | parser.add_argument(
 205 |     "--attention_window",
 206 |     default=512,
 207 |     type=int,
 208 |     help="Size of attention window",
 209 | )
 210 | parser.add_argument(
 211 |     "--do_train",
 212 |     action="store_true",
 213 |     help="Whether to run training."
 214 | )
 215 | parser.add_argument(
 216 |     "--do_eval",
 217 |     action="store_true",
 218 |     help="Whether to run eval on the dev set."
 219 | )
 220 | parser.add_argument(
 221 |     "--evaluate_during_training",
 222 |     action="store_true",
 223 |     help="Run evaluation during training at each logging step.",
 224 | )
 225 | parser.add_argument(
 226 |     "--per_device_train_batch_size",
 227 |     default=8,
 228 |     type=int,
 229 |     help="Batch size per GPU/CPU for training.",
 230 | )
 231 | parser.add_argument(
 232 |     "--per_device_eval_batch_size",
 233 |     default=8,
 234 |     type=int,
 235 |     help="Batch size per GPU/CPU for evaluation.",
 236 | )
 237 | parser.add_argument(
 238 |     "--learning_rate",
 239 |     default=5e-5,
 240 |     type=float,
 241 |     help="The initial learning rate for Adam.",
 242 | )
 243 | parser.add_argument(
 244 |     "--gradient_accumulation_steps",
 245 |     type=int,
 246 |     default=1,
 247 |     help="Number of updates to acummulate the gradient for before updating.",
 248 | )
 249 | parser.add_argument(
 250 |     "--weight_decay",
 251 |     default=0.0,
 252 |     type=float,
 253 |     help="Weight decay if we apply some."
 254 | )
 255 | parser.add_argument(
 256 |     "--adam_epsilon",
 257 |     default=1e-8,
 258 |     type=float,
 259 |     help="Epsilon for Adam optimizer."
 260 | )
 261 | parser.add_argument(
 262 |     "--max_grad_norm",
 263 |     default=1.0,
 264 |     type=float,
 265 |     help="Max gradient norm."
 266 | )
 267 | parser.add_argument(
 268 |     "--num_train_epochs",
 269 |     default=3.0,
 270 |     type=float,
 271 |     help="Total number of training epochs to perform.",
 272 | )
 273 | parser.add_argument(
 274 |     "--max_steps",
 275 |     default=-1,
 276 |     type=int,
 277 |     help="If > 0: set total number of training steps to perform."
 278 |     " Override num_train_epochs.",
 279 | )
 280 | parser.add_argument(
 281 |     "--warmup_steps",
 282 |     default=0,
 283 |     type=int,
 284 |     help="Linear warmup over warmup_steps."
 285 | )
 286 | parser.add_argument(
 287 |     "--verbose_logging",
 288 |     action="store_true",
 289 |     help="If true, display all logging messages from huggingface libraries."
 290 |     "A number of warnings are expected for a normal SQuAD evaluation.",
 291 | )
 292 | parser.add_argument(
 293 |     "--lang_id",
 294 |     default=0,
 295 |     type=int,
 296 |     help="language id of input for language-specific xlm models.",
 297 | )
 298 | parser.add_argument(
 299 |     "--logging_steps", type=int, default=500, help="Log every X updates steps."
 300 | )
 301 | parser.add_argument(
 302 |     "--save_steps",
 303 |     type=int,
 304 |     default=500,
 305 |     help="Save checkpoint every X updates steps.",
 306 | )
 307 | parser.add_argument(
 308 |     "--eval_all_checkpoints",
 309 |     action="store_true",
 310 |     help="Evaluate all checkpoints starting with the same prefix as model_name",
 311 | )
 312 | parser.add_argument(
 313 |     "--overwrite_output_dir",
 314 |     action="store_true",
 315 |     help="Overwrite the content of the output directory",
 316 | )
 317 | parser.add_argument(
 318 |     "--seed", type=int, default=42, help="random seed for initialization"
 319 | )
 320 | parser.add_argument(
 321 |     "--local_rank",
 322 |     type=int,
 323 |     default=-1,
 324 |     help="local_rank for distributed training on gpus",
 325 | )
 326 | parser.add_argument(
 327 |     "--fp16",
 328 |     action="store_true",
 329 |     help="Whether to use 16-bit (mixed) precision (through NVIDIA apex).",
 330 | )
 331 | parser.add_argument(
 332 |     "--fp16_opt_level",
 333 |     type=str,
 334 |     default="O1",
 335 |     help="For fp16: Apex AMP optimization level selected in "
 336 |     "['O0', 'O1', 'O2', and 'O3']."
 337 | )
 338 | parser.add_argument(
 339 |     "--prediction_loss_only",
 340 |     action="store_true",
 341 |     help="If only prediciton loss should be returned",
 342 | )
 343 | parser.add_argument(
 344 |     "--eval_steps",
 345 |     type=int,
 346 |     default=500,
 347 |     help="If input should be tokenized to only lowercase",
 348 | )
 349 | parser.add_argument(
 350 |     "--do_lowercase",
 351 |     action="store_true",
 352 |     help="If input should be lowercase or not when tokenizing",
 353 | )
 354 | 
 355 | 
 356 | args = parser.parse_args()
 357 | 
 358 | hf_logging.enable_default_handler()
 359 | hf_logging.set_verbosity_info()
 360 | hf_logging.enable_explicit_format()
 361 | 
 362 | # Setup logging
 363 | tb_writer = SummaryWriter(log_dir=args.logging_dir)
 364 | 
 365 | logger = logging.getLogger("")
 366 | logger.setLevel(logging.INFO)
 367 | 
 368 | fh = logging.FileHandler(f"{args.logging_dir}.log")
 369 | sh = logging.StreamHandler(sys.stdout)
 370 | formatter = logging.Formatter(
 371 |     "[%(asctime)s], %(levelname)s %(message)s",
 372 |     datefmt="%a, %d %b %Y %H:%M:%S",
 373 | )
 374 | fh.setFormatter(formatter)
 375 | sh.setFormatter(formatter)
 376 | logger.addHandler(fh)
 377 | logger.addHandler(sh)
 378 | logger.info("\n --> Starting logger:\n" + "=" * 55 + "\n")
 379 | 
 380 | logger.warning(
 381 |     f"Process rank: {args.local_rank}, \
 382 |     distributed training: {bool(args.local_rank != -1)}, \
 383 |     16-bits training: {args.fp16}"
 384 | )
 385 | 
 386 | 
 387 | logger.info("=" * 50)
 388 | logger.info("=" + "\t" * 6 + " =")
 389 | logger.info("=" + "\tInitialization" + "\t" * 4 + " =")
 390 | logger.info("=" + "\t" * 6 + " =")
 391 | logger.info("=" * 50 + "\n\n")
 392 | 
 393 | 
 394 | tokenizer = AutoTokenizer.from_pretrained(
 395 |     args.model_name_or_path,
 396 |     cache_dir=args.cache_dir,
 397 |     do_lowercase=args.do_lowercase,
 398 |     pad_to_max_length=True,
 399 |     max_length=args.max_length,
 400 |     truncation=True,
 401 |     use_fast=True,
 402 | )
 403 | model = AutoModelForQuestionAnswering.from_pretrained(
 404 |     args.model_name_or_path,
 405 |     cache_dir=args.cache_dir,
 406 | )
 407 | 
 408 | 
 409 | #########################################
 410 | #                                       #
 411 | #       SQuADs Evaluation metrics       #
 412 | #                                       #
 413 | #########################################
 414 | 
 415 | def normalize_answer(s: str) -> str:
 416 |     """Lower text and remove punctuation, articles and extra whitespace."""
 417 | 
 418 |     def remove_articles(text):
 419 |         return re.sub(r"\b(a|an|the)\b", " ", text)
 420 | 
 421 |     def white_space_fix(text):
 422 |         return " ".join(text.split())
 423 | 
 424 |     def remove_punc(text):
 425 |         exclude = set(string.punctuation)
 426 |         return "".join(ch for ch in text if ch not in exclude)
 427 | 
 428 |     def lower(text):
 429 |         return text.lower()
 430 | 
 431 |     return white_space_fix(remove_articles(remove_punc(lower(s))))
 432 | 
 433 | 
 434 | def f1_score(prediction, ground_truth):
 435 |     prediction_tokens = normalize_answer(prediction).split()
 436 |     ground_truth_tokens = normalize_answer(ground_truth).split()
 437 |     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
 438 |     num_same = sum(common.values())
 439 |     if num_same == 0:
 440 |         return 0
 441 |     precision = 1.0 * num_same / len(prediction_tokens)
 442 |     recall = 1.0 * num_same / len(ground_truth_tokens)
 443 |     f1 = (2 * precision * recall) / (precision + recall)
 444 |     return f1
 445 | 
 446 | 
 447 | def exact_match_score(prediction: str, ground_truth: str) -> bool:
 448 |     return normalize_answer(prediction) == normalize_answer(ground_truth)
 449 | 
 450 | 
 451 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
 452 |     scores_for_ground_truths = []
 453 |     for ground_truth in ground_truths:
 454 |         score = metric_fn(prediction, ground_truth)
 455 |         scores_for_ground_truths.append(score)
 456 |     return max(scores_for_ground_truths)
 457 | 
 458 | 
 459 | def evaluate(
 460 |     gold_answers: List[str],
 461 |     predictions: List[str]
 462 | ) -> Dict[Union[str, float]]:
 463 | 
 464 |     f1 = exact_match = total = 0
 465 | 
 466 |     for ground_truths, prediction in zip(gold_answers, predictions):
 467 |         total += 1
 468 |         exact_match += metric_max_over_ground_truths(
 469 |             exact_match_score, prediction, ground_truths
 470 |         )
 471 |         f1 += metric_max_over_ground_truths(f1_score,
 472 |                                             prediction, ground_truths)
 473 | 
 474 |     exact_match = 100.0 * exact_match / total
 475 |     f1 = 100.0 * f1 / total
 476 | 
 477 |     return {"exact_match": exact_match, "f1": f1}
 478 | 
 479 | 
 480 | ####################################################
 481 | #
 482 | # Evaluation
 483 | #
 484 | ####################################################
 485 | 
 486 | 
 487 | def get_squad_evaluation(
 488 |         valid_dataset: DataCollator,
 489 |         model: PreTrainedModel,
 490 |         tokenizer: PreTrainedTokenizer,
 491 |         dataset_name: str,
 492 |         batch_size: int
 493 | ) -> None:
 494 |     """
 495 |     Makes a prediction and evaluates it based on the trained model
 496 |     The evaluation is based on the SQuAD evaluation metric:
 497 |     valdid_datset is expected to be converted to a torch Tensor type:
 498 |     """
 499 | 
 500 |     logging.info(f"Generating perdictions for dataset '{dataset_name}'")
 501 |     dataloader = torch.utils.data.DataLoader(
 502 |         valid_dataset, batch_size=batch_size)
 503 | 
 504 |     # predictions
 505 |     predicted_answers = []
 506 |     with torch.no_grad():
 507 |         for batch in tqdm(dataloader):
 508 |             start_scores, end_scores = model(
 509 |                 input_ids=batch["input_ids"].cuda(),
 510 |                 attention_mask=batch["attention_mask"].cuda(),
 511 |             )
 512 |             for i in range(start_scores.shape[0]):
 513 |                 all_tokens = tokenizer.convert_ids_to_tokens(
 514 |                     batch["input_ids"][i])
 515 |                 answer = " ".join(
 516 |                     all_tokens[
 517 |                         torch.argmax(start_scores[i]):
 518 |                         torch.argmax(end_scores[i]) + 1
 519 |                     ]
 520 |                 )
 521 |                 ans_ids = tokenizer.convert_tokens_to_ids(answer.split())
 522 |                 answer = tokenizer.decode(ans_ids)
 523 |                 predicted_answers.append(answer)
 524 | 
 525 |     valid_dataset.reset_format()
 526 |     predictions = []
 527 |     references = []
 528 |     for ref, pred_answer in zip(valid_dataset, predicted_answers):
 529 |         actual_answer = ref["answers"]["text"]
 530 |         predictions.append(pred_answer)
 531 |         references.append(actual_answer)
 532 | 
 533 |     eval_score = evaluate(references, predictions)
 534 |     logging.info(f"Results from prediction:\n{eval_score}\n" + "=" * 55 + "\n")
 535 | 
 536 | 
 537 | #########################################
 538 | #                                       #
 539 | # Convert train and validation datasets #
 540 | #                                       #
 541 | #########################################
 542 | 
 543 | def get_correct_alignement(context: str, answer):
 544 |     """Some original examples in SQuAD have indices wrong by 1 or 2 character.
 545 |     """
 546 |     gold_text = answer["text"][0]
 547 |     start_idx = answer["answer_start"][0]
 548 |     end_idx = start_idx + len(gold_text)
 549 |     if context[start_idx:end_idx] == gold_text:
 550 |         return start_idx, end_idx
 551 |     elif context[start_idx - 1: end_idx - 1] == gold_text:
 552 |         return start_idx - 1, end_idx - 1
 553 |     elif context[start_idx - 2: end_idx - 2] == gold_text:
 554 |         return start_idx - 2, end_idx - 2
 555 |     else:
 556 |         raise ValueError()
 557 | 
 558 | 
 559 | MAX_CONTEXT_LENGTH = 0
 560 | 
 561 | 
 562 | def convert_to_features(example):
 563 | 
 564 |     encodings = tokenizer.encode_plus(
 565 |         example["question"],
 566 |         example["context"],
 567 |         pad_to_max_length=True,
 568 |         max_length=args.max_length,
 569 |         truncation=True,
 570 |     )
 571 |     context_encodings = tokenizer.encode_plus(example["context"])
 572 | 
 573 |     start_idx, end_idx = get_correct_alignement(
 574 |         example["context"], example["answers"])
 575 |     start_positions_context = context_encodings.char_to_token(start_idx)
 576 |     end_positions_context = context_encodings.char_to_token(end_idx - 1)
 577 | 
 578 |     # FIXME: UGLY HACK because of XLM-R tokenization, works fine with roberta
 579 |     sep_idx = encodings["input_ids"].index(tokenizer.sep_token_id)
 580 |     try:
 581 |         start_positions = start_positions_context + sep_idx + 1
 582 |         end_positions = end_positions_context + sep_idx + 1
 583 | 
 584 |         # if end_positions > 4096:
 585 |         #    start_positions, end_positions = None, None
 586 |     except:
 587 |         start_positions = None
 588 |         end_positions = None
 589 | 
 590 |     encodings.update(
 591 |         {
 592 |             "start_positions": start_positions,
 593 |             "end_positions": end_positions,
 594 |             "attention_mask": encodings["attention_mask"],
 595 |         }
 596 |     )
 597 |     return encodings
 598 | 
 599 | 
 600 | def convert_dataset_to_torch_format(data):
 601 |     data = data.map(convert_to_features).filter(
 602 |         lambda example: (example["start_positions"] is not None)
 603 |         and (example["end_positions"] is not None)
 604 |     )
 605 | 
 606 |     # set the tensor type and the columns which the dataset should return
 607 |     columns = ["input_ids", "attention_mask",
 608 |                "start_positions", "end_positions"]
 609 |     data.set_format(type="torch", columns=columns)
 610 |     print(max(data["start_positions"]))
 611 |     print(data.shape)
 612 |     return data
 613 | 
 614 | 
 615 | ##################
 616 | #
 617 | #  Training
 618 | #
 619 | ##################
 620 | 
 621 | 
 622 | class DummyDataCollator:
 623 |     def __call__(self, batch):
 624 | 
 625 |         input_ids = torch.stack([example["input_ids"] for example in batch])
 626 |         attention_mask = torch.stack(
 627 |             [example["attention_mask"] for example in batch])
 628 |         start_positions = torch.stack(
 629 |             [example["start_positions"] for example in batch])
 630 |         end_positions = torch.stack(
 631 |             [example["end_positions"] for example in batch])
 632 | 
 633 |         return {
 634 |             "input_ids": input_ids,
 635 |             "start_positions": start_positions,
 636 |             "end_positions": end_positions,
 637 |             "attention_mask": attention_mask,
 638 |         }
 639 | 
 640 | 
 641 | @ dataclass
 642 | class ModelArguments:
 643 | 
 644 |     model_name_or_path: str = field(
 645 |         metadata={
 646 |             "help": "Path to pretrained model or model identifier"
 647 |         }
 648 |     )
 649 |     tokenizer_name: Optional[str] = field(
 650 |         default=None,
 651 |         metadata={
 652 |             "help": "Pretrained tokenizer name or path"
 653 |         },
 654 |     )
 655 |     cache_dir: Optional[str] = field(
 656 |         default=None,
 657 |         metadata={
 658 |             "help": "Where do you want to store the pretrained models"
 659 |         },
 660 |     )
 661 |     do_lowercase: bool = field(
 662 |         default=False,
 663 |         metadata={"help": "If tokenizer should make all to lowercase."},
 664 |     )
 665 |     max_seq_length: Optional[int] = field(
 666 |         default=384,
 667 |         metadata={"help": "TODO"},
 668 |     )
 669 |     doc_stride: Optional[int] = field(
 670 |         default=128,
 671 |         metadata={"help": "TODO"},
 672 |     )
 673 |     model_type: Optional[str] = field(
 674 |         default=None,
 675 |         metadata={"help": "TODO"},
 676 |     )
 677 | 
 678 | 
 679 | @dataclass
 680 | class DataTrainingArguments:
 681 | 
 682 |     datasets: str = field(metadata={"help": "The dataset name to load."})
 683 |     data_dir: Optional[str] = field(
 684 |         default=None,
 685 |         metadata={
 686 |             "help": "Path to the dataset containing train and eval datasets."},
 687 |     )
 688 |     train_file_path: Optional[str] = field(
 689 |         default="train_data.pt",
 690 |         metadata={"help": "Path for cached train dataset"},
 691 |     )
 692 |     valid_file_path: Optional[str] = field(
 693 |         default="valid_data.pt",
 694 |         metadata={"help": "Path for cached valid dataset"},
 695 |     )
 696 |     max_length: Optional[int] = field(
 697 |         default=512,
 698 |         metadata={"help": "Max input length for the source text"},
 699 |     )
 700 |     nr_concats: Optional[int] = field(
 701 |         default=3,
 702 |         metadata={"help": "Number of contexts to concatinate"},
 703 |     )
 704 | 
 705 | 
 706 | def load_datasets(
 707 |         languages: QADataset,
 708 |         base_dataset: str = None,
 709 |         concatinate: bool = False,
 710 |         split: str = 'validation',
 711 | ):
 712 |     """Loads a Huggingface dataset based on the `base` dataset
 713 |     (squad/xquad/mlqa)."""
 714 | 
 715 |     dataset: List[Any] = []
 716 | 
 717 |     data: List
 718 |     dataset: str
 719 |     for lang in languages.langs:
 720 |         if base_dataset is not None:
 721 |             dataset = f"{base_dataset}.{lang}"
 722 |             if base_dataset == "mlqa":
 723 |                 dataset = f"{dataset}.{lang}"
 724 | 
 725 |             data = datasets.load_dataset(base_dataset, dataset, split=split)
 726 |         else:
 727 |             data = datasets.load_dataset(lang, split=split)
 728 | 
 729 |         if concatinate:
 730 |             data = concatinate_squad_data(data, args.nr_concats)
 731 |         data = convert_dataset_to_torch_format(data)
 732 |         dataset.append(data)
 733 | 
 734 |     return dataset
 735 | 
 736 | 
 737 | def concatinate_squad_data(d, span=3):
 738 |     """
 739 |     Concatinate "SPAN" number of SQuAD questions together
 740 |     """
 741 | 
 742 |     def get_span(index, span):
 743 |         """
 744 |         Returns the value in a range for whole numbers
 745 | 
 746 |         Ex: index=4, span=5
 747 |             lower=0, upper=5
 748 | 
 749 |             index=5, span=5
 750 |             lower=0, upper=5
 751 | 
 752 |             index=8, span=5
 753 |             lower=5, upper=10
 754 |         """
 755 |         lower_bound = (index) // span
 756 |         lower_bound = lower_bound * span
 757 |         upper_bound = lower_bound + span
 758 |         return lower_bound, upper_bound
 759 | 
 760 |     def set_start_pos(example, idx):
 761 |         """
 762 |         Get correct new starting position when concatinating SQuAD datasets
 763 |         """
 764 |         low, high = get_span(idx, span)
 765 | 
 766 |         # Get new starting position
 767 |         prev_start = 0
 768 |         if idx != low:
 769 |             prev_start = len("".join(data["context"][low:idx]))
 770 | 
 771 |         start_pos = data["answers"][idx]["answer_start"][0]
 772 |         if not isinstance(start_pos, int):
 773 |             start_pos = start_pos[0]
 774 |         new_start = [prev_start + start_pos]
 775 |         example["answers"]["answer_start"] = new_start
 776 |         return example
 777 | 
 778 |     def set_context(example, idx):
 779 |         """
 780 |         Concatinate "SPAN" number of SQuAD samples
 781 |         """
 782 |         low, high = get_span(idx, span)
 783 | 
 784 |         # Get new context
 785 |         example["context"] = "".join(data["context"][low:high])
 786 |         return example
 787 | 
 788 |     # Filters out questions using the same context but different questions
 789 |     data = d.filter(
 790 |         lambda example, idx: example["context"] != d["context"][idx - 1],
 791 |         with_indices=True,
 792 |     )
 793 | 
 794 |     data = data.map(
 795 |         lambda example, idx: set_start_pos(example, idx),
 796 |         with_indices=True
 797 |     )
 798 |     data = data.map(
 799 |         lambda example, idx: set_context(example, idx),
 800 |         with_indices=True
 801 |     )
 802 | 
 803 |     return data
 804 | 
 805 | 
 806 | #################################################################
 807 | #
 808 | #  Main function
 809 | #
 810 | #################################################################
 811 | 
 812 | 
 813 | def main():
 814 | 
 815 |     parser = HfArgumentParser(
 816 |         (ModelArguments, DataTrainingArguments, TrainingArguments)
 817 |     )
 818 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 819 | 
 820 |     # Needed for getting eval_loss for QA in transformer v. 3.0.2 and 4.0.0
 821 |     training_args.label_names = ["start_positions", "end_positions"]
 822 | 
 823 |     if (
 824 |         os.path.exists(training_args.output_dir)
 825 |         and os.listdir(training_args.output_dir)
 826 |         and training_args.do_train
 827 |         and not training_args.overwrite_output_dir
 828 |     ):
 829 |         raise ValueError(
 830 |             f"Output directory ({training_args.output_dir}) \
 831 |             already exists and is not empty. \
 832 |             Use --overwrite_output_dir to overcome."
 833 |         )
 834 | 
 835 |     if data_args.data_dir is None:
 836 |         data_args.data_dir = "."
 837 | 
 838 |     if data_args.train_file_path is None or data_args.valid_file_path is None:
 839 |         data_args.train_file_path = f"{data_args.data_dir}/train_data.pt"
 840 |         data_args.valid_file_path = f"{data_args.data_dir}/val_data.pt"
 841 | 
 842 |     logger.info("Model parameters set: \n", model_args)
 843 |     logging.info(f"Logging to file: {training_args.logging_dir}.log")
 844 | 
 845 |     set_seed(training_args.seed)
 846 | 
 847 |     tokenizer = AutoTokenizer.from_pretrained(
 848 |         model_args.model_name_or_path,
 849 |         cache_dir=model_args.cache_dir,
 850 |         do_lowercase=args.do_lowercase,
 851 |         pad_to_max_length=True,
 852 |         max_length=args.max_length,
 853 |         truncation=True,
 854 |         use_fast=True,
 855 |     )
 856 | 
 857 |     model = AutoModelForQuestionAnswering.from_pretrained(
 858 |         model_args.model_name_or_path,
 859 |         cache_dir=model_args.cache_dir,
 860 |     )
 861 | 
 862 |     if data_args.datasets == "xquad":
 863 |         XQUAD.data = load_datasets(XQUAD, base_dataset="xquad")
 864 | 
 865 |     if data_args.datasets == "mlqa":
 866 |         MLQA.data = load_datasets(MLQA, base_dataset="mlqa")
 867 | 
 868 |     if data_args.datasets == "tydiqa":
 869 |         raise ValueError("Not yet implemented")
 870 | 
 871 |     if data_args.datasets == "xquad_long":
 872 |         XQUAD.data = load_datasets(XQUAD, "xquad", concatinate=True)
 873 | 
 874 |     if data_args.datasets in ["squad_long", "xquad_long"]:
 875 |         train_dataset = load_datasets(
 876 |             SQUAD, split="train", concatinate=True)[0]
 877 |         valid_dataset = load_datasets(SQUAD, concatinate=True)[0]
 878 |         SQUAD.data = valid_dataset
 879 | 
 880 |     if (data_args.datasets in ["xquad", "mlqa", "squad"]):
 881 |         train_dataset = load_datasets(
 882 |             SQUAD, split="train", concatinate=True)[0]
 883 |         valid_dataset = load_datasets(SQUAD, concatinate=True)[0]
 884 |         SQUAD.data = valid_dataset
 885 | 
 886 |     torch.save(train_dataset, data_args.train_file_path)
 887 |     torch.save(valid_dataset, data_args.valid_file_path)
 888 | 
 889 |     train_dataset = torch.load(data_args.train_file_path)
 890 |     valid_dataset = torch.load(data_args.valid_file_path)
 891 | 
 892 |     ####################################
 893 |     #
 894 |     # Train the model
 895 |     #
 896 |     ####################################
 897 | 
 898 |     if training_args.do_train:
 899 | 
 900 |         trainer = Trainer(
 901 |             model=model,
 902 |             args=training_args,
 903 |             train_dataset=train_dataset,
 904 |             eval_dataset=valid_dataset,
 905 |             data_collator=DummyDataCollator(),
 906 |             prediction_loss_only=True,
 907 |         )
 908 | 
 909 |         if training_args.do_train:
 910 |             trainer.train(
 911 |                 model_path=model_args.model_name_or_path
 912 |                 if os.path.isdir(model_args.model_name_or_path)
 913 |                 else None
 914 |             )
 915 |             trainer.save_model()
 916 |             if trainer.is_world_process_zero():
 917 |                 tokenizer.save_pretrained(training_args.output_dir)
 918 | 
 919 |         results = {}
 920 |         if training_args.do_eval and training_args.local_rank in [-1, 0]:
 921 |             logger.info("*** Evaluation ***")
 922 | 
 923 |             eval_output = trainer.evaluate()
 924 |             output_eval_file = os.path.join(
 925 |                 training_args.output_dir, "eval_results.txt"
 926 |             )
 927 |             print("\n'==========================================\n")
 928 |             print("Eval output:     ", eval_output)
 929 |             print("\n'==========================================\n")
 930 | 
 931 |             with open(output_eval_file, "w") as writer:
 932 |                 logger.info("***** Eval results *****")
 933 |                 for key in sorted(eval_output.keys()):
 934 |                     logger.info("  %s = %s", key, str(eval_output[key]))
 935 |                     writer.write("%s = %s\n" % (key, str(eval_output[key])))
 936 |                     print(key, str(eval_output[key]))
 937 | 
 938 |             results.update(eval_output)
 939 | 
 940 |         logging.info("=" * 45)
 941 |         logging.info("Results from evaluation:")
 942 |         logging.info(results)
 943 |         logging.info("\n")
 944 | 
 945 |     logging.info("" * 45)
 946 | 
 947 |     ####################################
 948 |     #
 949 |     # Evaluate the trained model
 950 |     #
 951 |     ####################################
 952 | 
 953 |     if training_args.do_train:
 954 |         tokenizer = AutoTokenizer.from_pretrained(
 955 |             training_args.output_dir,
 956 |             use_fast=True,
 957 |             do_lowercase=args.do_lowercase
 958 |         )
 959 |         model = AutoModelForQuestionAnswering.from_pretrained(
 960 |             training_args.output_dir,
 961 |         )
 962 |     else:
 963 |         try:
 964 |             model_path = training_args.output_dir
 965 |             tokenizer = AutoTokenizer.from_pretrained(
 966 |                 training_args.output_dir,
 967 |                 use_fast=True,
 968 |                 do_lowercase=args.do_lowercase
 969 |             )
 970 |             model = AutoModelForQuestionAnswering.from_pretrained(
 971 |                 training_args.output_dir,
 972 |             )
 973 |         except:
 974 |             model_path = model_args.model_name_or_path
 975 |             tokenizer = AutoTokenizer.from_pretrained(
 976 |                 model_path, use_fast=True, do_lowercase=args.do_lowercase
 977 |             )
 978 |             model = AutoModelForQuestionAnswering.from_pretrained(
 979 |                 model_path
 980 |             )
 981 | 
 982 |     model = model.cuda()
 983 |     model.eval()
 984 | 
 985 |     get_squad_evaluation(
 986 |         SQUAD.data,
 987 |         model,
 988 |         tokenizer,
 989 |         SQUAD.text_on_eval,
 990 |         training_args.per_device_eval_batch_size,
 991 |     )
 992 |     if data_args.datasets == "xquad" or data_args.datasets == "xquad_long":
 993 |         for i, _ in enumerate(XQUAD.langs):
 994 |             get_squad_evaluation(
 995 |                 XQUAD.data[i],
 996 |                 model,
 997 |                 tokenizer,
 998 |                 XQUAD.text_on_eval[i],
 999 |                 training_args.per_device_eval_batch_size,
1000 |             )
1001 |     elif data_args.datasets == "mlqa":
1002 |         for i, _ in enumerate(MLQA.langs):
1003 |             get_squad_evaluation(
1004 |                 MLQA.data[i],
1005 |                 model,
1006 |                 tokenizer,
1007 |                 MLQA.text_on_eval[i],
1008 |                 training_args.per_device_eval_batch_size,
1009 |             )
1010 | 
1011 |     elif data_args.datasets == "trivia_qa":
1012 |         pass
1013 | 
1014 |     else:
1015 |         print("Not a valid eval dataset...\n Exiting")
1016 | 
1017 | 
1018 | if __name__ == "__main__":
1019 |     main()
1020 | 


--------------------------------------------------------------------------------
/notebooks/Convert to Long.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import logging\n",
 10 |     "import os\n",
 11 |     "import math\n",
 12 |     "from dataclasses import dataclass, field\n",
 13 |     "\n",
 14 |     "import copy # for deep copy\n",
 15 |     "\n",
 16 |     "import torch\n",
 17 |     "from torch import nn\n",
 18 |     "from transformers import RobertaForMaskedLM, RobertaTokenizerFast, TextDataset, DataCollatorForLanguageModeling, Trainer\n",
 19 |     "from transformers import TrainingArguments, HfArgumentParser\n",
 20 |     "from transformers.modeling_longformer import LongformerSelfAttention\n",
 21 |     "\n",
 22 |     "logger = logging.getLogger(__name__)\n",
 23 |     "logging.basicConfig(level=logging.INFO)"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 22,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "class RobertaLongSelfAttention(LongformerSelfAttention):        \n",
 33 |     "    def forward(\n",
 34 |     "        self,\n",
 35 |     "        hidden_states, attention_mask=None, is_index_masked=None, is_index_global_attn=None, is_global_attn=None\n",
 36 |     "    ):\n",
 37 |     "        return super().forward(hidden_states, attention_mask=attention_mask)\n",
 38 |     "\n",
 39 |     "class RobertaLongForMaskedLM(RobertaForMaskedLM):\n",
 40 |     "    def __init__(self, config):\n",
 41 |     "        super().__init__(config)\n",
 42 |     "        for i, layer in enumerate(self.roberta.encoder.layer):\n",
 43 |     "            # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`\n",
 44 |     "            layer.attention.self = RobertaLongSelfAttention(config, layer_id=i)"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 23,
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "def create_long_model(save_model_to, attention_window, max_pos):\n",
 54 |     "    model = RobertaForMaskedLM.from_pretrained('roberta-base')\n",
 55 |     "    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=max_pos)\n",
 56 |     "    config = model.config\n",
 57 |     "\n",
 58 |     "    # extend position embeddings\n",
 59 |     "    tokenizer.model_max_length = max_pos\n",
 60 |     "    tokenizer.init_kwargs['model_max_length'] = max_pos\n",
 61 |     "    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape\n",
 62 |     "    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2\n",
 63 |     "    config.max_position_embeddings = max_pos\n",
 64 |     "    assert max_pos > current_max_pos\n",
 65 |     "    # allocate a larger position embedding matrix\n",
 66 |     "    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(max_pos, embed_size)\n",
 67 |     "    # copy position embeddings over and over to initialize the new position embeddings\n",
 68 |     "    k = 2\n",
 69 |     "    step = current_max_pos - 2\n",
 70 |     "    while k < max_pos - 1:\n",
 71 |     "        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]\n",
 72 |     "        k += step\n",
 73 |     "        \n",
 74 |     "    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed\n",
 75 |     "    model.roberta.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)\n",
 76 |     "    \"\"\"\n",
 77 |     "    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed    # add after this line\n",
 78 |     "    model.roberta.embeddings.position_embeddings.num_embeddings = len(new_pos_embed.data)\n",
 79 |     "    # first, check that model.roberta.embeddings.position_embeddings.weight.data.shape is correct — has to be 4096 (default) of your desired length\n",
 80 |     "    model.roberta.embeddings.position_ids = torch.arange(0, model.roberta.embeddings.position_embeddings.num_embeddings)[None]\n",
 81 |     "    \"\"\"\n",
 82 |     "    \n",
 83 |     "    \n",
 84 |     "    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`\n",
 85 |     "    config.attention_window = [attention_window] * config.num_hidden_layers\n",
 86 |     "    for i, layer in enumerate(model.roberta.encoder.layer):\n",
 87 |     "        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)\n",
 88 |     "        longformer_self_attn.query = copy.deepcopy(layer.attention.self.query)\n",
 89 |     "        longformer_self_attn.key = copy.deepcopy(layer.attention.self.key)\n",
 90 |     "        longformer_self_attn.value = copy.deepcopy(layer.attention.self.value)\n",
 91 |     "\n",
 92 |     "        longformer_self_attn.query_global = copy.deepcopy(layer.attention.self.query)\n",
 93 |     "        longformer_self_attn.key_global = copy.deepcopy(layer.attention.self.key)\n",
 94 |     "        longformer_self_attn.value_global = copy.deepcopy(layer.attention.self.value)\n",
 95 |     "\n",
 96 |     "        \"\"\"\n",
 97 |     "        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)\n",
 98 |     "        longformer_self_attn.query = layer.attention.self.query\n",
 99 |     "        longformer_self_attn.key = layer.attention.self.key\n",
100 |     "        longformer_self_attn.value = layer.attention.self.value\n",
101 |     "\n",
102 |     "        longformer_self_attn.query_global = layer.attention.self.query\n",
103 |     "        longformer_self_attn.key_global = layer.attention.self.key\n",
104 |     "        longformer_self_attn.value_global = layer.attention.self.value\n",
105 |     "        \"\"\"\n",
106 |     "\n",
107 |     "        layer.attention.self = longformer_self_attn\n",
108 |     "\n",
109 |     "    logger.info(f'saving model to {save_model_to}')\n",
110 |     "    model.save_pretrained(save_model_to)\n",
111 |     "    tokenizer.save_pretrained(save_model_to)\n",
112 |     "    return model, tokenizer"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 24,
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "def copy_proj_layers(model):\n",
122 |     "    for i, layer in enumerate(model.roberta.encoder.layer):\n",
123 |     "        layer.attention.self.query_global = layer.attention.self.query\n",
124 |     "        layer.attention.self.key_global = layer.attention.self.key\n",
125 |     "        layer.attention.self.value_global = layer.attention.self.value\n",
126 |     "    return model"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 25,
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):\n",
136 |     "    val_dataset = TextDataset(tokenizer=tokenizer,\n",
137 |     "                              file_path=args.val_datapath,\n",
138 |     "                              block_size=tokenizer.max_len)\n",
139 |     "    if eval_only:\n",
140 |     "        train_dataset = val_dataset\n",
141 |     "    else:\n",
142 |     "        logger.info(f'Loading and tokenizing training data is usually slow: {args.train_datapath}')\n",
143 |     "        train_dataset = TextDataset(tokenizer=tokenizer,\n",
144 |     "                                    file_path=args.train_datapath,\n",
145 |     "                                    block_size=tokenizer.max_len)\n",
146 |     "\n",
147 |     "    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)\n",
148 |     "    trainer = Trainer(model=model, args=args, data_collator=data_collator,\n",
149 |     "                      train_dataset=train_dataset, eval_dataset=val_dataset, prediction_loss_only=True)\n",
150 |     "\n",
151 |     "    eval_loss = trainer.evaluate()\n",
152 |     "    eval_loss = eval_loss['eval_loss']\n",
153 |     "    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')\n",
154 |     "    \n",
155 |     "    if not eval_only:\n",
156 |     "        trainer.train(model_path=model_path)\n",
157 |     "        trainer.save_model()\n",
158 |     "\n",
159 |     "        eval_loss = trainer.evaluate()\n",
160 |     "        eval_loss = eval_loss['eval_loss']\n",
161 |     "        logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": 26,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "@dataclass\n",
171 |     "class ModelArgs:\n",
172 |     "    attention_window: int = field(default=512, metadata={\"help\": \"Size of attention window\"})\n",
173 |     "    max_pos: int = field(default=4096, metadata={\"help\": \"Maximum position\"})\n",
174 |     "\n",
175 |     "parser = HfArgumentParser((TrainingArguments, ModelArgs,))\n",
176 |     "\n",
177 |     "\n",
178 |     "training_args, model_args = parser.parse_args_into_dataclasses(look_for_args_file=False, args=[\n",
179 |     "    '--output_dir', 'tmp',\n",
180 |     "    '--warmup_steps', '500',\n",
181 |     "    '--learning_rate', '0.00003',\n",
182 |     "    '--weight_decay', '0.01',\n",
183 |     "    '--adam_epsilon', '1e-6',\n",
184 |     "    '--max_steps', '3000',\n",
185 |     "    '--logging_steps', '500',\n",
186 |     "    '--save_steps', '500',\n",
187 |     "    '--max_grad_norm', '5.0',\n",
188 |     "    '--per_gpu_eval_batch_size', '8',\n",
189 |     "    '--per_gpu_train_batch_size', '2',  # 32GB gpu with fp32\n",
190 |     "    '--gradient_accumulation_steps', '32',\n",
191 |     "    '--evaluate_during_training',\n",
192 |     "    '--do_train',\n",
193 |     "    '--do_eval',\n",
194 |     "])\n",
195 |     "training_args.val_datapath = '/workspace/data/wikitext-103-raw/wiki.valid.raw'\n",
196 |     "training_args.train_datapath = '/workspace/data/wikitext-103-raw/wiki.train.raw'\n",
197 |     "\n",
198 |     "# Choose GPU\n",
199 |     "import os\n",
200 |     "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": 27,
206 |    "metadata": {},
207 |    "outputs": [
208 |     {
209 |      "name": "stderr",
210 |      "output_type": "stream",
211 |      "text": [
212 |       "Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']\n",
213 |       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
214 |       "INFO:__main__:Evaluating roberta-base (seqlen: 512) for refernece ...\n",
215 |       "INFO:filelock:Lock 140125418510600 acquired on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_510_wiki.valid.raw.lock\n",
216 |       "INFO:filelock:Lock 140125418510600 released on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_510_wiki.valid.raw.lock\n",
217 |       "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.\n"
218 |      ]
219 |     },
220 |     {
221 |      "data": {
222 |       "text/html": [
223 |        "\n",
224 |        "    <div>\n",
225 |        "        <style>\n",
226 |        "            /* Turns off some styling */\n",
227 |        "            progress {\n",
228 |        "                /* gets rid of default border in Firefox and Opera. */\n",
229 |        "                border: none;\n",
230 |        "                /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
231 |        "                background-size: auto;\n",
232 |        "            }\n",
233 |        "        </style>\n",
234 |        "      \n",
235 |        "      <progress value='62' max='62' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
236 |        "      [62/62 00:07]\n",
237 |        "    </div>\n",
238 |        "    "
239 |       ],
240 |       "text/plain": [
241 |        "<IPython.core.display.HTML object>"
242 |       ]
243 |      },
244 |      "metadata": {},
245 |      "output_type": "display_data"
246 |     },
247 |     {
248 |      "name": "stderr",
249 |      "output_type": "stream",
250 |      "text": [
251 |       "INFO:__main__:Initial eval bpc: 2.549888218283919\n"
252 |      ]
253 |     }
254 |    ],
255 |    "source": [
256 |     "roberta_base = RobertaForMaskedLM.from_pretrained('roberta-base')\n",
257 |     "roberta_base_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')\n",
258 |     "logger.info('Evaluating roberta-base (seqlen: 512) for refernece ...')\n",
259 |     "pretrain_and_evaluate(training_args, roberta_base, roberta_base_tokenizer, eval_only=True, model_path=None)"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": 28,
265 |    "metadata": {},
266 |    "outputs": [
267 |     {
268 |      "name": "stderr",
269 |      "output_type": "stream",
270 |      "text": [
271 |       "INFO:__main__:Converting roberta-base into roberta-base-4096\n",
272 |       "Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']\n",
273 |       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
274 |       "INFO:__main__:saving model to tmp/roberta-base-4096\n"
275 |      ]
276 |     }
277 |    ],
278 |    "source": [
279 |     "model_path = f'{training_args.output_dir}/roberta-base-{model_args.max_pos}'\n",
280 |     "if not os.path.exists(model_path):\n",
281 |     "    os.makedirs(model_path)\n",
282 |     "\n",
283 |     "logger.info(f'Converting roberta-base into roberta-base-{model_args.max_pos}')\n",
284 |     "model, tokenizer = create_long_model(\n",
285 |     "    save_model_to=model_path, attention_window=model_args.attention_window, max_pos=model_args.max_pos)"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 29,
291 |    "metadata": {},
292 |    "outputs": [
293 |     {
294 |      "data": {
295 |       "text/plain": [
296 |        "'\\nSelf =   \\n    \\n    (query): Linear(in_features=768, out_features=768, bias=True)\\n  (key): Linear(in_features=768, out_features=768, bias=True)\\n  (value): Linear(in_features=768, out_features=768, bias=True)\\n  (query_global): Linear(in_features=768, out_features=768, bias=True)\\n  (key_global): Linear(in_features=768, out_features=768, bias=True)\\n  (value_global): Linear(in_features=768, out_features=768, bias=True)\\n'"
297 |       ]
298 |      },
299 |      "execution_count": 29,
300 |      "metadata": {},
301 |      "output_type": "execute_result"
302 |     }
303 |    ],
304 |    "source": [
305 |     "\"\"\"\n",
306 |     "Self =   \n",
307 |     "    \n",
308 |     "    (query): Linear(in_features=768, out_features=768, bias=True)\n",
309 |     "  (key): Linear(in_features=768, out_features=768, bias=True)\n",
310 |     "  (value): Linear(in_features=768, out_features=768, bias=True)\n",
311 |     "  (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
312 |     "  (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
313 |     "  (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
314 |     "\"\"\""
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": 30,
320 |    "metadata": {},
321 |    "outputs": [
322 |     {
323 |      "name": "stderr",
324 |      "output_type": "stream",
325 |      "text": [
326 |       "INFO:__main__:Loading the model from tmp/roberta-base-4096\n"
327 |      ]
328 |     }
329 |    ],
330 |    "source": [
331 |     "logger.info(f'Loading the model from {model_path}')\n",
332 |     "tokenizer = RobertaTokenizerFast.from_pretrained(model_path)\n",
333 |     "model = RobertaLongForMaskedLM.from_pretrained(model_path)"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": 31,
339 |    "metadata": {},
340 |    "outputs": [
341 |     {
342 |      "name": "stderr",
343 |      "output_type": "stream",
344 |      "text": [
345 |       "INFO:__main__:Pretraining roberta-base-4096 ... \n",
346 |       "INFO:filelock:Lock 140124002609248 acquired on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_4094_wiki.valid.raw.lock\n",
347 |       "INFO:filelock:Lock 140124002609248 released on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_4094_wiki.valid.raw.lock\n",
348 |       "INFO:__main__:Loading and tokenizing training data is usually slow: /workspace/data/wikitext-103-raw/wiki.train.raw\n",
349 |       "INFO:filelock:Lock 140125403321344 acquired on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_4094_wiki.train.raw.lock\n",
350 |       "INFO:filelock:Lock 140125403321344 released on /workspace/data/wikitext-103-raw/cached_lm_RobertaTokenizerFast_4094_wiki.train.raw.lock\n",
351 |       "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.\n"
352 |      ]
353 |     },
354 |     {
355 |      "ename": "TypeError",
356 |      "evalue": "forward() takes from 2 to 6 positional arguments but 7 were given",
357 |      "output_type": "error",
358 |      "traceback": [
359 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
360 |       "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
361 |       "\u001b[0;32m<ipython-input-31-00334be0deca>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mtraining_args\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m3\u001b[0m   \u001b[0;31m## <<<<<<<<<<<<<<<<<<<<<<<< REMOVE THIS <<<<<<<<<<<<<<<<<<<<<<<<\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'magic'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mpretrain_and_evaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meval_only\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtraining_args\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
362 |       "\u001b[0;32m<ipython-input-25-5ab151aa4cdb>\u001b[0m in \u001b[0;36mpretrain_and_evaluate\u001b[0;34m(args, model, tokenizer, eval_only, model_path)\u001b[0m\n\u001b[1;32m     15\u001b[0m                       train_dataset=train_dataset, eval_dataset=val_dataset, prediction_loss_only=True)\n\u001b[1;32m     16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m     \u001b[0meval_loss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     18\u001b[0m     \u001b[0meval_loss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0meval_loss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'eval_loss'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     19\u001b[0m     \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'Initial eval bpc: {eval_loss/math.log(2)}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
363 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, eval_dataset)\u001b[0m\n\u001b[1;32m   1311\u001b[0m             \u001b[0;31m# No point gathering the predictions if there are no metrics, otherwise we defer to\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1312\u001b[0m             \u001b[0;31m# self.args.prediction_loss_only\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1313\u001b[0;31m             \u001b[0mprediction_loss_only\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_metrics\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1314\u001b[0m         )\n\u001b[1;32m   1315\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
364 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mprediction_loop\u001b[0;34m(self, dataloader, description, prediction_loss_only)\u001b[0m\n\u001b[1;32m   1415\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1416\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataloader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1417\u001b[0;31m             \u001b[0mloss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprediction_step\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprediction_loss_only\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1418\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1419\u001b[0m                 \u001b[0mlosses\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrepeat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
365 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mprediction_step\u001b[0;34m(self, model, inputs, prediction_loss_only)\u001b[0m\n\u001b[1;32m   1509\u001b[0m                     \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1510\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1511\u001b[0;31m                 \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1512\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mhas_labels\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1513\u001b[0m                 \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdetach\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
366 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m    725\u001b[0m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    726\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    728\u001b[0m         for hook in itertools.chain(\n\u001b[1;32m    729\u001b[0m                 \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
367 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/modeling_roberta.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, output_attentions, output_hidden_states, return_dict, **kwargs)\u001b[0m\n\u001b[1;32m    903\u001b[0m             \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    904\u001b[0m             \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 905\u001b[0;31m             \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    906\u001b[0m         )\n\u001b[1;32m    907\u001b[0m         \u001b[0msequence_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
368 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m    725\u001b[0m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    726\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    728\u001b[0m         for hook in itertools.chain(\n\u001b[1;32m    729\u001b[0m                 \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
369 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/modeling_roberta.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m    692\u001b[0m             \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    693\u001b[0m             \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 694\u001b[0;31m             \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    695\u001b[0m         )\n\u001b[1;32m    696\u001b[0m         \u001b[0msequence_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mencoder_outputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
370 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m    725\u001b[0m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    726\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    728\u001b[0m         for hook in itertools.chain(\n\u001b[1;32m    729\u001b[0m                 \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
371 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/modeling_roberta.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m    425\u001b[0m                     \u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    426\u001b[0m                     \u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m                     \u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    428\u001b[0m                 )\n\u001b[1;32m    429\u001b[0m             \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlayer_outputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
372 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m    725\u001b[0m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    726\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    728\u001b[0m         for hook in itertools.chain(\n\u001b[1;32m    729\u001b[0m                 \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
373 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/modeling_roberta.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions)\u001b[0m\n\u001b[1;32m    343\u001b[0m             \u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    344\u001b[0m             \u001b[0mhead_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 345\u001b[0;31m             \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    346\u001b[0m         )\n\u001b[1;32m    347\u001b[0m         \u001b[0mattention_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself_attention_outputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
374 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m    725\u001b[0m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    726\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    728\u001b[0m         for hook in itertools.chain(\n\u001b[1;32m    729\u001b[0m                 \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
375 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/transformers/modeling_roberta.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions)\u001b[0m\n\u001b[1;32m    277\u001b[0m             \u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    278\u001b[0m             \u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 279\u001b[0;31m             \u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    280\u001b[0m         )\n\u001b[1;32m    281\u001b[0m         \u001b[0mattention_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself_outputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
376 |       "\u001b[0;32m~usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m    725\u001b[0m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    726\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    728\u001b[0m         for hook in itertools.chain(\n\u001b[1;32m    729\u001b[0m                 \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
377 |       "\u001b[0;31mTypeError\u001b[0m: forward() takes from 2 to 6 positional arguments but 7 were given"
378 |      ]
379 |     }
380 |    ],
381 |    "source": [
382 |     "logger.info(f'Pretraining roberta-base-{model_args.max_pos} ... ')\n",
383 |     "\n",
384 |     "training_args.max_steps = 3   ## <<<<<<<<<<<<<<<<<<<<<<<< REMOVE THIS <<<<<<<<<<<<<<<<<<<<<<<<\n",
385 |     "%magic\n",
386 |     "pretrain_and_evaluate(training_args, model, tokenizer, eval_only=False, model_path=training_args.output_dir)"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": null,
392 |    "metadata": {},
393 |    "outputs": [],
394 |    "source": [
395 |     "logger.info(f'Copying local projection layers into global projection layers ... ')\n",
396 |     "model = copy_proj_layers(model)\n",
397 |     "logger.info(f'Saving model to {model_path}')\n",
398 |     "model.save_pretrained(model_path)\n"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": null,
404 |    "metadata": {},
405 |    "outputs": [],
406 |    "source": [
407 |     "logger.info(f'Loading the model from {model_path}')\n",
408 |     "tokenizer = RobertaTokenizerFast.from_pretrained(model_path)\n",
409 |     "model = RobertaLongForMaskedLM.from_pretrained(model_path)"
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": null,
415 |    "metadata": {},
416 |    "outputs": [],
417 |    "source": []
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": null,
422 |    "metadata": {},
423 |    "outputs": [],
424 |    "source": [
425 |     "import transformers"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": null,
431 |    "metadata": {},
432 |    "outputs": [],
433 |    "source": [
434 |     "transformers.__version__"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": null,
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "model.roberta.embeddings"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "code",
448 |    "execution_count": null,
449 |    "metadata": {},
450 |    "outputs": [],
451 |    "source": [
452 |     "model.roberta.embeddings.position_embeddings"
453 |    ]
454 |   },
455 |   {
456 |    "cell_type": "code",
457 |    "execution_count": null,
458 |    "metadata": {},
459 |    "outputs": [],
460 |    "source": [
461 |     "model.roberta.embeddings.position_embeddings.num_embeddings"
462 |    ]
463 |   },
464 |   {
465 |    "cell_type": "code",
466 |    "execution_count": null,
467 |    "metadata": {},
468 |    "outputs": [],
469 |    "source": [
470 |     "model.roberta.embeddings.position_embeddings.num_embeddings"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "code",
475 |    "execution_count": null,
476 |    "metadata": {},
477 |    "outputs": [],
478 |    "source": [
479 |     "torch.cop"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": null,
485 |    "metadata": {},
486 |    "outputs": [],
487 |    "source": []
488 |   }
489 |  ],
490 |  "metadata": {
491 |   "kernelspec": {
492 |    "display_name": "Python 3",
493 |    "language": "python",
494 |    "name": "python3"
495 |   },
496 |   "language_info": {
497 |    "codemirror_mode": {
498 |     "name": "ipython",
499 |     "version": 3
500 |    },
501 |    "file_extension": ".py",
502 |    "mimetype": "text/x-python",
503 |    "name": "python",
504 |    "nbconvert_exporter": "python",
505 |    "pygments_lexer": "ipython3",
506 |    "version": "3.6.9"
507 |   }
508 |  },
509 |  "nbformat": 4,
510 |  "nbformat_minor": 4
511 | }
512 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
   1 | 
   2 | # Multilingual Longformer
   3 | Master thesis work for investigating if and how well multilingual models in low-resource languages (such as Swedish) can incorporate longer contexts without re-training the models from scratch on long-context datasets in each language. The goal was to investigate if a multilingual model, such as XLM-R, could be extended into a Longformer model and then pre-train only on English, while still having a long-context model in several languages.    
   4 |    
   5 | The script provided includes the necessary steps to reproduce the result presented in the master thesis. We convert pre-trained monolingual and multilingual language models on English to Longformer models to a maximum model length of 4096.
   6 | 
   7 | **We call the pre-trained models using the Longformer pre-training**:  
   8 | 1. RoBERTa-Long  
   9 | 2. XLM-Long      (weights and config are available on Huggingface [here](https://huggingface.co/markussagen/xlm-roberta-longformer-base-4096))
  10 | 
  11 | Based on a RoBERTa and XLM-R model that has been pre-trained using the Longformer pre-training scheme.   
  12 | 
  13 | Training of all models are done through docker containers for reproducability  
  14 | 
  15 | ## Usage and Setup  
  16 | Example of how to build, start, run and shutdown the docker container and the training script  
  17 | If you encounter problems, toggle the `Technical Requirement` and `Pre-Requisites` links to verify that you have a sufficiently large GPU and the pre-requisite applications/libraries installed.  
  18 | 
  19 | <details><summary><b>Technical Requirements</b></summary>
  20 | <p>
  21 | **Please Note**:
  22 | Running the following project is quite computationally expensive. It is required to have a Docker container with at least 90GB of RAM allocated for the pre-training and a CUDA enabled GPU with 48GB of memory!    
  23 |      
  24 | For the Fine-tuning on QA tasks, 32GB of RAM is sufficient and a smaller GPU can be used when fine-tuning on regular or multilingual SQuAD. However, for the datasets created with a longer context, it requires at least 32GB of RAM    
  25 | </p>
  26 | </details>
  27 | 
  28 | 
  29 | <details><summary><b>Pre-Requisites</b></summary>
  30 | <p>
  31 | The following applications and libraries needs to be installed in order to run the application
  32 | - [Docker](https://docs.docker.com/get-docker/)  
  33 | - [Docker Compose](https://docs.docker.com/compose/install/)  
  34 | - Miniconda or Anaconda with Python3  
  35 | - make (terminal command)  
  36 | - wget (terminal command)  
  37 | - unzip (terminal command)  
  38 | - tmux (terminal command)  
  39 | - CUDA enabled GPU (check if set up correctly by entering `nvidia-smi` in your terminal)  
  40 | - [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html) installed and linked to your Docker container (Needed if encountering error: ```ERROR: for XXX_markussagen_repl1  Cannot create container for service repl: Unknown runtime specified nvidia```)   
  41 | </p>
  42 | </details>
  43 | 
  44 | 
  45 | 1. **Download the repo**   
  46 |     
  47 |         git@github.com:MarkusSagen/Master-Thesis-Multilingual-Longformer.git   
  48 |         cp .env.template .env
  49 |         
  50 | 2.  **Download the dataset**   
  51 |     Unzip the dataset and then place it in a suitable location
  52 |     
  53 |         wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
  54 |         unzip wikitext-103-raw-v1.zip   
  55 | 
  56 | 3.  **Change your model and dataset paths**   
  57 |     Open the `.env` file and change the `DATA_DIR` and the `MODEL_DIR` to the relative path to where you have you want your models stored and where you downloaded the dataset. Make sure that the folders you set exist on your system.   
  58 |     For instance:  
  59 |     
  60 |         DATA_DIR=/Users/admin/data/wikitext-103-raw
  61 |         MODEL_DIR=/Users/admin/model
  62 | 4.  **Start the docker container** 
  63 |     
  64 |         make build && make up
  65 | 5.  **Start tmux**  
  66 |     In your terminal start tmux. This will ensure that your runs are not stopped if you are disconnected from an ssh connection  
  67 |     
  68 |         tmux
  69 | 6.  **Run the script**   
  70 |     Here is an example of how a training script might look like for pre-training a XLM-R model into a Longformer. The general format follows the parameters of [Huggingface Transformer's TrainingArgument](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments). 
  71 |     
  72 |         export SEED=42
  73 |         export MAX_LENGTH=4096
  74 |         export MODEL_NAME_OR_PATH=xlm-roberta-base
  75 |         export MODEL_NAME=xlm-roberta-to-longformer
  76 |         export MODEL_DIR=/workspace/models
  77 |         export DATA_DIR=/workspace/data
  78 |         export LOG_DIR=/workspace/logs
  79 |         
  80 |         make repl run="scripts/run_long_lm.py \
  81 |             --model_name_or_path $MODEL_NAME_OR_PATH \
  82 |             --model_name $MODEL_NAME \
  83 |             --output_dir $MODEL_DIR/$MODEL_NAME \
  84 |             --logging_dir $LOG_DIR/$MODEL_NAME \
  85 |             --val_file_path $DATA_DIR/wiki.valid.raw \
  86 |             --train_file_path $DATA_DIR/wiki.train.raw \
  87 |             --seed $SEED \
  88 |             --model_max_length $MAX_LENGTH \
  89 |             --adam_epsilon 1e-8 \
  90 |             --warmup_steps 500 \
  91 |             --learning_rate 3e-5 \
  92 |             --weight_decay 0.01 \
  93 |             --max_steps 6000 \
  94 |             --evaluate_during_training \
  95 |             --logging_steps 50 \
  96 |             --eval_steps 50 \
  97 |             --save_steps 6000  \
  98 |             --max_grad_norm 1.0 \
  99 |             --per_device_eval_batch_size 2 \
 100 |             --per_device_train_batch_size 1 \
 101 |             --gradient_accumulation_steps 64 \
 102 |             --overwrite_output_dir \
 103 |             --fp16 \
 104 |             --do_train \
 105 |             --do_eval
 106 |         "
 107 | 7.  **Shutdown run and container** 
 108 |     
 109 |         make down
 110 | 
 111 | 8. **(Optional) terminate tmux**   
 112 |    
 113 |         exit   
 114 |             
 115 | ## Training and Evaluation in-depth    
 116 | The training of these models were done in two steps:   
 117 | 1. Pre-train `RoBERTa-base` and `XLM-R-base` models into Longformer models   
 118 | 2. Fine-tune regular RoBERTa and XLM-R models on SQuAD formated dataset. Compare the results of these with our Longformer trained models and a Longformer model released by the Longformer authors. We train these models with multiple different seeds, datasets and context length.   
 119 |     
 120 | We have grouped each model trained and evaluated based on:    
 121 | - The dataset and language used for each model    
 122 | - Then based on what model that were trained    
 123 | 
 124 | ## Pre-train: Transfer Long-Context of Language Models   
 125 | 
 126 | The models were trained according to this structure   
 127 | 
 128 |     English Pre-training
 129 |     |-- Wikitext-103
 130 |         |-- RoBERTa-Long (4096)
 131 |         |-- XLM-Long (4096)
 132 | 
 133 | 
 134 | 
 135 | Each fine-tuning are grouped based on the dataset, language and context length and then evaluated for each model.  For more in-depth explanation of the pre-training script and parameters, see [Here](Pretraining_Details.md).    
 136 | 
 137 | <details><summary><b>Runs:</b></summary>
 138 | <p>   
 139 | 
 140 | <details><summary><b>Wikitext-103</b></summary>
 141 | <p>   
 142 | 
 143 | ##### RoBERTa   
 144 | 
 145 |     export SEED=42
 146 |     export MAX_LENGTH=4096
 147 |     export MODEL_DIR=/workspace/models
 148 |     export MODEL_NAME_OR_PATH=roberta-base
 149 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-long
 150 |     export DATA_DIR=/workspace/data
 151 |     export LOG_DIR=/workspace/logs
 152 |     
 153 |     make repl run="scripts/run_long_lm.py \
 154 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 155 |         --model_name $MODEL_NAME \
 156 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 157 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 158 |         --val_file_path $DATA_DIR/wiki.valid.raw \
 159 |         --train_file_path $DATA_DIR/wiki.train.raw \
 160 |         --seed $SEED \
 161 |         --model_max_length $MAX_LENGTH \
 162 |         --adam_epsilon 1e-8 \
 163 |         --warmup_steps 500 \
 164 |         --learning_rate 3e-5 \
 165 |         --weight_decay 0.01 \
 166 |         --max_steps 6000 \
 167 |         --evaluate_during_training \
 168 |         --logging_steps 50 \
 169 |         --eval_steps 50 \
 170 |         --save_steps 500  \
 171 |         --max_grad_norm 1.0 \
 172 |         --per_device_eval_batch_size 2 \
 173 |         --per_device_train_batch_size 1 \
 174 |         --gradient_accumulation_steps 64 \
 175 |         --overwrite_output_dir \
 176 |         --fp16 \
 177 |         --do_train \
 178 |         --do_eval
 179 |     "
 180 | 
 181 | 
 182 | ##### XLM-R   
 183 | 
 184 |     export SEED=42
 185 |     export MAX_LENGTH=4096
 186 |     export MODEL_DIR=/workspace/models
 187 |     export MODEL_NAME_OR_PATH=xlm-roberta-base
 188 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-long
 189 |     export DATA_DIR=/workspace/data
 190 |     export LOG_DIR=/workspace/logs
 191 |     
 192 |     make repl run="scripts/run_long_lm.py \
 193 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 194 |         --model_name $MODEL_NAME \
 195 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 196 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 197 |         --val_file_path $DATA_DIR/wiki.valid.raw \
 198 |         --train_file_path $DATA_DIR/wiki.train.raw \
 199 |         --seed $SEED \
 200 |         --model_max_length $MAX_LENGTH \
 201 |         --adam_epsilon 1e-8 \
 202 |         --warmup_steps 500 \
 203 |         --learning_rate 3e-5 \
 204 |         --weight_decay 0.01 \
 205 |         --max_steps 6000 \
 206 |         --evaluate_during_training \
 207 |         --logging_steps 50 \
 208 |         --eval_steps 50 \
 209 |         --save_steps 500  \
 210 |         --max_grad_norm 1.0 \
 211 |         --per_device_eval_batch_size 2 \
 212 |         --per_device_train_batch_size 1 \
 213 |         --gradient_accumulation_steps 64 \
 214 |         --overwrite_output_dir \
 215 |         --fp16 \
 216 |         --do_train \
 217 |         --do_eval
 218 |     "
 219 | 
 220 | </p>
 221 | </details>
 222 | 
 223 | </p>
 224 | </details>   
 225 | 
 226 | 
 227 | ## Fine-Tune on Question Answering Tasks    
 228 | 
 229 |     
 230 |     English QA Fine-Tuning:
 231 |     |-- SQuAD
 232 |         |-- RoBERTa (512)
 233 |         |-- Longformer (4096)
 234 |         |-- RoBERTa-Long (4096)
 235 |         |-- XLM-R (512)
 236 |         |-- XLM-Long (4096)
 237 |     |-- SQ3 (512)
 238 |         |-- RoBERTa (512)
 239 |         |-- Longformer (4096)
 240 |         |-- RoBERTa-Long (4096)
 241 |         |-- XLM-R (512)
 242 |         |-- XLM-Long (4096)
 243 |     |-- SQ3 (2048)
 244 |         |-- RoBERTa (512)
 245 |         |-- Longformer (4096)
 246 |         |-- RoBERTa-Long (4096)
 247 |         |-- XLM-R (512)
 248 |         |-- XLM-Long (4096)
 249 |     |-- TriviaQA = TODO
 250 |     
 251 |     Multilingual QA Fine-Tuning:
 252 |     |-- XQuAD
 253 |         |-- RoBERTa (512)
 254 |         |-- XLM-R (512)
 255 |         |-- XLM-Long (4096)
 256 |     |-- XQ3 (512)
 257 |         |-- XLM-R (512)
 258 |         |-- XLM-Long (4096)
 259 |     |-- XQ3 (2048)
 260 |         |-- XLM-R (512)
 261 |         |-- XLM-Long (4096)
 262 |     |-- MLQA
 263 |         |-- XLM-R (512)
 264 |         |-- XLM-Long (4096)
 265 | 
 266 | 
 267 | We fine-tune the models on SQuAD-formated extractive question-answer datasets in English and multiple other languages. We also create a concatenated dataset with longer context for both the SQuAD and XQuAD (multilingual SQuAD). The datasets are provided through Huggingface's Datasets library.   
 268 | 
 269 | For more more in-depth information regarding how the fine-tuning scripts, parameters and evaluation setup, see [Here](Finetuning_Details.md)     
 270 | 
 271 | <details><summary><b>Runs:</b></summary>
 272 | <p>   
 273 | 
 274 | Each fine-tuning are grouped based on the dataset, language and context length and then evaluated for each model.   
 275 | 
 276 | ### English    
 277 | 
 278 | <details><summary><b>SQuAD</b></summary>   
 279 | <p>  
 280 |   
 281 | ##### RoBERTa   
 282 | 
 283 |     export SEED=42
 284 |     export DATASET=squad
 285 |     export MODEL_DIR=/workspace/models
 286 |     export MODEL_NAME_OR_PATH=roberta-base
 287 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 288 |     export LOG_DIR=/workspace/logs
 289 |     export DATA_DIR=/workspace/data
 290 |     # Debugging
 291 |     CUDA_LAUNCH_BLOCKING=1
 292 |     # model args
 293 |     make repl run="scripts/finetune_qa_models.py \
 294 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 295 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 296 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 297 |         --dataset $DATASET \
 298 |         --data_dir $DATA_DIR \
 299 |         --seed $SEED \
 300 |         --num_train_epochs 3 \
 301 |         --learning_rate 3e-5 \
 302 |         --logging_steps 50 \
 303 |         --eval_steps 50 \
 304 |         --save_steps 1000  \
 305 |         --per_device_train_batch_size 4 \
 306 |         --per_device_eval_batch_size 32  \
 307 |         --gradient_accumulation_steps 8 \
 308 |         --overwrite_output_dir \
 309 |         --evaluate_during_training \
 310 |         --fp16 \
 311 |         --do_train \
 312 |         --do_eval \
 313 |         --do_lowercase \
 314 |         --max_length 512 \
 315 |     "
 316 | 
 317 |    
 318 | ##### Longformer   
 319 | 
 320 |     export SEED=42
 321 |     export DATASET=squad
 322 |     export MODEL_DIR=/workspace/models
 323 |     export MODEL_NAME_OR_PATH=allenai/longformer-base-4096
 324 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 325 |     export LOG_DIR=/workspace/logs
 326 |     export DATA_DIR=/workspace/data
 327 |     # Debugging
 328 |     CUDA_LAUNCH_BLOCKING=1
 329 |     # model args
 330 |     make repl run="scripts/finetune_qa_models.py \
 331 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 332 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 333 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 334 |         --dataset $DATASET \
 335 |         --data_dir $DATA_DIR \
 336 |         --seed $SEED \
 337 |         --num_train_epochs 3 \
 338 |         --learning_rate 3e-5 \
 339 |         --logging_steps 50 \
 340 |         --eval_steps 50 \
 341 |         --save_steps 1000  \
 342 |         --per_device_train_batch_size 4 \
 343 |         --per_device_eval_batch_size 32  \
 344 |         --gradient_accumulation_steps 8 \
 345 |         --overwrite_output_dir \
 346 |         --evaluate_during_training \
 347 |         --fp16 \
 348 |         --do_train \
 349 |         --do_eval \
 350 |         --do_lowercase \
 351 |         --max_length 512 \
 352 |     "
 353 | 
 354 | 
 355 | ##### RoBERTa-Long  
 356 | 
 357 |     export SEED=42
 358 |     export DATASET=squad
 359 |     export MODEL_DIR=/workspace/models
 360 |     export MODEL_NAME_OR_PATH=$MODEL_DIR/roberta-base-long
 361 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 362 |     export LOG_DIR=/workspace/logs
 363 |     export DATA_DIR=/workspace/data
 364 |     # Debugging
 365 |     CUDA_LAUNCH_BLOCKING=1
 366 |     # model args
 367 |     make repl run="scripts/finetune_qa_models.py \
 368 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 369 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 370 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 371 |         --dataset $DATASET \
 372 |         --data_dir $DATA_DIR \
 373 |         --seed $SEED \
 374 |         --num_train_epochs 3 \
 375 |         --learning_rate 3e-5 \
 376 |         --logging_steps 50 \
 377 |         --eval_steps 50 \
 378 |         --save_steps 1000  \
 379 |         --per_device_train_batch_size 4 \
 380 |         --per_device_eval_batch_size 32  \
 381 |         --gradient_accumulation_steps 8 \
 382 |         --overwrite_output_dir \
 383 |         --evaluate_during_training \
 384 |         --fp16 \
 385 |         --do_train \
 386 |         --do_eval \
 387 |         --do_lowercase \
 388 |         --max_length 512 \
 389 |     "
 390 | 
 391 | 
 392 | ##### XLM-R   
 393 | 
 394 |     export SEED=42
 395 |     export DATASET=squad
 396 |     export MODEL_DIR=/workspace/models
 397 |     export MODEL_NAME_OR_PATH=xlm-roberta-base
 398 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 399 |     export LOG_DIR=/workspace/logs
 400 |     export DATA_DIR=/workspace/data
 401 |     # Debugging
 402 |     CUDA_LAUNCH_BLOCKING=1
 403 |     # model args
 404 |     make repl run="scripts/finetune_qa_models.py \
 405 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 406 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 407 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 408 |         --dataset $DATASET \
 409 |         --data_dir $DATA_DIR \
 410 |         --seed $SEED \
 411 |         --num_train_epochs 3 \
 412 |         --learning_rate 3e-5 \
 413 |         --logging_steps 50 \
 414 |         --eval_steps 50 \
 415 |         --save_steps 1000  \
 416 |         --per_device_train_batch_size 4 \
 417 |         --per_device_eval_batch_size 32  \
 418 |         --gradient_accumulation_steps 8 \
 419 |         --overwrite_output_dir \
 420 |         --evaluate_during_training \
 421 |         --fp16 \
 422 |         --do_train \
 423 |         --do_eval \
 424 |         --do_lowercase \
 425 |         --max_length 512 \
 426 |     "
 427 | 
 428 | 
 429 | ##### XLM-Long   
 430 | 
 431 |     export SEED=42
 432 |     export DATASET=squad
 433 |     export MODEL_DIR=/workspace/models
 434 |     export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
 435 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 436 |     export LOG_DIR=/workspace/logs
 437 |     export DATA_DIR=/workspace/data
 438 |     # Debugging
 439 |     CUDA_LAUNCH_BLOCKING=1
 440 |     # model args
 441 |     make repl run="scripts/finetune_qa_models.py \
 442 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 443 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 444 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 445 |         --dataset $DATASET \
 446 |         --data_dir $DATA_DIR \
 447 |         --seed $SEED \
 448 |         --num_train_epochs 3 \
 449 |         --learning_rate 3e-5 \
 450 |         --logging_steps 50 \
 451 |         --eval_steps 50 \
 452 |         --save_steps 1000  \
 453 |         --per_device_train_batch_size 4 \
 454 |         --per_device_eval_batch_size 32  \
 455 |         --gradient_accumulation_steps 8 \
 456 |         --overwrite_output_dir \
 457 |         --evaluate_during_training \
 458 |         --fp16 \
 459 |         --do_train \
 460 |         --do_eval \
 461 |         --do_lowercase \
 462 |         --max_length 512 \
 463 |     "
 464 | 
 465 | 
 466 | </p>
 467 | </details>
 468 | 
 469 | <details><summary><b>SQ3 (512)</b></summary>
 470 | <p>   
 471 |   
 472 | ##### RoBERTa   
 473 | 
 474 |     export SEED=42
 475 |     export MAX_LENGTH=512
 476 |     export NR_CONCATS=1
 477 |     export DATASET=squad_long
 478 |     export MODEL_DIR=/workspace/models
 479 |     export MODEL_NAME_OR_PATH=roberta-base
 480 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 481 |     export LOG_DIR=/workspace/logs
 482 |     export DATA_DIR=/workspace/data
 483 |     # Debugging
 484 |     CUDA_LAUNCH_BLOCKING=1
 485 |     # model args
 486 |     make repl run="scripts/finetune_qa_models.py \
 487 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 488 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 489 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 490 |         --dataset $DATASET \
 491 |         --data_dir $DATA_DIR \
 492 |         --seed $SEED \
 493 |         --num_train_epochs 3 \
 494 |         --learning_rate 3e-5 \
 495 |         --logging_steps 50 \
 496 |         --eval_steps 50 \
 497 |         --save_steps 1000  \
 498 |         --per_device_train_batch_size 4 \
 499 |         --per_device_eval_batch_size 32  \
 500 |         --gradient_accumulation_steps 8 \
 501 |         --overwrite_output_dir \
 502 |         --evaluate_during_training \
 503 |         --fp16 \
 504 |         --do_train \
 505 |         --do_eval \
 506 |         --do_lowercase \
 507 |         --nr_concats $NR_CONCATS \
 508 |         --max_length $MAX_LENGTH \
 509 |     "
 510 | 
 511 | ##### Longformer   
 512 | 
 513 |     export SEED=42
 514 |     export MAX_LENGTH=512
 515 |     export NR_CONCATS=1
 516 |     export DATASET=squad_long
 517 |     export MODEL_DIR=/workspace/models
 518 |     export MODEL_NAME_OR_PATH=allenai/longformer-base-4096
 519 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 520 |     export LOG_DIR=/workspace/logs
 521 |     export DATA_DIR=/workspace/data
 522 |     # Debugging
 523 |     CUDA_LAUNCH_BLOCKING=1
 524 |     # model args
 525 |     make repl run="scripts/finetune_qa_models.py \
 526 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 527 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 528 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 529 |         --dataset $DATASET \
 530 |         --data_dir $DATA_DIR \
 531 |         --seed $SEED \
 532 |         --num_train_epochs 3 \
 533 |         --learning_rate 3e-5 \
 534 |         --logging_steps 50 \
 535 |         --eval_steps 50 \
 536 |         --save_steps 1000  \
 537 |         --per_device_train_batch_size 4 \
 538 |         --per_device_eval_batch_size 32  \
 539 |         --gradient_accumulation_steps 8 \
 540 |         --overwrite_output_dir \
 541 |         --evaluate_during_training \
 542 |         --fp16 \
 543 |         --do_train \
 544 |         --do_eval \
 545 |         --do_lowercase \
 546 |         --nr_concats $NR_CONCATS \
 547 |         --max_length $MAX_LENGTH \
 548 |     "
 549 | 
 550 | 
 551 | ##### RoBERTa-Long   
 552 | 
 553 |     export SEED=42
 554 |     export MAX_LENGTH=512
 555 |     export NR_CONCATS=1
 556 |     export DATASET=squad_long
 557 |     export MODEL_DIR=/workspace/models
 558 |     export MODEL_NAME_OR_PATH=$MODEL_DIR/roberta-base-long
 559 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 560 |     export LOG_DIR=/workspace/logs
 561 |     export DATA_DIR=/workspace/data
 562 |     # Debugging
 563 |     CUDA_LAUNCH_BLOCKING=1
 564 |     # model args
 565 |     make repl run="scripts/finetune_qa_models.py \
 566 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 567 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 568 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 569 |         --dataset $DATASET \
 570 |         --data_dir $DATA_DIR \
 571 |         --seed $SEED \
 572 |         --num_train_epochs 3 \
 573 |         --learning_rate 3e-5 \
 574 |         --logging_steps 50 \
 575 |         --eval_steps 50 \
 576 |         --save_steps 1000  \
 577 |         --per_device_train_batch_size 4 \
 578 |         --per_device_eval_batch_size 32  \
 579 |         --gradient_accumulation_steps 8 \
 580 |         --overwrite_output_dir \
 581 |         --evaluate_during_training \
 582 |         --fp16 \
 583 |         --do_train \
 584 |         --do_eval \
 585 |         --do_lowercase \
 586 |         --nr_concats $NR_CONCATS \
 587 |         --max_length $MAX_LENGTH \
 588 |     "
 589 | 
 590 | ##### XLM-R   
 591 | 
 592 |     export SEED=42
 593 |     export MAX_LENGTH=512
 594 |     export NR_CONCATS=1
 595 |     export DATASET=squad_long
 596 |     export MODEL_DIR=/workspace/models
 597 |     export MODEL_NAME_OR_PATH=xlm-roberta-base
 598 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 599 |     export LOG_DIR=/workspace/logs
 600 |     export DATA_DIR=/workspace/data
 601 |     # Debugging
 602 |     CUDA_LAUNCH_BLOCKING=1
 603 |     # model args
 604 |     make repl run="scripts/finetune_qa_models.py \
 605 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 606 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 607 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 608 |         --dataset $DATASET \
 609 |         --data_dir $DATA_DIR \
 610 |         --seed $SEED \
 611 |         --num_train_epochs 3 \
 612 |         --learning_rate 3e-5 \
 613 |         --logging_steps 50 \
 614 |         --eval_steps 50 \
 615 |         --save_steps 1000  \
 616 |         --per_device_train_batch_size 4 \
 617 |         --per_device_eval_batch_size 32  \
 618 |         --gradient_accumulation_steps 8 \
 619 |         --overwrite_output_dir \
 620 |         --evaluate_during_training \
 621 |         --fp16 \
 622 |         --do_train \
 623 |         --do_eval \
 624 |         --do_lowercase \
 625 |         --nr_concats $NR_CONCATS \
 626 |         --max_length $MAX_LENGTH \
 627 |     "
 628 | 
 629 | ##### XLM-Long  
 630 |    
 631 |     export SEED=42
 632 |     export MAX_LENGTH=512
 633 |     export NR_CONCATS=1
 634 |     export DATASET=squad_long
 635 |     export MODEL_DIR=/workspace/models
 636 |     export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
 637 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 638 |     export LOG_DIR=/workspace/logs
 639 |     export DATA_DIR=/workspace/data
 640 |     # Debugging
 641 |     CUDA_LAUNCH_BLOCKING=1
 642 |     # model args
 643 |     make repl run="scripts/finetune_qa_models.py \
 644 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 645 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 646 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 647 |         --dataset $DATASET \
 648 |         --data_dir $DATA_DIR \
 649 |         --seed $SEED \
 650 |         --num_train_epochs 3 \
 651 |         --learning_rate 3e-5 \
 652 |         --logging_steps 50 \
 653 |         --eval_steps 50 \
 654 |         --save_steps 1000  \
 655 |         --per_device_train_batch_size 4 \
 656 |         --per_device_eval_batch_size 32  \
 657 |         --gradient_accumulation_steps 8 \
 658 |         --overwrite_output_dir \
 659 |         --evaluate_during_training \
 660 |         --fp16 \
 661 |         --do_train \
 662 |         --do_eval \
 663 |         --do_lowercase \
 664 |         --nr_concats $NR_CONCATS \
 665 |         --max_length $MAX_LENGTH \
 666 |     "
 667 | 
 668 | 
 669 | </p>
 670 | </details>
 671 | 
 672 | <details><summary><b>SQ3 (4096)</b></summary>
 673 | <p>   
 674 | 
 675 | ##### Longformer   
 676 | 
 677 |     export SEED=42
 678 |     export MAX_LENGTH=2048
 679 |     export NR_CONCATS=3
 680 |     export DATASET=squad_long
 681 |     export MODEL_DIR=/workspace/models
 682 |     export MODEL_NAME_OR_PATH=allenai/longformer-base-4096
 683 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 684 |     export LOG_DIR=/workspace/logs
 685 |     export DATA_DIR=/workspace/data
 686 |     # Debugging
 687 |     CUDA_LAUNCH_BLOCKING=1
 688 |     # model args
 689 |     make repl run="scripts/finetune_qa_models.py \
 690 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 691 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 692 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 693 |         --dataset $DATASET \
 694 |         --data_dir $DATA_DIR \
 695 |         --seed $SEED \
 696 |         --num_train_epochs 3 \
 697 |         --learning_rate 3e-5 \
 698 |         --logging_steps 50 \
 699 |         --eval_steps 50 \
 700 |         --save_steps 1000  \
 701 |         --per_device_train_batch_size 1 \
 702 |         --per_device_eval_batch_size 32  \
 703 |         --gradient_accumulation_steps 32 \
 704 |         --overwrite_output_dir \
 705 |         --evaluate_during_training \
 706 |         --fp16 \
 707 |         --do_train \
 708 |         --do_eval \
 709 |         --do_lowercase \
 710 |         --nr_concats $NR_CONCATS \
 711 |         --max_length $MAX_LENGTH \
 712 |     "
 713 | 
 714 | 
 715 | ##### RoBERTa-Long   
 716 | 
 717 |     export SEED=42
 718 |     export MAX_LENGTH=2048
 719 |     export NR_CONCATS=3
 720 |     export DATASET=squad_long
 721 |     export MODEL_DIR=/workspace/models
 722 |     export MODEL_NAME_OR_PATH=$MODEL_DIR/roberta-base-long
 723 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 724 |     export LOG_DIR=/workspace/logs
 725 |     export DATA_DIR=/workspace/data
 726 |     # Debugging
 727 |     CUDA_LAUNCH_BLOCKING=1
 728 |     # model args
 729 |     make repl run="scripts/finetune_qa_models.py \
 730 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 731 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 732 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 733 |         --dataset $DATASET \
 734 |         --data_dir $DATA_DIR \
 735 |         --seed $SEED \
 736 |         --num_train_epochs 3 \
 737 |         --learning_rate 3e-5 \
 738 |         --logging_steps 50 \
 739 |         --eval_steps 50 \
 740 |         --save_steps 1000  \
 741 |         --per_device_train_batch_size 1 \
 742 |         --per_device_eval_batch_size 32  \
 743 |         --gradient_accumulation_steps 32 \
 744 |         --overwrite_output_dir \
 745 |         --evaluate_during_training \
 746 |         --fp16 \
 747 |         --do_train \
 748 |         --do_eval \
 749 |         --do_lowercase \
 750 |         --nr_concats $NR_CONCATS \
 751 |         --max_length $MAX_LENGTH \
 752 |     "
 753 | 
 754 | 
 755 | ##### XLM-Long  
 756 | 
 757 |     export SEED=42
 758 |     export MAX_LENGTH=2048
 759 |     export NR_CONCATS=3
 760 |     export DATASET=squad_long
 761 |     export MODEL_DIR=/workspace/models
 762 |     export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
 763 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 764 |     export LOG_DIR=/workspace/logs
 765 |     export DATA_DIR=/workspace/data
 766 |     # Debugging
 767 |     CUDA_LAUNCH_BLOCKING=1
 768 |     # model args
 769 |     make repl run="scripts/finetune_qa_models.py \
 770 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 771 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 772 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 773 |         --dataset $DATASET \
 774 |         --data_dir $DATA_DIR \
 775 |         --seed $SEED \
 776 |         --num_train_epochs 3 \
 777 |         --learning_rate 3e-5 \
 778 |         --logging_steps 50 \
 779 |         --eval_steps 50 \
 780 |         --save_steps 1000  \
 781 |         --per_device_train_batch_size 1 \
 782 |         --per_device_eval_batch_size 32  \
 783 |         --gradient_accumulation_steps 32 \
 784 |         --overwrite_output_dir \
 785 |         --evaluate_during_training \
 786 |         --fp16 \
 787 |         --do_train \
 788 |         --do_eval \
 789 |         --do_lowercase \
 790 |         --nr_concats $NR_CONCATS \
 791 |         --max_length $MAX_LENGTH \
 792 |     "
 793 | 
 794 | 
 795 | </p>
 796 | </details>
 797 | 
 798 | <details><summary><b>TODO TriviaQA (4096)</b></summary>
 799 | <p>
 800 | </p>
 801 | </details>
 802 |         
 803 | ### Multilingual    
 804 | <details><summary><b>XQuAD</b></summary>
 805 | <p>   
 806 | 
 807 | ##### RoBERTa   
 808 | 
 809 |     export SEED=42
 810 |     export DATASET=xquad
 811 |     export MODEL_DIR=/workspace/models
 812 |     export MODEL_NAME_OR_PATH=roberta-base
 813 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 814 |     export LOG_DIR=/workspace/logs
 815 |     export DATA_DIR=/workspace/data
 816 |     # Debugging
 817 |     CUDA_LAUNCH_BLOCKING=1
 818 |     # model args
 819 |     make repl run="scripts/finetune_qa_models.py \
 820 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 821 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 822 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 823 |         --dataset $DATASET \
 824 |         --data_dir $DATA_DIR \
 825 |         --seed $SEED \
 826 |         --num_train_epochs 3 \
 827 |         --learning_rate 3e-5 \
 828 |         --logging_steps 50 \
 829 |         --eval_steps 50 \
 830 |         --save_steps 1000  \
 831 |         --per_device_train_batch_size 4 \
 832 |         --per_device_eval_batch_size 32  \
 833 |         --gradient_accumulation_steps 8 \
 834 |         --overwrite_output_dir \
 835 |         --evaluate_during_training \
 836 |         --fp16 \
 837 |         --do_train \
 838 |         --do_eval \
 839 |         --do_lowercase \
 840 |         --max_length 512 \
 841 |     "
 842 | 
 843 | ##### XLM-R   
 844 | 
 845 |     export SEED=42
 846 |     export DATASET=xquad
 847 |     export MODEL_DIR=/workspace/models
 848 |     export MODEL_NAME_OR_PATH=xlm-roberta-base
 849 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 850 |     export LOG_DIR=/workspace/logs
 851 |     export DATA_DIR=/workspace/data
 852 |     # Debugging
 853 |     CUDA_LAUNCH_BLOCKING=1
 854 |     # model args
 855 |     make repl run="scripts/finetune_qa_models.py \
 856 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 857 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 858 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 859 |         --dataset $DATASET \
 860 |         --data_dir $DATA_DIR \
 861 |         --seed $SEED \
 862 |         --num_train_epochs 3 \
 863 |         --learning_rate 3e-5 \
 864 |         --logging_steps 50 \
 865 |         --eval_steps 50 \
 866 |         --save_steps 1000  \
 867 |         --per_device_train_batch_size 4 \
 868 |         --per_device_eval_batch_size 32  \
 869 |         --gradient_accumulation_steps 8 \
 870 |         --overwrite_output_dir \
 871 |         --evaluate_during_training \
 872 |         --fp16 \
 873 |         --do_train \
 874 |         --do_eval \
 875 |         --do_lowercase \
 876 |         --max_length 512 \
 877 |     "
 878 | 
 879 | ##### XLM-Long   
 880 | 
 881 |     export SEED=42
 882 |     export DATASET=xquad
 883 |     export MODEL_DIR=/workspace/models
 884 |     export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
 885 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 886 |     export LOG_DIR=/workspace/logs
 887 |     export DATA_DIR=/workspace/data
 888 |     # Debugging
 889 |     CUDA_LAUNCH_BLOCKING=1
 890 |     # model args
 891 |     make repl run="scripts/finetune_qa_models.py \
 892 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 893 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 894 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 895 |         --dataset $DATASET \
 896 |         --data_dir $DATA_DIR \
 897 |         --seed $SEED \
 898 |         --num_train_epochs 3 \
 899 |         --learning_rate 3e-5 \
 900 |         --logging_steps 50 \
 901 |         --eval_steps 50 \
 902 |         --save_steps 1000  \
 903 |         --per_device_train_batch_size 4 \
 904 |         --per_device_eval_batch_size 32  \
 905 |         --gradient_accumulation_steps 8 \
 906 |         --overwrite_output_dir \
 907 |         --evaluate_during_training \
 908 |         --fp16 \
 909 |         --do_train \
 910 |         --do_eval \
 911 |         --do_lowercase \
 912 |         --max_length 512 \
 913 |     "
 914 | 
 915 | 
 916 | </p>
 917 | </details>
 918 | 
 919 | <details><summary><b>XQ3 (512)</b></summary>
 920 | <p>  
 921 | 
 922 | ##### XLM-R   
 923 | 
 924 |     export SEED=42
 925 |     export MAX_LENGTH=512
 926 |     export NR_CONCATS=1
 927 |     export DATASET=xquad_long
 928 |     export MODEL_DIR=/workspace/models
 929 |     export MODEL_NAME_OR_PATH=xlm-roberta-base
 930 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 931 |     export LOG_DIR=/workspace/logs
 932 |     export DATA_DIR=/workspace/data
 933 |     # Debugging
 934 |     CUDA_LAUNCH_BLOCKING=1
 935 |     # model args
 936 |     make repl run="scripts/finetune_qa_models.py \
 937 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 938 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 939 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 940 |         --dataset $DATASET \
 941 |         --data_dir $DATA_DIR \
 942 |         --seed $SEED \
 943 |         --num_train_epochs 3 \
 944 |         --learning_rate 3e-5 \
 945 |         --logging_steps 50 \
 946 |         --eval_steps 50 \
 947 |         --save_steps 1000  \
 948 |         --per_device_train_batch_size 4 \
 949 |         --per_device_eval_batch_size 32  \
 950 |         --gradient_accumulation_steps 8 \
 951 |         --overwrite_output_dir \
 952 |         --evaluate_during_training \
 953 |         --fp16 \
 954 |         --do_train \
 955 |         --do_eval \
 956 |         --do_lowercase \
 957 |         --nr_concats $NR_CONCATS \
 958 |         --max_length $MAX_LENGTH \
 959 |     "
 960 | 
 961 | 
 962 | ##### XLM-Long   
 963 | 
 964 |     export SEED=42
 965 |     export MAX_LENGTH=512
 966 |     export NR_CONCATS=1
 967 |     export DATASET=xquad_long
 968 |     export MODEL_DIR=/workspace/models
 969 |     export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
 970 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
 971 |     export LOG_DIR=/workspace/logs
 972 |     export DATA_DIR=/workspace/data
 973 |     # Debugging
 974 |     CUDA_LAUNCH_BLOCKING=1
 975 |     # model args
 976 |     make repl run="scripts/finetune_qa_models.py \
 977 |         --model_name_or_path $MODEL_NAME_OR_PATH \
 978 |         --output_dir $MODEL_DIR/$MODEL_NAME \
 979 |         --logging_dir $LOG_DIR/$MODEL_NAME \
 980 |         --dataset $DATASET \
 981 |         --data_dir $DATA_DIR \
 982 |         --seed $SEED \
 983 |         --num_train_epochs 3 \
 984 |         --learning_rate 3e-5 \
 985 |         --logging_steps 50 \
 986 |         --eval_steps 50 \
 987 |         --save_steps 1000  \
 988 |         --per_device_train_batch_size 4 \
 989 |         --per_device_eval_batch_size 32  \
 990 |         --gradient_accumulation_steps 8 \
 991 |         --overwrite_output_dir \
 992 |         --evaluate_during_training \
 993 |         --fp16 \
 994 |         --do_train \
 995 |         --do_eval \
 996 |         --do_lowercase \
 997 |         --nr_concats $NR_CONCATS \
 998 |         --max_length $MAX_LENGTH \
 999 |     "
1000 | 
1001 | 
1002 | </p>
1003 | </details>
1004 | 
1005 | <details><summary><b>XQ3 (4096)</b></summary>
1006 | <p>  
1007 | 
1008 | ##### XLM-Long   
1009 | 
1010 |     export SEED=42
1011 |     export MAX_LENGTH=2048
1012 |     export NR_CONCATS=3
1013 |     export DATASET=xquad_long
1014 |     export MODEL_DIR=/workspace/models
1015 |     export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
1016 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
1017 |     export LOG_DIR=/workspace/logs
1018 |     export DATA_DIR=/workspace/data
1019 |     # Debugging
1020 |     CUDA_LAUNCH_BLOCKING=1
1021 |     # model args
1022 |     make repl run="scripts/finetune_qa_models.py \
1023 |         --model_name_or_path $MODEL_NAME_OR_PATH \
1024 |         --output_dir $MODEL_DIR/$MODEL_NAME \
1025 |         --logging_dir $LOG_DIR/$MODEL_NAME \
1026 |         --dataset $DATASET \
1027 |         --data_dir $DATA_DIR \
1028 |         --seed $SEED \
1029 |         --num_train_epochs 3 \
1030 |         --learning_rate 3e-5 \
1031 |         --logging_steps 50 \
1032 |         --eval_steps 50 \
1033 |         --save_steps 1000  \
1034 |         --per_device_train_batch_size 1 \
1035 |         --per_device_eval_batch_size 32  \
1036 |         --gradient_accumulation_steps 32 \
1037 |         --overwrite_output_dir \
1038 |         --evaluate_during_training \
1039 |         --fp16 \
1040 |         --do_train \
1041 |         --do_eval \
1042 |         --do_lowercase \
1043 |         --nr_concats $NR_CONCATS \
1044 |         --max_length $MAX_LENGTH \
1045 |     "
1046 | 
1047 | 
1048 | </p>
1049 | </details>
1050 | 
1051 | 
1052 | <details><summary><b>MLQA</b></summary>
1053 | <p>   
1054 | 
1055 | ##### XLM-R   
1056 | 
1057 |     export SEED=42
1058 |     export DATASET=mlqa
1059 |     export MODEL_DIR=/workspace/models
1060 |     export MODEL_NAME_OR_PATH=xlm-roberta-base
1061 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
1062 |     export LOG_DIR=/workspace/logs
1063 |     export DATA_DIR=/workspace/data
1064 |     # Debugging
1065 |     CUDA_LAUNCH_BLOCKING=1
1066 |     # model args
1067 |     make repl run="scripts/finetune_qa_models.py \
1068 |         --model_name_or_path $MODEL_NAME_OR_PATH \
1069 |         --output_dir $MODEL_DIR/$MODEL_NAME \
1070 |         --logging_dir $LOG_DIR/$MODEL_NAME \
1071 |         --dataset $DATASET \
1072 |         --data_dir $DATA_DIR \
1073 |         --seed $SEED \
1074 |         --num_train_epochs 3 \
1075 |         --learning_rate 3e-5 \
1076 |         --logging_steps 50 \
1077 |         --eval_steps 50 \
1078 |         --save_steps 1000  \
1079 |         --per_device_train_batch_size 4 \
1080 |         --per_device_eval_batch_size 32  \
1081 |         --gradient_accumulation_steps 8 \
1082 |         --overwrite_output_dir \
1083 |         --evaluate_during_training \
1084 |         --fp16 \
1085 |         --do_train \
1086 |         --do_eval \
1087 |         --do_lowercase \
1088 |         --max_length 512 \
1089 |     "
1090 | 
1091 | 
1092 | ##### XLM-Long   
1093 | 
1094 |     export SEED=42
1095 |     export DATASET=mlqa
1096 |     export MODEL_DIR=/workspace/models
1097 |     export MODEL_NAME_OR_PATH=$MODEL_DIR/xlm-roberta-base-long
1098 |     export MODEL_NAME=$MODEL_NAME_OR_PATH-seed-$SEED-on-$DATASET
1099 |     export LOG_DIR=/workspace/logs
1100 |     export DATA_DIR=/workspace/data
1101 |     # Debugging
1102 |     CUDA_LAUNCH_BLOCKING=1
1103 |     # model args
1104 |     make repl run="scripts/finetune_qa_models.py \
1105 |         --model_name_or_path $MODEL_NAME_OR_PATH \
1106 |         --output_dir $MODEL_DIR/$MODEL_NAME \
1107 |         --logging_dir $LOG_DIR/$MODEL_NAME \
1108 |         --dataset $DATASET \
1109 |         --data_dir $DATA_DIR \
1110 |         --seed $SEED \
1111 |         --num_train_epochs 3 \
1112 |         --learning_rate 3e-5 \
1113 |         --logging_steps 50 \
1114 |         --eval_steps 50 \
1115 |         --save_steps 1000  \
1116 |         --per_device_train_batch_size 4 \
1117 |         --per_device_eval_batch_size 32  \
1118 |         --gradient_accumulation_steps 8 \
1119 |         --overwrite_output_dir \
1120 |         --evaluate_during_training \
1121 |         --fp16 \
1122 |         --do_train \
1123 |         --do_eval \
1124 |         --do_lowercase \
1125 |         --max_length 512 \
1126 |     "
1127 | 
1128 | 
1129 | 
1130 | 
1131 | </p>
1132 | </details>
1133 | 
1134 | 
1135 | </p>
1136 | </details>
1137 | 
1138 | ## Acknowledgment   
1139 | Many thanks to the [Longformer Authors](https://github.com/allenai/longformer) for providing reproducible training scripts and Huggingface for open-sourcing their models and frameworks. I would like to thank my supervisor at Peltarion Philipp Eisen for his invaluable feedback, insight and availability. Thank you Professor Joakim Nivre for insightful and thorough feedback and for taking the time out of your busy schedule. A massive thank you to all the wonderful people at Peltarion for the opportunity to work on such an interesting project.   
1140 | 
1141 | ## Citation   
1142 | You can read the report [here](http://www.diva-portal.org/smash/get/diva2:1545786/FULLTEXT02.pdf)  
1143 | ```
1144 | @mastersthesis{Sagen1545786,
1145 |    author = {Sagen, Markus},
1146 |    institution = {Uppsala University, Department of Information Technology},
1147 |    pages = {45},
1148 |    school = {Uppsala University, Department of Information Technology},
1149 |    title = {Large-Context Question Answering with Cross-Lingual Transfer},
1150 |    series = {UPTEC IT},
1151 |    ISSN = {1401-5749},
1152 |    number = {21003},
1153 |    year = {2021}
1154 | }
1155 | ```   
1156 | 
1157 | 
1158 | ## Contact   
1159 |    
1160 | > The model weights and config for the XLM-Long are available [at Huggingface](https://huggingface.co/markussagen/xlm-roberta-longformer-base-4096).   
1161 | > Import as model_name: `markussagen/xlm-roberta-longformer-base-4096`  
1162 | 
1163 | For questions regarding the code or the master thesis in general add an issue in the repo or contact:   
1164 | [markus.john.sagen@gmail.com](mailto:markus.john.sagen@gmail.com)
1165 | 
1166 | ## TODO
1167 | - Include plots and table for the evaluations
1168 | - Create bash scripts to fine-tune models on all seeds. Just send in model name 
1169 | 


--------------------------------------------------------------------------------
/notebooks/Try Train Longformer SQuAD.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": null,
   6 |    "metadata": {},
   7 |    "outputs": [],
   8 |    "source": []
   9 |   },
  10 |   {
  11 |    "cell_type": "code",
  12 |    "execution_count": null,
  13 |    "metadata": {},
  14 |    "outputs": [],
  15 |    "source": []
  16 |   },
  17 |   {
  18 |    "cell_type": "code",
  19 |    "execution_count": 1,
  20 |    "metadata": {},
  21 |    "outputs": [
  22 |     {
  23 |      "name": "stderr",
  24 |      "output_type": "stream",
  25 |      "text": [
  26 |       "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m W&B installed but not logged in.  Run `wandb login` or set the WANDB_API_KEY env variable.\n"
  27 |      ]
  28 |     }
  29 |    ],
  30 |    "source": [
  31 |     "import torch\n",
  32 |     "import datasets as nlp\n",
  33 |     "from transformers import LongformerTokenizerFast"
  34 |    ]
  35 |   },
  36 |   {
  37 |    "cell_type": "code",
  38 |    "execution_count": 2,
  39 |    "metadata": {},
  40 |    "outputs": [
  41 |     {
  42 |      "data": {
  43 |       "application/vnd.jupyter.widget-view+json": {
  44 |        "model_id": "63522ed3effe4cba996db1224652e8a0",
  45 |        "version_major": 2,
  46 |        "version_minor": 0
  47 |       },
  48 |       "text/plain": [
  49 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…"
  50 |       ]
  51 |      },
  52 |      "metadata": {},
  53 |      "output_type": "display_data"
  54 |     },
  55 |     {
  56 |      "name": "stdout",
  57 |      "output_type": "stream",
  58 |      "text": [
  59 |       "\n"
  60 |      ]
  61 |     },
  62 |     {
  63 |      "data": {
  64 |       "application/vnd.jupyter.widget-view+json": {
  65 |        "model_id": "1b86554d789b47ffba0ea4b4bd7bd6ac",
  66 |        "version_major": 2,
  67 |        "version_minor": 0
  68 |       },
  69 |       "text/plain": [
  70 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…"
  71 |       ]
  72 |      },
  73 |      "metadata": {},
  74 |      "output_type": "display_data"
  75 |     },
  76 |     {
  77 |      "name": "stdout",
  78 |      "output_type": "stream",
  79 |      "text": [
  80 |       "\n"
  81 |      ]
  82 |     }
  83 |    ],
  84 |    "source": [
  85 |     "tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')"
  86 |    ]
  87 |   },
  88 |   {
  89 |    "cell_type": "code",
  90 |    "execution_count": 3,
  91 |    "metadata": {},
  92 |    "outputs": [],
  93 |    "source": [
  94 |     "def get_correct_alignement(context, answer):\n",
  95 |     "    \"\"\" Some original examples in SQuAD have indices wrong by 1 or 2 character. We test and fix this here. \"\"\"\n",
  96 |     "    gold_text = answer['text'][0]\n",
  97 |     "    start_idx = answer['answer_start'][0]\n",
  98 |     "    end_idx = start_idx + len(gold_text)\n",
  99 |     "    if context[start_idx:end_idx] == gold_text:\n",
 100 |     "        return start_idx, end_idx       # When the gold label position is good\n",
 101 |     "    elif context[start_idx-1:end_idx-1] == gold_text:\n",
 102 |     "        return start_idx-1, end_idx-1   # When the gold label is off by one character\n",
 103 |     "    elif context[start_idx-2:end_idx-2] == gold_text:\n",
 104 |     "        return start_idx-2, end_idx-2   # When the gold label is off by two character\n",
 105 |     "    else:\n",
 106 |     "        raise ValueError()\n",
 107 |     "\n",
 108 |     "# Tokenize our training dataset\n",
 109 |     "def convert_to_features(example):\n",
 110 |     "    # Tokenize contexts and questions (as pairs of inputs)\n",
 111 |     "    encodings = tokenizer.encode_plus(example['question'], example['context'], pad_to_max_length=True, max_length=512, truncation=True)\n",
 112 |     "    context_encodings = tokenizer.encode_plus(example['context'])\n",
 113 |     "    \n",
 114 |     "\n",
 115 |     "    # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methodes.\n",
 116 |     "    # this will give us the position of answer span in the context text\n",
 117 |     "    start_idx, end_idx = get_correct_alignement(example['context'], example['answers'])\n",
 118 |     "    start_positions_context = context_encodings.char_to_token(start_idx)\n",
 119 |     "    end_positions_context = context_encodings.char_to_token(end_idx-1)\n",
 120 |     "\n",
 121 |     "    # here we will compute the start and end position of the answer in the whole example\n",
 122 |     "    # as the example is encoded like this <s> question</s></s> context</s>\n",
 123 |     "    # and we know the postion of the answer in the context\n",
 124 |     "    # we can just find out the index of the sep token and then add that to position + 1 (+1 because there are two sep tokens)\n",
 125 |     "    # this will give us the position of the answer span in whole example \n",
 126 |     "    sep_idx = encodings['input_ids'].index(tokenizer.sep_token_id)\n",
 127 |     "    start_positions = start_positions_context + sep_idx + 1\n",
 128 |     "    end_positions = end_positions_context + sep_idx + 1\n",
 129 |     "\n",
 130 |     "    if end_positions > 512:\n",
 131 |     "        start_positions, end_positions = 0, 0\n",
 132 |     "\n",
 133 |     "    encodings.update({'start_positions': start_positions,\n",
 134 |     "                      'end_positions': end_positions,\n",
 135 |     "                      'attention_mask': encodings['attention_mask']})\n",
 136 |     "    return encodings"
 137 |    ]
 138 |   },
 139 |   {
 140 |    "cell_type": "code",
 141 |    "execution_count": 4,
 142 |    "metadata": {},
 143 |    "outputs": [
 144 |     {
 145 |      "data": {
 146 |       "application/vnd.jupyter.widget-view+json": {
 147 |        "model_id": "ca4e9622c3254a46b42130e280594080",
 148 |        "version_major": 2,
 149 |        "version_minor": 0
 150 |       },
 151 |       "text/plain": [
 152 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2043.0, style=ProgressStyle(description…"
 153 |       ]
 154 |      },
 155 |      "metadata": {},
 156 |      "output_type": "display_data"
 157 |     },
 158 |     {
 159 |      "name": "stdout",
 160 |      "output_type": "stream",
 161 |      "text": [
 162 |       "\n"
 163 |      ]
 164 |     },
 165 |     {
 166 |      "data": {
 167 |       "application/vnd.jupyter.widget-view+json": {
 168 |        "model_id": "36d24978dfd84650b485d2ebbddaa6c3",
 169 |        "version_major": 2,
 170 |        "version_minor": 0
 171 |       },
 172 |       "text/plain": [
 173 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=986.0, style=ProgressStyle(description_…"
 174 |       ]
 175 |      },
 176 |      "metadata": {},
 177 |      "output_type": "display_data"
 178 |     },
 179 |     {
 180 |      "name": "stdout",
 181 |      "output_type": "stream",
 182 |      "text": [
 183 |       "\n",
 184 |       "Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.75 MiB, post-processed: Unknown size, total: 119.27 MiB) to /.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41...\n"
 185 |      ]
 186 |     },
 187 |     {
 188 |      "data": {
 189 |       "application/vnd.jupyter.widget-view+json": {
 190 |        "model_id": "f94edb441c714bce84e7bb0a574c6823",
 191 |        "version_major": 2,
 192 |        "version_minor": 0
 193 |       },
 194 |       "text/plain": [
 195 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=8116577.0, style=ProgressStyle(descript…"
 196 |       ]
 197 |      },
 198 |      "metadata": {},
 199 |      "output_type": "display_data"
 200 |     },
 201 |     {
 202 |      "name": "stdout",
 203 |      "output_type": "stream",
 204 |      "text": [
 205 |       "\n"
 206 |      ]
 207 |     },
 208 |     {
 209 |      "data": {
 210 |       "application/vnd.jupyter.widget-view+json": {
 211 |        "model_id": "351a3e7b4fc241f1a4e983f2a14767a0",
 212 |        "version_major": 2,
 213 |        "version_minor": 0
 214 |       },
 215 |       "text/plain": [
 216 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1054280.0, style=ProgressStyle(descript…"
 217 |       ]
 218 |      },
 219 |      "metadata": {},
 220 |      "output_type": "display_data"
 221 |     },
 222 |     {
 223 |      "name": "stdout",
 224 |      "output_type": "stream",
 225 |      "text": [
 226 |       "\n"
 227 |      ]
 228 |     },
 229 |     {
 230 |      "data": {
 231 |       "application/vnd.jupyter.widget-view+json": {
 232 |        "model_id": "5102dc040b554a53b433aafa90d8f75d",
 233 |        "version_major": 2,
 234 |        "version_minor": 0
 235 |       },
 236 |       "text/plain": [
 237 |        "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))"
 238 |       ]
 239 |      },
 240 |      "metadata": {},
 241 |      "output_type": "display_data"
 242 |     },
 243 |     {
 244 |      "name": "stdout",
 245 |      "output_type": "stream",
 246 |      "text": [
 247 |       "\r"
 248 |      ]
 249 |     },
 250 |     {
 251 |      "data": {
 252 |       "application/vnd.jupyter.widget-view+json": {
 253 |        "model_id": "bf4ce91a4e0c4b379f49a431155bb052",
 254 |        "version_major": 2,
 255 |        "version_minor": 0
 256 |       },
 257 |       "text/plain": [
 258 |        "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))"
 259 |       ]
 260 |      },
 261 |      "metadata": {},
 262 |      "output_type": "display_data"
 263 |     },
 264 |     {
 265 |      "name": "stdout",
 266 |      "output_type": "stream",
 267 |      "text": [
 268 |       "Dataset squad downloaded and prepared to /.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41. Subsequent calls will reuse this data.\n"
 269 |      ]
 270 |     },
 271 |     {
 272 |      "name": "stderr",
 273 |      "output_type": "stream",
 274 |      "text": [
 275 |       "Reusing dataset squad (/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)\n",
 276 |       "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:1773: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
 277 |       "  FutureWarning,\n"
 278 |      ]
 279 |     },
 280 |     {
 281 |      "data": {
 282 |       "application/vnd.jupyter.widget-view+json": {
 283 |        "model_id": "cdb0696df27646d2a7293feb6235a180",
 284 |        "version_major": 2,
 285 |        "version_minor": 0
 286 |       },
 287 |       "text/plain": [
 288 |        "HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))"
 289 |       ]
 290 |      },
 291 |      "metadata": {},
 292 |      "output_type": "display_data"
 293 |     },
 294 |     {
 295 |      "name": "stdout",
 296 |      "output_type": "stream",
 297 |      "text": [
 298 |       "\n"
 299 |      ]
 300 |     },
 301 |     {
 302 |      "data": {
 303 |       "application/vnd.jupyter.widget-view+json": {
 304 |        "model_id": "cefbdc9d6e184b26940367884b1ffcc4",
 305 |        "version_major": 2,
 306 |        "version_minor": 0
 307 |       },
 308 |       "text/plain": [
 309 |        "HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))"
 310 |       ]
 311 |      },
 312 |      "metadata": {},
 313 |      "output_type": "display_data"
 314 |     },
 315 |     {
 316 |      "name": "stdout",
 317 |      "output_type": "stream",
 318 |      "text": [
 319 |       "\n"
 320 |      ]
 321 |     }
 322 |    ],
 323 |    "source": [
 324 |     "# load train and validation split of squad\n",
 325 |     "train_dataset  = nlp.load_dataset('squad', split='train')\n",
 326 |     "valid_dataset = nlp.load_dataset('squad', split='validation')\n",
 327 |     "\n",
 328 |     "# Temp. Only for testing quickly\n",
 329 |     "train_dataset = nlp.Dataset.from_dict(train_dataset[:3])\n",
 330 |     "valid_dataset = nlp.Dataset.from_dict(valid_dataset[:3])\n",
 331 |     "\n",
 332 |     "train_dataset = train_dataset.map(convert_to_features)\n",
 333 |     "valid_dataset = valid_dataset.map(convert_to_features, load_from_cache_file=False)\n",
 334 |     "\n",
 335 |     "\n",
 336 |     "# set the tensor type and the columns which the dataset should return\n",
 337 |     "columns = ['input_ids', 'attention_mask', 'start_positions', 'end_positions']\n",
 338 |     "train_dataset.set_format(type='torch', columns=columns)\n",
 339 |     "valid_dataset.set_format(type='torch', columns=columns)"
 340 |    ]
 341 |   },
 342 |   {
 343 |    "cell_type": "code",
 344 |    "execution_count": 5,
 345 |    "metadata": {},
 346 |    "outputs": [
 347 |     {
 348 |      "data": {
 349 |       "text/plain": [
 350 |        "(3, 3)"
 351 |       ]
 352 |      },
 353 |      "execution_count": 5,
 354 |      "metadata": {},
 355 |      "output_type": "execute_result"
 356 |     }
 357 |    ],
 358 |    "source": [
 359 |     "len(train_dataset), len(valid_dataset)"
 360 |    ]
 361 |   },
 362 |   {
 363 |    "cell_type": "code",
 364 |    "execution_count": 6,
 365 |    "metadata": {},
 366 |    "outputs": [],
 367 |    "source": []
 368 |   },
 369 |   {
 370 |    "cell_type": "code",
 371 |    "execution_count": 7,
 372 |    "metadata": {},
 373 |    "outputs": [],
 374 |    "source": [
 375 |     "t = torch.load('train_data.pt')"
 376 |    ]
 377 |   },
 378 |   {
 379 |    "cell_type": "code",
 380 |    "execution_count": null,
 381 |    "metadata": {},
 382 |    "outputs": [],
 383 |    "source": []
 384 |   },
 385 |   {
 386 |    "cell_type": "code",
 387 |    "execution_count": null,
 388 |    "metadata": {},
 389 |    "outputs": [],
 390 |    "source": []
 391 |   },
 392 |   {
 393 |    "cell_type": "code",
 394 |    "execution_count": 8,
 395 |    "metadata": {},
 396 |    "outputs": [],
 397 |    "source": [
 398 |     "# Write training script"
 399 |    ]
 400 |   },
 401 |   {
 402 |    "cell_type": "code",
 403 |    "execution_count": null,
 404 |    "metadata": {},
 405 |    "outputs": [],
 406 |    "source": []
 407 |   },
 408 |   {
 409 |    "cell_type": "code",
 410 |    "execution_count": 9,
 411 |    "metadata": {},
 412 |    "outputs": [],
 413 |    "source": []
 414 |   },
 415 |   {
 416 |    "cell_type": "code",
 417 |    "execution_count": 10,
 418 |    "metadata": {},
 419 |    "outputs": [],
 420 |    "source": []
 421 |   },
 422 |   {
 423 |    "cell_type": "code",
 424 |    "execution_count": 11,
 425 |    "metadata": {},
 426 |    "outputs": [],
 427 |    "source": [
 428 |     "import json\n",
 429 |     "\n",
 430 |     "args_dict = {\n",
 431 |     "  \"n_gpu\": 1,\n",
 432 |     "  \"model_name_or_path\": 'allenai/longformer-base-4096',\n",
 433 |     "  \"max_len\": 512 ,\n",
 434 |     "  \"output_dir\": './models',\n",
 435 |     "  \"overwrite_output_dir\": True,\n",
 436 |     "  \"per_gpu_train_batch_size\": 8,\n",
 437 |     "  \"per_gpu_eval_batch_size\": 8,\n",
 438 |     "  \"gradient_accumulation_steps\": 16,\n",
 439 |     "  \"learning_rate\": 1e-4,\n",
 440 |     "  \"num_train_epochs\": 3,\n",
 441 |     "  \"do_train\": True\n",
 442 |     "}"
 443 |    ]
 444 |   },
 445 |   {
 446 |    "cell_type": "code",
 447 |    "execution_count": 12,
 448 |    "metadata": {},
 449 |    "outputs": [],
 450 |    "source": []
 451 |   },
 452 |   {
 453 |    "cell_type": "code",
 454 |    "execution_count": null,
 455 |    "metadata": {},
 456 |    "outputs": [],
 457 |    "source": []
 458 |   },
 459 |   {
 460 |    "cell_type": "code",
 461 |    "execution_count": 14,
 462 |    "metadata": {},
 463 |    "outputs": [],
 464 |    "source": [
 465 |     "## SQuAD evaluation script. Modifed slightly for this notebook\n",
 466 |     "\n",
 467 |     "from __future__ import print_function\n",
 468 |     "from collections import Counter\n",
 469 |     "import string\n",
 470 |     "import re\n",
 471 |     "import argparse\n",
 472 |     "import json\n",
 473 |     "import sys\n",
 474 |     "\n",
 475 |     "\n",
 476 |     "def normalize_answer(s):\n",
 477 |     "    \"\"\"Lower text and remove punctuation, articles and extra whitespace.\"\"\"\n",
 478 |     "    def remove_articles(text):\n",
 479 |     "        return re.sub(r'\\b(a|an|the)\\b', ' ', text)\n",
 480 |     "\n",
 481 |     "    def white_space_fix(text):\n",
 482 |     "        return ' '.join(text.split())\n",
 483 |     "\n",
 484 |     "    def remove_punc(text):\n",
 485 |     "        exclude = set(string.punctuation)\n",
 486 |     "        return ''.join(ch for ch in text if ch not in exclude)\n",
 487 |     "\n",
 488 |     "    def lower(text):\n",
 489 |     "        return text.lower()\n",
 490 |     "\n",
 491 |     "    return white_space_fix(remove_articles(remove_punc(lower(s))))\n",
 492 |     "\n",
 493 |     "\n",
 494 |     "def f1_score(prediction, ground_truth):\n",
 495 |     "    prediction_tokens = normalize_answer(prediction).split()\n",
 496 |     "    ground_truth_tokens = normalize_answer(ground_truth).split()\n",
 497 |     "    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)\n",
 498 |     "    num_same = sum(common.values())\n",
 499 |     "    if num_same == 0:\n",
 500 |     "        return 0\n",
 501 |     "    precision = 1.0 * num_same / len(prediction_tokens)\n",
 502 |     "    recall = 1.0 * num_same / len(ground_truth_tokens)\n",
 503 |     "    f1 = (2 * precision * recall) / (precision + recall)\n",
 504 |     "    return f1\n",
 505 |     "\n",
 506 |     "\n",
 507 |     "def exact_match_score(prediction, ground_truth):\n",
 508 |     "    return (normalize_answer(prediction) == normalize_answer(ground_truth))\n",
 509 |     "\n",
 510 |     "\n",
 511 |     "def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):\n",
 512 |     "    scores_for_ground_truths = []\n",
 513 |     "    for ground_truth in ground_truths:\n",
 514 |     "        score = metric_fn(prediction, ground_truth)\n",
 515 |     "        scores_for_ground_truths.append(score)\n",
 516 |     "    return max(scores_for_ground_truths)\n",
 517 |     "\n",
 518 |     "\n",
 519 |     "def evaluate(gold_answers, predictions):\n",
 520 |     "    f1 = exact_match = total = 0\n",
 521 |     "\n",
 522 |     "    for ground_truths, prediction in zip(gold_answers, predictions):\n",
 523 |     "        total += 1\n",
 524 |     "        exact_match += metric_max_over_ground_truths(\n",
 525 |     "                    exact_match_score, prediction, ground_truths)\n",
 526 |     "        f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)\n",
 527 |     "    \n",
 528 |     "    exact_match = 100.0 * exact_match / total\n",
 529 |     "    f1 = 100.0 * f1 / total\n",
 530 |     "\n",
 531 |     "    return {'exact_match': exact_match, 'f1': f1}"
 532 |    ]
 533 |   },
 534 |   {
 535 |    "cell_type": "code",
 536 |    "execution_count": 15,
 537 |    "metadata": {},
 538 |    "outputs": [],
 539 |    "source": [
 540 |     "import torch\n",
 541 |     "from transformers import LongformerTokenizerFast, LongformerForQuestionAnswering\n",
 542 |     "from tqdm.auto import tqdm"
 543 |    ]
 544 |   },
 545 |   {
 546 |    "cell_type": "code",
 547 |    "execution_count": 16,
 548 |    "metadata": {},
 549 |    "outputs": [
 550 |     {
 551 |      "data": {
 552 |       "text/plain": [
 553 |        "LongformerForQuestionAnswering(\n",
 554 |        "  (longformer): LongformerModel(\n",
 555 |        "    (embeddings): LongformerEmbeddings(\n",
 556 |        "      (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
 557 |        "      (position_embeddings): Embedding(4098, 768, padding_idx=1)\n",
 558 |        "      (token_type_embeddings): Embedding(1, 768)\n",
 559 |        "      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 560 |        "      (dropout): Dropout(p=0.1, inplace=False)\n",
 561 |        "    )\n",
 562 |        "    (encoder): LongformerEncoder(\n",
 563 |        "      (layer): ModuleList(\n",
 564 |        "        (0): LongformerLayer(\n",
 565 |        "          (attention): LongformerAttention(\n",
 566 |        "            (self): LongformerSelfAttention(\n",
 567 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 568 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 569 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 570 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 571 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 572 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 573 |        "            )\n",
 574 |        "            (output): LongformerSelfOutput(\n",
 575 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 576 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 577 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 578 |        "            )\n",
 579 |        "          )\n",
 580 |        "          (intermediate): LongformerIntermediate(\n",
 581 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 582 |        "          )\n",
 583 |        "          (output): LongformerOutput(\n",
 584 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 585 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 586 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 587 |        "          )\n",
 588 |        "        )\n",
 589 |        "        (1): LongformerLayer(\n",
 590 |        "          (attention): LongformerAttention(\n",
 591 |        "            (self): LongformerSelfAttention(\n",
 592 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 593 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 594 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 595 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 596 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 597 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 598 |        "            )\n",
 599 |        "            (output): LongformerSelfOutput(\n",
 600 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 601 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 602 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 603 |        "            )\n",
 604 |        "          )\n",
 605 |        "          (intermediate): LongformerIntermediate(\n",
 606 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 607 |        "          )\n",
 608 |        "          (output): LongformerOutput(\n",
 609 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 610 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 611 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 612 |        "          )\n",
 613 |        "        )\n",
 614 |        "        (2): LongformerLayer(\n",
 615 |        "          (attention): LongformerAttention(\n",
 616 |        "            (self): LongformerSelfAttention(\n",
 617 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 618 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 619 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 620 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 621 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 622 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 623 |        "            )\n",
 624 |        "            (output): LongformerSelfOutput(\n",
 625 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 626 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 627 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 628 |        "            )\n",
 629 |        "          )\n",
 630 |        "          (intermediate): LongformerIntermediate(\n",
 631 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 632 |        "          )\n",
 633 |        "          (output): LongformerOutput(\n",
 634 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 635 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 636 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 637 |        "          )\n",
 638 |        "        )\n",
 639 |        "        (3): LongformerLayer(\n",
 640 |        "          (attention): LongformerAttention(\n",
 641 |        "            (self): LongformerSelfAttention(\n",
 642 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 643 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 644 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 645 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 646 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 647 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 648 |        "            )\n",
 649 |        "            (output): LongformerSelfOutput(\n",
 650 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 651 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 652 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 653 |        "            )\n",
 654 |        "          )\n",
 655 |        "          (intermediate): LongformerIntermediate(\n",
 656 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 657 |        "          )\n",
 658 |        "          (output): LongformerOutput(\n",
 659 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 660 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 661 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 662 |        "          )\n",
 663 |        "        )\n",
 664 |        "        (4): LongformerLayer(\n",
 665 |        "          (attention): LongformerAttention(\n",
 666 |        "            (self): LongformerSelfAttention(\n",
 667 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 668 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 669 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 670 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 671 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 672 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 673 |        "            )\n",
 674 |        "            (output): LongformerSelfOutput(\n",
 675 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 676 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 677 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 678 |        "            )\n",
 679 |        "          )\n",
 680 |        "          (intermediate): LongformerIntermediate(\n",
 681 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 682 |        "          )\n",
 683 |        "          (output): LongformerOutput(\n",
 684 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 685 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 686 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 687 |        "          )\n",
 688 |        "        )\n",
 689 |        "        (5): LongformerLayer(\n",
 690 |        "          (attention): LongformerAttention(\n",
 691 |        "            (self): LongformerSelfAttention(\n",
 692 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 693 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 694 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 695 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 696 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 697 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 698 |        "            )\n",
 699 |        "            (output): LongformerSelfOutput(\n",
 700 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 701 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 702 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 703 |        "            )\n",
 704 |        "          )\n",
 705 |        "          (intermediate): LongformerIntermediate(\n",
 706 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 707 |        "          )\n",
 708 |        "          (output): LongformerOutput(\n",
 709 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 710 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 711 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 712 |        "          )\n",
 713 |        "        )\n",
 714 |        "        (6): LongformerLayer(\n",
 715 |        "          (attention): LongformerAttention(\n",
 716 |        "            (self): LongformerSelfAttention(\n",
 717 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 718 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 719 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 720 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 721 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 722 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 723 |        "            )\n",
 724 |        "            (output): LongformerSelfOutput(\n",
 725 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 726 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 727 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 728 |        "            )\n",
 729 |        "          )\n",
 730 |        "          (intermediate): LongformerIntermediate(\n",
 731 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 732 |        "          )\n",
 733 |        "          (output): LongformerOutput(\n",
 734 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 735 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 736 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 737 |        "          )\n",
 738 |        "        )\n",
 739 |        "        (7): LongformerLayer(\n",
 740 |        "          (attention): LongformerAttention(\n",
 741 |        "            (self): LongformerSelfAttention(\n",
 742 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 743 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 744 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 745 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 746 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 747 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 748 |        "            )\n",
 749 |        "            (output): LongformerSelfOutput(\n",
 750 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 751 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 752 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 753 |        "            )\n",
 754 |        "          )\n",
 755 |        "          (intermediate): LongformerIntermediate(\n",
 756 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 757 |        "          )\n",
 758 |        "          (output): LongformerOutput(\n",
 759 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 760 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 761 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 762 |        "          )\n",
 763 |        "        )\n",
 764 |        "        (8): LongformerLayer(\n",
 765 |        "          (attention): LongformerAttention(\n",
 766 |        "            (self): LongformerSelfAttention(\n",
 767 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 768 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 769 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 770 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 771 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 772 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 773 |        "            )\n",
 774 |        "            (output): LongformerSelfOutput(\n",
 775 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 776 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 777 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 778 |        "            )\n",
 779 |        "          )\n",
 780 |        "          (intermediate): LongformerIntermediate(\n",
 781 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 782 |        "          )\n",
 783 |        "          (output): LongformerOutput(\n",
 784 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 785 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 786 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 787 |        "          )\n",
 788 |        "        )\n",
 789 |        "        (9): LongformerLayer(\n",
 790 |        "          (attention): LongformerAttention(\n",
 791 |        "            (self): LongformerSelfAttention(\n",
 792 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 793 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 794 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 795 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 796 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 797 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 798 |        "            )\n",
 799 |        "            (output): LongformerSelfOutput(\n",
 800 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 801 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 802 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 803 |        "            )\n",
 804 |        "          )\n",
 805 |        "          (intermediate): LongformerIntermediate(\n",
 806 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 807 |        "          )\n",
 808 |        "          (output): LongformerOutput(\n",
 809 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 810 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 811 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 812 |        "          )\n",
 813 |        "        )\n",
 814 |        "        (10): LongformerLayer(\n",
 815 |        "          (attention): LongformerAttention(\n",
 816 |        "            (self): LongformerSelfAttention(\n",
 817 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 818 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 819 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 820 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 821 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 822 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 823 |        "            )\n",
 824 |        "            (output): LongformerSelfOutput(\n",
 825 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 826 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 827 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 828 |        "            )\n",
 829 |        "          )\n",
 830 |        "          (intermediate): LongformerIntermediate(\n",
 831 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 832 |        "          )\n",
 833 |        "          (output): LongformerOutput(\n",
 834 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 835 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 836 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 837 |        "          )\n",
 838 |        "        )\n",
 839 |        "        (11): LongformerLayer(\n",
 840 |        "          (attention): LongformerAttention(\n",
 841 |        "            (self): LongformerSelfAttention(\n",
 842 |        "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
 843 |        "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
 844 |        "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
 845 |        "              (query_global): Linear(in_features=768, out_features=768, bias=True)\n",
 846 |        "              (key_global): Linear(in_features=768, out_features=768, bias=True)\n",
 847 |        "              (value_global): Linear(in_features=768, out_features=768, bias=True)\n",
 848 |        "            )\n",
 849 |        "            (output): LongformerSelfOutput(\n",
 850 |        "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
 851 |        "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 852 |        "              (dropout): Dropout(p=0.1, inplace=False)\n",
 853 |        "            )\n",
 854 |        "          )\n",
 855 |        "          (intermediate): LongformerIntermediate(\n",
 856 |        "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
 857 |        "          )\n",
 858 |        "          (output): LongformerOutput(\n",
 859 |        "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
 860 |        "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
 861 |        "            (dropout): Dropout(p=0.1, inplace=False)\n",
 862 |        "          )\n",
 863 |        "        )\n",
 864 |        "      )\n",
 865 |        "    )\n",
 866 |        "  )\n",
 867 |        "  (qa_outputs): Linear(in_features=768, out_features=2, bias=True)\n",
 868 |        ")"
 869 |       ]
 870 |      },
 871 |      "execution_count": 16,
 872 |      "metadata": {},
 873 |      "output_type": "execute_result"
 874 |     }
 875 |    ],
 876 |    "source": [
 877 |     "tokenizer = LongformerTokenizerFast.from_pretrained('models')\n",
 878 |     "model = LongformerForQuestionAnswering.from_pretrained('models')\n",
 879 |     "model = model.cuda()\n",
 880 |     "model.eval()"
 881 |    ]
 882 |   },
 883 |   {
 884 |    "cell_type": "code",
 885 |    "execution_count": 17,
 886 |    "metadata": {},
 887 |    "outputs": [],
 888 |    "source": []
 889 |   },
 890 |   {
 891 |    "cell_type": "code",
 892 |    "execution_count": 18,
 893 |    "metadata": {},
 894 |    "outputs": [],
 895 |    "source": []
 896 |   },
 897 |   {
 898 |    "cell_type": "code",
 899 |    "execution_count": null,
 900 |    "metadata": {},
 901 |    "outputs": [],
 902 |    "source": []
 903 |   },
 904 |   {
 905 |    "cell_type": "code",
 906 |    "execution_count": null,
 907 |    "metadata": {},
 908 |    "outputs": [],
 909 |    "source": []
 910 |   },
 911 |   {
 912 |    "cell_type": "code",
 913 |    "execution_count": null,
 914 |    "metadata": {},
 915 |    "outputs": [],
 916 |    "source": []
 917 |   },
 918 |   {
 919 |    "cell_type": "code",
 920 |    "execution_count": null,
 921 |    "metadata": {},
 922 |    "outputs": [],
 923 |    "source": []
 924 |   },
 925 |   {
 926 |    "cell_type": "code",
 927 |    "execution_count": null,
 928 |    "metadata": {},
 929 |    "outputs": [],
 930 |    "source": []
 931 |   },
 932 |   {
 933 |    "cell_type": "code",
 934 |    "execution_count": null,
 935 |    "metadata": {},
 936 |    "outputs": [],
 937 |    "source": []
 938 |   },
 939 |   {
 940 |    "cell_type": "code",
 941 |    "execution_count": null,
 942 |    "metadata": {},
 943 |    "outputs": [],
 944 |    "source": []
 945 |   },
 946 |   {
 947 |    "cell_type": "code",
 948 |    "execution_count": null,
 949 |    "metadata": {},
 950 |    "outputs": [],
 951 |    "source": []
 952 |   },
 953 |   {
 954 |    "cell_type": "code",
 955 |    "execution_count": null,
 956 |    "metadata": {},
 957 |    "outputs": [],
 958 |    "source": []
 959 |   },
 960 |   {
 961 |    "cell_type": "code",
 962 |    "execution_count": null,
 963 |    "metadata": {},
 964 |    "outputs": [],
 965 |    "source": []
 966 |   },
 967 |   {
 968 |    "cell_type": "code",
 969 |    "execution_count": null,
 970 |    "metadata": {},
 971 |    "outputs": [],
 972 |    "source": []
 973 |   },
 974 |   {
 975 |    "cell_type": "code",
 976 |    "execution_count": null,
 977 |    "metadata": {},
 978 |    "outputs": [],
 979 |    "source": []
 980 |   },
 981 |   {
 982 |    "cell_type": "code",
 983 |    "execution_count": null,
 984 |    "metadata": {},
 985 |    "outputs": [],
 986 |    "source": []
 987 |   },
 988 |   {
 989 |    "cell_type": "code",
 990 |    "execution_count": null,
 991 |    "metadata": {},
 992 |    "outputs": [],
 993 |    "source": []
 994 |   },
 995 |   {
 996 |    "cell_type": "code",
 997 |    "execution_count": null,
 998 |    "metadata": {},
 999 |    "outputs": [],
1000 |    "source": []
1001 |   },
1002 |   {
1003 |    "cell_type": "code",
1004 |    "execution_count": null,
1005 |    "metadata": {},
1006 |    "outputs": [],
1007 |    "source": []
1008 |   },
1009 |   {
1010 |    "cell_type": "code",
1011 |    "execution_count": null,
1012 |    "metadata": {},
1013 |    "outputs": [],
1014 |    "source": []
1015 |   },
1016 |   {
1017 |    "cell_type": "code",
1018 |    "execution_count": null,
1019 |    "metadata": {},
1020 |    "outputs": [],
1021 |    "source": []
1022 |   },
1023 |   {
1024 |    "cell_type": "code",
1025 |    "execution_count": null,
1026 |    "metadata": {},
1027 |    "outputs": [],
1028 |    "source": []
1029 |   },
1030 |   {
1031 |    "cell_type": "code",
1032 |    "execution_count": null,
1033 |    "metadata": {},
1034 |    "outputs": [],
1035 |    "source": []
1036 |   },
1037 |   {
1038 |    "cell_type": "code",
1039 |    "execution_count": null,
1040 |    "metadata": {},
1041 |    "outputs": [],
1042 |    "source": []
1043 |   }
1044 |  ],
1045 |  "metadata": {
1046 |   "kernelspec": {
1047 |    "display_name": "Python 3",
1048 |    "language": "python",
1049 |    "name": "python3"
1050 |   },
1051 |   "language_info": {
1052 |    "codemirror_mode": {
1053 |     "name": "ipython",
1054 |     "version": 3
1055 |    },
1056 |    "file_extension": ".py",
1057 |    "mimetype": "text/x-python",
1058 |    "name": "python",
1059 |    "nbconvert_exporter": "python",
1060 |    "pygments_lexer": "ipython3",
1061 |    "version": "3.6.9"
1062 |   }
1063 |  },
1064 |  "nbformat": 4,
1065 |  "nbformat_minor": 4
1066 | }
1067 | 


--------------------------------------------------------------------------------