├── .idea ├── .gitignore ├── deployment.xml ├── encodings.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── remote-mappings.xml ├── sshConfigs.xml ├── vcs.xml ├── webServers.xml └── xTune.iml ├── README.md ├── scripts ├── cross-lingual-transfer │ ├── train_mlqa.sh │ ├── train_panx.sh │ ├── train_pawsx.sh │ ├── train_tydiqa.sh │ ├── train_udpos.sh │ ├── train_xnli.sh │ └── train_xquad.sh ├── download_data.sh ├── download_model.sh ├── preprocess_panx.sh ├── preprocess_udpos.sh ├── train.sh └── translate-train-all │ ├── train_mlqa.sh │ ├── train_panx.sh │ ├── train_pawsx.sh │ ├── train_tydiqa.sh │ ├── train_udpos.sh │ ├── train_xnli.sh │ └── train_xquad.sh ├── setup.py ├── src ├── pequod │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-37.pyc │ ├── data │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── sampler.cpython-37.pyc │ │ │ ├── utils_squad.cpython-37.pyc │ │ │ ├── utils_squad_evaluate.cpython-37.pyc │ │ │ ├── xdoc.cpython-37.pyc │ │ │ ├── xqa.cpython-37.pyc │ │ │ └── xretrieval.cpython-37.pyc │ │ ├── dataloader.py │ │ ├── sampler.py │ │ ├── utils_squad.py │ │ ├── utils_squad_evaluate.py │ │ ├── wili.py │ │ ├── xdoc.py │ │ ├── xqa.py │ │ └── xretrieval.py │ ├── eval │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── bretrieval.cpython-37.pyc │ │ │ ├── bucc_eval.cpython-37.pyc │ │ │ ├── evaluator.cpython-37.pyc │ │ │ ├── utils_retrieve.cpython-37.pyc │ │ │ └── xretrieval.cpython-37.pyc │ │ ├── bretrieval.py │ │ ├── evaluator.py │ │ ├── utils_retrieve.py │ │ └── xretrieval.py │ ├── io.py │ ├── model │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── roberta.cpython-37.pyc │ │ └── roberta.py │ ├── optim │ │ ├── __init__.py │ │ ├── la.py │ │ └── la0.py │ ├── text │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── tokenization_sentencepiece.cpython-37.pyc │ │ └── tokenization_sentencepiece.py │ ├── tools │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── convert.cpython-37.pyc │ │ └── convert.py │ └── training │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── trainer.cpython-37.pyc │ │ ├── trainer.py │ │ └── xtrainer.py ├── run_cls.py ├── run_qa.py ├── run_tag.py ├── tools │ ├── __init__.py │ ├── check_many2many_alignment.py │ ├── dump_hf_state_dict.py │ ├── get_eval_results.py │ ├── sample_xnli.py │ └── xnli_sampling_statistics.py ├── transformers │ ├── __init__.py │ ├── activations.py │ ├── commands │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── download.py │ │ ├── env.py │ │ ├── run.py │ │ ├── serving.py │ │ ├── train.py │ │ └── user.py │ ├── configuration_albert.py │ ├── configuration_auto.py │ ├── configuration_bart.py │ ├── configuration_bert.py │ ├── configuration_camembert.py │ ├── configuration_ctrl.py │ ├── configuration_distilbert.py │ ├── configuration_flaubert.py │ ├── configuration_gpt2.py │ ├── configuration_mmbt.py │ ├── configuration_openai.py │ ├── configuration_roberta.py │ ├── configuration_t5.py │ ├── configuration_transfo_xl.py │ ├── configuration_utils.py │ ├── configuration_xlm.py │ ├── configuration_xlm_roberta.py │ ├── configuration_xlnet.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ 
├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── convert_pytorch_checkpoint_to_tf2.py │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── data │ │ ├── __init__.py │ │ ├── metrics │ │ │ ├── __init__.py │ │ │ ├── evaluate_mlqa.py │ │ │ ├── evaluate_squad.py │ │ │ ├── mlqa_evaluation_v1.py │ │ │ └── squad_metrics.py │ │ └── processors │ │ │ ├── __init__.py │ │ │ ├── glue.py │ │ │ ├── squad.py │ │ │ ├── utils.py │ │ │ ├── xglue.py │ │ │ ├── xnli.py │ │ │ └── xtreme.py │ ├── file_utils.py │ ├── hf_api.py │ ├── modelcard.py │ ├── modeling_albert.py │ ├── modeling_auto.py │ ├── modeling_bart.py │ ├── modeling_bert.py │ ├── modeling_camembert.py │ ├── modeling_ctrl.py │ ├── modeling_distilbert.py │ ├── modeling_encoder_decoder.py │ ├── modeling_flaubert.py │ ├── modeling_gpt2.py │ ├── modeling_mmbt.py │ ├── modeling_openai.py │ ├── modeling_roberta.py │ ├── modeling_t5.py │ ├── modeling_tf_albert.py │ ├── modeling_tf_auto.py │ ├── modeling_tf_bert.py │ ├── modeling_tf_camembert.py │ ├── modeling_tf_ctrl.py │ ├── modeling_tf_distilbert.py │ ├── modeling_tf_gpt2.py │ ├── modeling_tf_openai.py │ ├── modeling_tf_pytorch_utils.py │ ├── modeling_tf_roberta.py │ ├── modeling_tf_t5.py │ ├── modeling_tf_transfo_xl.py │ ├── modeling_tf_transfo_xl_utilities.py │ ├── modeling_tf_utils.py │ ├── modeling_tf_xlm.py │ ├── modeling_tf_xlm_roberta.py │ ├── modeling_tf_xlnet.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── modeling_utils.py │ ├── modeling_xlm.py │ ├── modeling_xlm_roberta.py │ ├── modeling_xlnet.py │ ├── optimization.py │ ├── optimization_tf.py │ ├── pipelines.py │ ├── tokenization_albert.py │ ├── tokenization_auto.py │ ├── tokenization_bart.py │ ├── tokenization_bert.py │ ├── tokenization_bert_japanese.py │ ├── tokenization_camembert.py │ ├── tokenization_ctrl.py │ ├── tokenization_distilbert.py │ ├── tokenization_flaubert.py │ ├── tokenization_gpt2.py │ ├── tokenization_openai.py │ ├── tokenization_roberta.py │ ├── tokenization_t5.py │ ├── tokenization_transfo_xl.py │ ├── tokenization_utils.py │ ├── tokenization_xlm.py │ ├── tokenization_xlm_roberta.py │ ├── tokenization_xlnet.py │ └── utils_encoder_decoder.py ├── ud-conversion-tools │ ├── conllu_to_conll.py │ └── lib │ │ ├── __init__.py │ │ └── conll.py └── utils_tag.py ├── transformers-cli └── utils_preprocess.py /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /../../../../../../:\Users\v-zhebo\OneDrive - Microsoft\stabletune\.idea/dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml, /.idea/misc.xml, /.idea/modules.xml, /.idea/remote-mappings.xml, /.idea/sshConfigs.xml, /.idea/vcs.xml, /.idea/webServers.xml, /.idea/xTune.iml: (IDE project configuration files; their XML content was not preserved in this dump)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # xTune
2 |
3 | Code for the ACL 2021 paper [Consistency Regularization for Cross-Lingual Fine-Tuning](https://arxiv.org/pdf/2106.08226.pdf).
4 | ## Environment
5 |
6 | Docker image: `dancingsoul/pytorch:xTune`
7 |
8 | Install the fine-tuning code: `pip install --user .`
9 |
10 | ## Data & Model Preparation
11 |
12 | ### XTREME Datasets
13 |
14 | 1) Create a download folder with `mkdir -p download` in the root of this project.
15 | 2) Manually download `panx_dataset` (for NER) from [here][2] (note that it downloads as `AmazonPhotos.zip`) to the download directory.
16 | 3) Run the following command to download the remaining datasets: `bash scripts/download_data.sh`
17 | The download code for the XTREME datasets is taken from the [XTREME official repo][1].
18 |
19 | Note that we keep the labels in the test sets for easier evaluation. To prevent accidental evaluation on the test sets while running experiments, the [XTREME official repo][1] removes the test labels during pre-processing and changes the order of the test sentences for cross-lingual sentence retrieval.
20 | If you use the XTREME official repo, replace `csv.writer(fout, delimiter='\t')` with `csv.writer(fout, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='')` in utils_preprocess.py (see the snippet below).
21 |
22 | ### Translations
23 |
24 | XTREME provides translations for SQuAD v1.1 (train and dev only), MLQA, PAWS-X, TyDiQA-GoldP, XNLI, and XQuAD, which can be downloaded from [here][3]. The `xtreme_translations` folder should be moved to the download directory.
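For reference, here is a minimal illustration of the quoting change mentioned in the note above. The file name and example row are hypothetical; only the `csv.writer(...)` arguments come from the note itself.

```python
import csv

# Hypothetical stand-in for the test-set write loop in utils_preprocess.py.
# QUOTE_NONE with an empty quotechar writes the tab-separated fields verbatim,
# so the kept test-set labels are not altered by quoting.
rows = [["This is a premise .", "This is a hypothesis .", "neutral"]]
with open("test-en.tsv", "w", encoding="utf-8", newline="") as fout:
    writer = csv.writer(fout, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
    for row in rows:
        writer.writerow(row)
```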
25 |
26 | The target-language translations for panx and udpos are obtained with Google Translate, since they are not provided by XTREME. Our processed version can be downloaded from [here][4]. It should be merged with the `xtreme_translations` folder above.
27 |
28 | ### Bi-lingual dictionaries
29 |
30 | We obtain the bi-lingual dictionaries from the [MUSE][6] repo. For convenience, you can download them from [here][7] and move them to the download directory, i.e., `./download/dicts`.
31 |
32 | ### Models
33 |
34 | XLM-RoBERTa is supported. We use the [huggingface][5] format, which can be downloaded with `bash scripts/download_model.sh`.
35 |
36 | ## Fine-tuning Usage
37 |
38 | Our default settings use Nvidia V100-32GB GPU cards. If you run into out-of-memory errors, reduce `per_gpu_train_batch_size` while increasing `gradient_accumulation_steps`, or use multi-GPU training.
39 |
40 | xTune consists of a two-stage training process:
41 | - Stage 1: fine-tuning with example consistency on the English training set.
42 | - Stage 2: fine-tuning with example consistency on the augmented training set, while regularizing model consistency with the model from Stage 1.
43 |
44 | We recommend using both Stage 1 and Stage 2 for token-level tasks such as sequence labeling and question answering. For text classification, you can use only Stage 1 if the computation budget is limited.
45 |
46 | ```bash
47 | bash ./scripts/train.sh [setting] [dataset] [model] [stage] [gpu] [data_dir] [output_dir]
48 | ```
49 | where the options are described as follows:
50 | - `[setting]`: `translate-train-all` (use translated training data for the languages other than English) or `cross-lingual-transfer` (use only English training data for zero-shot cross-lingual transfer)
51 | - `[dataset]`: dataset names in XTREME, i.e., `xnli`, `panx`, `pawsx`, `udpos`, `mlqa`, `tydiqa`, `xquad`
52 | - `[model]`: `xlm-roberta-base`, `xlm-roberta-large`
53 | - `[stage]`: `1` (first stage), `2` (second stage)
54 | - `[gpu]`: used to set the environment variable `CUDA_VISIBLE_DEVICES`
55 | - `[data_dir]`: folder of the training data
56 | - `[output_dir]`: folder of the fine-tuning output
57 |
58 | ## Examples: XTREME Tasks
59 |
60 | ### XNLI fine-tuning on the English training set and translated training sets (`translate-train-all`)
61 |
62 | ```bash
63 | # run stage 1 of xTune
64 | bash ./scripts/train.sh translate-train-all xnli xlm-roberta-base 1
65 | # run stage 2 of xTune (optional)
66 | bash ./scripts/train.sh translate-train-all xnli xlm-roberta-base 2
67 | ```
68 |
69 | ### XNLI fine-tuning on the English training set (`cross-lingual-transfer`)
70 |
71 | ```bash
72 | # run stage 1 of xTune
73 | bash ./scripts/train.sh cross-lingual-transfer xnli xlm-roberta-base 1
74 | # run stage 2 of xTune (optional)
75 | bash ./scripts/train.sh cross-lingual-transfer xnli xlm-roberta-base 2
76 | ```
77 |
78 | ## Paper
79 | Please cite our paper `\cite{bo2021xtune}` if you find the resources in this repository useful.
80 |
81 | ```
82 | @inproceedings{bo2021xtune,
83 | author = {Bo Zheng and Li Dong and Shaohan Huang and Wenhui Wang and Zewen Chi and Saksham Singhal and Wanxiang Che and Ting Liu and Xia Song and Furu Wei},
84 | booktitle = {Proceedings of ACL 2021},
85 | title = {{Consistency Regularization for Cross-Lingual Fine-Tuning}},
86 | year = {2021}
87 | }
88 | ```
89 |
90 | ## Reference
91 |
92 | 1. https://github.com/google-research/xtreme
93 | 2.
https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN?_encoding=UTF8&%2AVersion%2A=1&%2Aentries%2A=0&mgh=1 94 | 3. https://console.cloud.google.com/storage/browser/xtreme_translations 95 | 4. https://drive.google.com/drive/folders/1Rdbc0Us_4I5MpRCwLASxBwqSW8_dlF87?usp=sharing 96 | 5. https://github.com/huggingface/transformers/ 97 | 6. https://github.com/facebookresearch/MUSE 98 | 7. https://drive.google.com/drive/folders/1k9rQinwUXicglA5oyzo9xtgqiuUVDkjT?usp=sharing 99 | -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_mlqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | 27 | cp -r $DATA_DIR/squad/ $DATA_DIR/mlqa/squad1.1/ 28 | 29 | TASK='mlqa' 30 | 31 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/ 32 | MODEL_PATH=$DATA_DIR/$MODEL 33 | 34 | EPOCH=4 35 | MAXL=384 36 | LANGS="en,es,de,ar,hi,vi,zh" 37 | BSR=0.3 38 | SA=0.3 39 | SNBS=-1 40 | CSR=0.3 41 | R1_LAMBDA=5.0 42 | R2_LAMBDA=5.0 43 | if [ $MODEL == "xlm-roberta-large" ]; then 44 | BATCH_SIZE=4 45 | GRAD_ACC=8 46 | LR=1.5e-5 47 | else 48 | BATCH_SIZE=32 49 | GRAD_ACC=1 50 | LR=3e-5 51 | fi 52 | 53 | if [ $STAGE == 1 ]; then 54 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 55 | python ./src/run_qa.py --model_type xlmr \ 56 | --task_name $TASK \ 57 | --model_name_or_path $MODEL_PATH \ 58 | --do_train \ 59 | --do_eval \ 60 | --language $LANGS \ 61 | --train_language en \ 62 | --data_dir $DATA_DIR/$TASK/ \ 63 | --per_gpu_train_batch_size $BATCH_SIZE \ 64 | --gradient_accumulation_steps $GRAD_ACC \ 65 | --per_gpu_eval_batch_size 128 \ 66 | --learning_rate $LR \ 67 | --num_train_epochs $EPOCH \ 68 | --save_steps 0 \ 69 | --logging_each_epoch \ 70 | --max_seq_length $MAXL \ 71 | --doc_stride 128 \ 72 | --output_dir $OUTPUT_DIR \ 73 | --overwrite_output_dir \ 74 | --evaluate_during_training \ 75 | --logging_steps 50 \ 76 | --evaluate_steps 0 \ 77 | --seed $SEED \ 78 | --fp16 --fp16_opt_level O2 \ 79 | --warmup_steps -1 \ 80 | --enable_r1_loss \ 81 | --r1_lambda $R1_LAMBDA \ 82 | --original_loss \ 83 | --overall_ratio 1.0 \ 84 | --keep_boundary_unchanged \ 85 | --enable_code_switch \ 86 | --code_switch_ratio $CSR \ 87 | --dict_dir $DATA_DIR/dicts \ 88 | --dict_languages es,de,ar,hi,vi,zh \ 89 | --noised_max_seq_length $MAXL 90 | elif [ $STAGE == 2 ]; then 91 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 92 | 
OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/" 93 | python ./src/run_qa.py --model_type xlmr \ 94 | --task_name $TASK \ 95 | --model_name_or_path $MODEL_PATH \ 96 | --do_train \ 97 | --do_eval \ 98 | --language $LANGS \ 99 | --train_language en \ 100 | --data_dir $DATA_DIR/$TASK/ \ 101 | --per_gpu_train_batch_size $BATCH_SIZE \ 102 | --gradient_accumulation_steps $GRAD_ACC \ 103 | --per_gpu_eval_batch_size 128 \ 104 | --learning_rate $LR \ 105 | --num_train_epochs $EPOCH \ 106 | --save_steps 0 \ 107 | --logging_each_epoch \ 108 | --max_seq_length $MAXL \ 109 | --doc_stride 128 \ 110 | --output_dir $OUTPUT_DIR \ 111 | --overwrite_output_dir \ 112 | --evaluate_during_training \ 113 | --logging_steps 50 \ 114 | --evaluate_steps 0 \ 115 | --seed $SEED \ 116 | --fp16 --fp16_opt_level O2 \ 117 | --warmup_steps -1 \ 118 | --enable_r1_loss \ 119 | --r1_lambda $R1_LAMBDA \ 120 | --original_loss \ 121 | --overall_ratio 1.0 \ 122 | --keep_boundary_unchanged \ 123 | --enable_bpe_sampling \ 124 | --bpe_sampling_ratio $BSR \ 125 | --sampling_alpha $SA \ 126 | --sampling_nbest_size $SNBS \ 127 | --noised_max_seq_length $MAXL \ 128 | --enable_data_augmentation \ 129 | --augment_ratio 1.0 \ 130 | --augment_method ss \ 131 | --max_steps 24000 \ 132 | --r2_lambda $R2_LAMBDA \ 133 | --first_stage_model_path $FIRST_MODEL_PATH 134 | fi 135 | 136 | 137 | -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_panx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='panx' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | EPOCH=10 29 | MAX_LENGTH=128 30 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu" 31 | EVALUATE_STEPS=1000 32 | BSR=0.3 33 | SA=0.3 34 | SNBS=-1 35 | R1_LAMBDA=5.0 36 | R2_LAMBDA=5.0 37 | if [ $MODEL == "xlm-roberta-large" ]; then 38 | BATCH_SIZE=32 39 | GRAD_ACC=1 40 | LR=7e-6 41 | else 42 | BATCH_SIZE=32 43 | GRAD_ACC=1 44 | LR=1e-5 45 | fi 46 | 47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.panx.txt 48 | 49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/ 50 | 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python src/run_tag.py --model_type xlmr \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --do_predict \ 59 | --do_predict_dev \ 60 | --predict_langs $LANGS \ 61 | --train_langs en \ 62 | --data_dir $DATA_DIR \ 63 | --labels $DATA_DIR/labels.txt \ 64 | --per_gpu_train_batch_size $BATCH_SIZE \ 65 | --gradient_accumulation_steps $GRAD_ACC \ 66 | --per_gpu_eval_batch_size 128 \ 67 | --learning_rate $LR \ 68 | --num_train_epochs $EPOCH \ 69 | --max_seq_length $MAX_LENGTH \ 70 | --noised_max_seq_length $MAX_LENGTH \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps $EVALUATE_STEPS \ 76 | --seed $SEED \ 77 | --warmup_steps -1 \ 78 | --save_only_best_checkpoint \ 79 | --eval_all_checkpoints \ 80 | --eval_patience -1 \ 81 | --fp16 --fp16_opt_level O2 \ 82 | --hidden_dropout_prob 0.1 \ 83 | --original_loss \ 84 | --enable_r1_loss \ 85 | --r1_lambda $R1_LAMBDA \ 86 | --use_token_label_probs \ 87 | --enable_bpe_sampling \ 88 | --bpe_sampling_ratio $BSR \ 89 | --sampling_alpha $SA \ 90 | --sampling_nbest_size $SNBS 91 | elif [ $STAGE == 2 ]; then 92 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 93 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/" 94 | python src/run_tag.py --model_type xlmr \ 95 | --model_name_or_path $MODEL_PATH \ 96 | --do_train \ 97 | --do_eval \ 98 | --do_predict \ 99 | --do_predict_dev \ 100 | --predict_langs $LANGS \ 101 | --train_langs en \ 102 | --data_dir $DATA_DIR \ 103 | --labels $DATA_DIR/labels.txt \ 104 | --per_gpu_train_batch_size $BATCH_SIZE \ 105 | --gradient_accumulation_steps $GRAD_ACC \ 106 | --per_gpu_eval_batch_size 128 \ 107 | --learning_rate $LR \ 108 | --num_train_epochs $EPOCH \ 109 | --max_seq_length $MAX_LENGTH \ 110 | --noised_max_seq_length $MAX_LENGTH \ 111 | --output_dir $OUTPUT_DIR \ 112 | --overwrite_output_dir \ 113 | --evaluate_during_training \ 114 | --logging_steps 50 \ 115 | --evaluate_steps $EVALUATE_STEPS \ 116 | --seed $SEED \ 117 | --warmup_steps -1 \ 118 | --save_only_best_checkpoint \ 119 | --eval_all_checkpoints \ 120 | --eval_patience -1 \ 121 | --fp16 --fp16_opt_level O2 \ 122 | --hidden_dropout_prob 0.1 \ 123 | --original_loss \ 124 | --enable_r1_loss \ 
125 | --r1_lambda $R1_LAMBDA \ 126 | --use_token_label_probs \ 127 | --enable_bpe_sampling \ 128 | --bpe_sampling_ratio $BSR \ 129 | --sampling_alpha $SA \ 130 | --sampling_nbest_size $SNBS \ 131 | --enable_data_augmentation \ 132 | --augment_ratio 1.0 \ 133 | --augment_method ss \ 134 | --r2_lambda $R2_LAMBDA \ 135 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 136 | --use_hard_labels 137 | fi -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_pawsx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='pawsx' 27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/PAWSX/ 28 | MODEL_PATH=$DATA_DIR/$MODEL 29 | EPOCH=10 30 | MAXL=256 31 | LANGS="de,en,es,fr,ja,ko,zh" 32 | EVALUATE_STEPS=1000 33 | CSR=0.5 34 | R1_LAMBDA=5.0 35 | R2_LAMBDA=2.0 36 | if [ $MODEL == "xlm-roberta-large" ]; then 37 | BATCH_SIZE=16 38 | GRAD_ACC=2 39 | LR=1e-5 40 | else 41 | BATCH_SIZE=32 42 | GRAD_ACC=1 43 | LR=1e-5 44 | fi 45 | 46 | if [ $STAGE == 1 ]; then 47 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 48 | mkdir -p $OUTPUT_DIR 49 | python ./src/run_cls.py --model_type xlmr \ 50 | --model_name_or_path $MODEL_PATH \ 51 | --language $LANGS \ 52 | --train_language en \ 53 | --do_train \ 54 | --data_dir $DATA_DIR/$TASK/ \ 55 | --per_gpu_train_batch_size $BATCH_SIZE \ 56 | --gradient_accumulation_steps $GRAD_ACC \ 57 | --per_gpu_eval_batch_size 64 \ 58 | --learning_rate $LR \ 59 | --num_train_epochs $EPOCH \ 60 | --max_seq_length $MAXL \ 61 | --output_dir $OUTPUT_DIR \ 62 | --task_name $TASK \ 63 | --save_steps -1 \ 64 | --overwrite_output_dir \ 65 | --evaluate_during_training \ 66 | --evaluate_steps $EVALUATE_STEPS \ 67 | --logging_steps 50 \ 68 | --logging_steps_in_sample -1 \ 69 | --logging_each_epoch \ 70 | --gpu_id 0 \ 71 | --seed $SEED \ 72 | --fp16 --fp16_opt_level O2 \ 73 | --warmup_steps -1 \ 74 | --enable_r1_loss \ 75 | --r1_lambda $R1_LAMBDA \ 76 | --original_loss \ 77 | --overall_ratio 1.0 \ 78 | --enable_code_switch \ 79 | --code_switch_ratio $CSR \ 80 | --dict_dir $DATA_DIR/dicts \ 81 | --dict_languages de,es,fr,ja,ko,zh 82 | elif [ $STAGE == 2 ]; then 83 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 84 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_Lambda${R1_LAMBDA}-Aug1.0-CS-R2_Lambda${R2_LAMBDA}/" 85 | mkdir -p $OUTPUT_DIR 86 | python ./src/run_cls.py --model_type xlmr \ 87 | --model_name_or_path $MODEL_PATH \ 88 
| --language $LANGS \ 89 | --train_language en \ 90 | --do_train \ 91 | --data_dir $DATA_DIR/$TASK/ \ 92 | --per_gpu_train_batch_size $BATCH_SIZE \ 93 | --gradient_accumulation_steps $GRAD_ACC \ 94 | --per_gpu_eval_batch_size 64 \ 95 | --learning_rate $LR \ 96 | --num_train_epochs $EPOCH \ 97 | --max_seq_length $MAXL \ 98 | --output_dir $OUTPUT_DIR \ 99 | --task_name $TASK \ 100 | --save_steps -1 \ 101 | --overwrite_output_dir \ 102 | --evaluate_during_training \ 103 | --evaluate_steps $EVALUATE_STEPS \ 104 | --logging_steps 50 \ 105 | --logging_steps_in_sample -1 \ 106 | --logging_each_epoch \ 107 | --gpu_id 0 \ 108 | --seed $SEED \ 109 | --fp16 --fp16_opt_level O2 \ 110 | --warmup_steps -1 \ 111 | --enable_r1_loss \ 112 | --r1_lambda $R1_LAMBDA \ 113 | --original_loss \ 114 | --overall_ratio 1.0 \ 115 | --enable_code_switch \ 116 | --code_switch_ratio $CSR \ 117 | --dict_dir $DATA_DIR/dicts \ 118 | --dict_languages de,es,fr,ja,ko,zh \ 119 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 120 | --enable_data_augmentation \ 121 | --augment_ratio 1.0 \ 122 | --augment_method cs \ 123 | --r2_lambda $R2_LAMBDA 124 | fi -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_tydiqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='tydiqa' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/TyDiQA-GoldP/translate-train/ 29 | 30 | 31 | MAXL=384 32 | LANGS="en,ar,bn,fi,id,ko,ru,sw,te" 33 | BSR=0.3 34 | SA=0.3 35 | SNBS=-1 36 | R1_LAMBDA=5.0 37 | R2_LAMBDA=5.0 38 | if [ $MODEL == "xlm-roberta-large" ]; then 39 | BATCH_SIZE=4 40 | GRAD_ACC=8 41 | LR=1.5e-5 42 | EPOCH=10 43 | MAX_STEPS=2500 44 | else 45 | BATCH_SIZE=32 46 | GRAD_ACC=1 47 | LR=3e-5 48 | EPOCH=20 49 | MAX_STEPS=5000 50 | fi 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python ./src/run_qa.py --model_type xlmr \ 55 | --task_name $TASK \ 56 | --model_name_or_path $MODEL_PATH \ 57 | --do_train \ 58 | --do_eval \ 59 | --language $LANGS \ 60 | --train_language en \ 61 | --data_dir $DATA_DIR/$TASK/ \ 62 | --per_gpu_train_batch_size $BATCH_SIZE \ 63 | --gradient_accumulation_steps $GRAD_ACC \ 64 | --per_gpu_eval_batch_size 128 \ 65 | --learning_rate $LR \ 66 | --num_train_epochs $EPOCH \ 67 | --save_steps 0 \ 68 | --logging_each_epoch \ 69 | --max_seq_length $MAXL \ 70 | --doc_stride 128 \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps 0 \ 76 | --seed $SEED \ 77 | --fp16 --fp16_opt_level O2 \ 78 | --warmup_steps -1 \ 79 | --enable_r1_loss \ 80 | --r1_lambda $R1_LAMBDA \ 81 | --original_loss \ 82 | --overall_ratio 1.0 \ 83 | --keep_boundary_unchanged \ 84 | --enable_bpe_sampling \ 85 | --bpe_sampling_ratio $BSR \ 86 | --sampling_alpha $SA \ 87 | --sampling_nbest_size $SNBS \ 88 | --noised_max_seq_length $MAXL 89 | elif [ $STAGE == 2 ]; then 90 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 91 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/" 92 | python ./src/run_qa.py --model_type xlmr \ 93 | --task_name $TASK \ 94 | --model_name_or_path $MODEL_PATH \ 95 | --do_train \ 96 | --do_eval \ 97 | --language $LANGS \ 98 | --train_language en \ 99 | --data_dir $DATA_DIR/$TASK/ \ 100 | --per_gpu_train_batch_size $BATCH_SIZE \ 101 | --gradient_accumulation_steps $GRAD_ACC \ 102 | --per_gpu_eval_batch_size 128 \ 103 | --learning_rate $LR \ 104 | --num_train_epochs $EPOCH \ 105 | --save_steps 0 \ 106 | --logging_each_epoch \ 107 | --max_seq_length $MAXL \ 108 | --doc_stride 128 \ 109 | --output_dir $OUTPUT_DIR \ 110 | --overwrite_output_dir \ 111 | --evaluate_during_training \ 112 | --logging_steps 50 \ 113 | --evaluate_steps 0 \ 114 | --seed $SEED \ 115 | --fp16 --fp16_opt_level O2 \ 116 | --warmup_steps -1 \ 117 | --enable_r1_loss \ 118 | --r1_lambda $R1_LAMBDA \ 119 | --original_loss \ 120 | --overall_ratio 1.0 \ 121 | --keep_boundary_unchanged \ 122 | --enable_bpe_sampling \ 123 | --bpe_sampling_ratio $BSR \ 124 | --sampling_alpha $SA \ 125 | --sampling_nbest_size $SNBS \ 126 | --noised_max_seq_length $MAXL \ 127 | --enable_data_augmentation \ 128 | --augment_ratio 1.0 \ 129 | --augment_method ss \ 130 | --max_steps $MAX_STEPS \ 131 | --r2_lambda $R2_LAMBDA \ 132 
| --first_stage_model_path $FIRST_STAGE_MODEL_PATH 133 | fi 134 | 135 | 136 | -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_udpos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='udpos' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | EPOCH=10 29 | MAX_LENGTH=128 30 | LANGS="af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh" 31 | EVALUATE_STEPS=500 32 | BSR=0.5 33 | SA=0.3 34 | SNBS=-1 35 | R1_LAMBDA=5.0 36 | R2_LAMBDA=0.3 37 | if [ $MODEL == "xlm-roberta-large" ]; then 38 | BATCH_SIZE=32 39 | GRAD_ACC=1 40 | LR=5e-6 41 | else 42 | BATCH_SIZE=32 43 | GRAD_ACC=1 44 | LR=2e-5 45 | fi 46 | 47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.udpos.txt 48 | 49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/ 50 | 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python src/run_tag.py --model_type xlmr \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --do_predict \ 59 | --do_predict_dev \ 60 | --predict_langs $LANGS \ 61 | --train_langs en \ 62 | --data_dir $DATA_DIR \ 63 | --labels $DATA_DIR/labels.txt \ 64 | --per_gpu_train_batch_size $BATCH_SIZE \ 65 | --gradient_accumulation_steps $GRAD_ACC \ 66 | --per_gpu_eval_batch_size 128 \ 67 | --learning_rate $LR \ 68 | --num_train_epochs $EPOCH \ 69 | --max_seq_length $MAX_LENGTH \ 70 | --noised_max_seq_length $MAX_LENGTH \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps $EVALUATE_STEPS \ 76 | --seed $SEED \ 77 | --warmup_steps -1 \ 78 | --save_only_best_checkpoint \ 79 | --eval_all_checkpoints \ 80 | --eval_patience -1 \ 81 | --fp16 --fp16_opt_level O2 \ 82 | --hidden_dropout_prob 0.1 \ 83 | --original_loss \ 84 | --use_pooling_strategy \ 85 | --enable_r1_loss \ 86 | --r1_lambda $R1_LAMBDA \ 87 | --use_token_label_probs \ 88 | --enable_bpe_sampling \ 89 | --bpe_sampling_ratio $BSR \ 90 | --sampling_alpha $SA \ 91 | --sampling_nbest_size $SNBS 92 | elif [ $STAGE == 2 ]; then 93 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 94 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/" 95 | python src/run_tag.py --model_type 
xlmr \ 96 | --model_name_or_path $MODEL_PATH \ 97 | --do_train \ 98 | --do_eval \ 99 | --do_predict \ 100 | --do_predict_dev \ 101 | --predict_langs $LANGS \ 102 | --train_langs en \ 103 | --data_dir $DATA_DIR \ 104 | --labels $DATA_DIR/labels.txt \ 105 | --per_gpu_train_batch_size $BATCH_SIZE \ 106 | --gradient_accumulation_steps $GRAD_ACC \ 107 | --per_gpu_eval_batch_size 128 \ 108 | --learning_rate $LR \ 109 | --num_train_epochs $EPOCH \ 110 | --max_seq_length $MAX_LENGTH \ 111 | --noised_max_seq_length $MAX_LENGTH \ 112 | --output_dir $OUTPUT_DIR \ 113 | --overwrite_output_dir \ 114 | --evaluate_during_training \ 115 | --logging_steps 50 \ 116 | --evaluate_steps $EVALUATE_STEPS \ 117 | --seed $SEED \ 118 | --warmup_steps -1 \ 119 | --save_only_best_checkpoint \ 120 | --eval_all_checkpoints \ 121 | --eval_patience -1 \ 122 | --fp16 --fp16_opt_level O2 \ 123 | --hidden_dropout_prob 0.1 \ 124 | --original_loss \ 125 | --use_pooling_strategy \ 126 | --enable_r1_loss \ 127 | --r1_lambda $R1_LAMBDA \ 128 | --use_token_label_probs \ 129 | --enable_bpe_sampling \ 130 | --bpe_sampling_ratio $BSR \ 131 | --sampling_alpha $SA \ 132 | --sampling_nbest_size $SNBS \ 133 | --enable_data_augmentation \ 134 | --augment_ratio 1.0 \ 135 | --augment_method ss \ 136 | --r2_lambda $R2_LAMBDA \ 137 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 138 | fi -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_xnli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='xnli' 27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/XNLI/ 28 | MODEL_PATH=$DATA_DIR/$MODEL 29 | EPOCH=10 30 | MAXL=256 31 | LANGS="ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh" 32 | EVALUATE_STEPS=5000 33 | CSR=0.3 34 | R1_LAMBDA=5.0 35 | R2_LAMBDA=5.0 36 | if [ $MODEL == "xlm-roberta-large" ]; then 37 | BATCH_SIZE=16 38 | GRAD_ACC=2 39 | LR=5e-6 40 | else 41 | BATCH_SIZE=32 42 | GRAD_ACC=1 43 | LR=7e-6 44 | fi 45 | 46 | if [ $STAGE == 1 ]; then 47 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 48 | mkdir -p $OUTPUT_DIR 49 | python ./src/run_cls.py --model_type xlmr \ 50 | --model_name_or_path $MODEL_PATH \ 51 | --language $LANGS \ 52 | --train_language en \ 53 | --do_train \ 54 | --data_dir $DATA_DIR/$TASK/ \ 55 | --per_gpu_train_batch_size $BATCH_SIZE \ 56 | --gradient_accumulation_steps $GRAD_ACC \ 57 | --per_gpu_eval_batch_size 64 \ 58 | --learning_rate $LR \ 59 | --num_train_epochs $EPOCH \ 60 | --max_seq_length $MAXL \ 61 | --output_dir $OUTPUT_DIR \ 62 | --task_name $TASK \ 63 | --save_steps -1 \ 64 | --overwrite_output_dir \ 65 | --evaluate_during_training \ 66 | --evaluate_steps $EVALUATE_STEPS \ 67 | --logging_steps 50 \ 68 | --logging_steps_in_sample -1 \ 69 | --logging_each_epoch \ 70 | --gpu_id 0 \ 71 | --seed $SEED \ 72 | --fp16 --fp16_opt_level O2 \ 73 | --warmup_steps -1 \ 74 | --enable_r1_loss \ 75 | --r1_lambda $R1_LAMBDA \ 76 | --original_loss \ 77 | --overall_ratio 1.0 \ 78 | --enable_code_switch \ 79 | --code_switch_ratio $CSR \ 80 | --dict_dir $DATA_DIR/dicts \ 81 | --dict_languages ar,bg,de,el,es,fr,hi,ru,sw,th,tr,ur,vi,zh 82 | elif [ $STAGE == 2 ]; then 83 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 84 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_Lambda${R1_LAMBDA}-Aug1.0-CS-R2_Lambda${R2_LAMBDA}/" 85 | mkdir -p $OUTPUT_DIR 86 | python ./src/run_cls.py --model_type xlmr \ 87 | --model_name_or_path $MODEL_PATH \ 88 | --language $LANGS \ 89 | --train_language en \ 90 | --do_train \ 91 | --data_dir $DATA_DIR/$TASK/ \ 92 | --per_gpu_train_batch_size $BATCH_SIZE \ 93 | --gradient_accumulation_steps $GRAD_ACC \ 94 | --per_gpu_eval_batch_size 64 \ 95 | --learning_rate $LR \ 96 | --num_train_epochs $EPOCH \ 97 | --max_seq_length $MAXL \ 98 | --output_dir $OUTPUT_DIR \ 99 | --task_name $TASK \ 100 | --save_steps -1 \ 101 | --overwrite_output_dir \ 102 | --evaluate_during_training \ 103 | --evaluate_steps $EVALUATE_STEPS \ 104 | --logging_steps 50 \ 105 | --logging_steps_in_sample -1 \ 106 | --logging_each_epoch \ 107 | --gpu_id 0 \ 108 | --seed $SEED \ 109 | --fp16 --fp16_opt_level O2 \ 110 | --warmup_steps -1 \ 111 | --enable_r1_loss \ 112 | --r1_lambda $R1_LAMBDA \ 113 | --original_loss \ 114 | --overall_ratio 1.0 \ 115 | --enable_code_switch \ 116 | --code_switch_ratio $CSR \ 117 | --dict_dir $DATA_DIR/dicts \ 118 | --dict_languages ar,bg,de,el,es,fr,hi,ru,sw,th,tr,ur,vi,zh \ 119 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 120 | --enable_data_augmentation \ 121 | --augment_ratio 1.0 \ 122 | --augment_method cs \ 123 | --r2_lambda $R2_LAMBDA 124 | fi 
-------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_xquad.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | cp -r $DATA_DIR/squad/ $DATA_DIR/xquad/squad1.1/ 27 | 28 | TASK='xquad' 29 | MODEL_PATH=$DATA_DIR/$MODEL 30 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/ 31 | 32 | EPOCH=4 33 | MAXL=384 34 | LANGS="ar,de,el,en,es,hi,ru,th,tr,vi,zh" 35 | BSR=0.3 36 | SA=0.3 37 | SNBS=-1 38 | CSR=0.3 39 | R1_LAMBDA=5.0 40 | R2_LAMBDA=5.0 41 | if [ $MODEL == "xlm-roberta-large" ]; then 42 | BATCH_SIZE=4 43 | GRAD_ACC=8 44 | LR=1.5e-5 45 | else 46 | BATCH_SIZE=32 47 | GRAD_ACC=1 48 | LR=3e-5 49 | fi 50 | 51 | if [ $STAGE == 1 ]; then 52 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 53 | python ./src/run_qa.py --model_type xlmr \ 54 | --task_name $TASK \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --language $LANGS \ 59 | --train_language en \ 60 | --data_dir $DATA_DIR/$TASK/ \ 61 | --per_gpu_train_batch_size $BATCH_SIZE \ 62 | --gradient_accumulation_steps $GRAD_ACC \ 63 | --per_gpu_eval_batch_size 128 \ 64 | --learning_rate $LR \ 65 | --num_train_epochs $EPOCH \ 66 | --save_steps 0 \ 67 | --logging_each_epoch \ 68 | --max_seq_length $MAXL \ 69 | --doc_stride 128 \ 70 | --output_dir $OUTPUT_DIR \ 71 | --overwrite_output_dir \ 72 | --evaluate_during_training \ 73 | --logging_steps 50 \ 74 | --evaluate_steps 0 \ 75 | --seed $SEED \ 76 | --fp16 --fp16_opt_level O2 \ 77 | --warmup_steps -1 \ 78 | --enable_r1_loss \ 79 | --r1_lambda $R1_LAMBDA \ 80 | --original_loss \ 81 | --overall_ratio 1.0 \ 82 | --keep_boundary_unchanged \ 83 | --enable_code_switch \ 84 | --code_switch_ratio $CSR \ 85 | --dict_dir $DATA_DIR/dicts \ 86 | --dict_languages ar,de,el,es,hi,ru,th,tr,vi,zh \ 87 | --noised_max_seq_length $MAXL 88 | elif [ $STAGE == 2 ]; then 89 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 90 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/" 91 | python ./src/run_qa.py --model_type xlmr \ 92 | --task_name $TASK \ 93 | --model_name_or_path $MODEL_PATH \ 94 | --do_train \ 95 | --do_eval \ 96 | --language $LANGS \ 97 | --train_language en \ 98 | --data_dir $DATA_DIR/$TASK/ \ 99 | --per_gpu_train_batch_size $BATCH_SIZE \ 100 | --gradient_accumulation_steps $GRAD_ACC \ 101 | --per_gpu_eval_batch_size 128 \ 102 | 
--learning_rate $LR \ 103 | --num_train_epochs $EPOCH \ 104 | --save_steps 0 \ 105 | --logging_each_epoch \ 106 | --max_seq_length $MAXL \ 107 | --doc_stride 128 \ 108 | --output_dir $OUTPUT_DIR \ 109 | --overwrite_output_dir \ 110 | --evaluate_during_training \ 111 | --logging_steps 50 \ 112 | --evaluate_steps 0 \ 113 | --seed $SEED \ 114 | --fp16 --fp16_opt_level O2 \ 115 | --warmup_steps -1 \ 116 | --enable_r1_loss \ 117 | --r1_lambda $R1_LAMBDA \ 118 | --original_loss \ 119 | --overall_ratio 1.0 \ 120 | --keep_boundary_unchanged \ 121 | --enable_bpe_sampling \ 122 | --bpe_sampling_ratio $BSR \ 123 | --sampling_alpha $SA \ 124 | --sampling_nbest_size $SNBS \ 125 | --noised_max_seq_length $MAXL \ 126 | --enable_data_augmentation \ 127 | --augment_ratio 1.0 \ 128 | --augment_method ss \ 129 | --max_steps 24000 \ 130 | --r2_lambda $R2_LAMBDA \ 131 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 132 | fi 133 | 134 | 135 | -------------------------------------------------------------------------------- /scripts/download_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | DIR=$REPO/download/ 18 | mkdir -p $DIR 19 | 20 | # download xlm-roberta-base 21 | function download_xlm-roberta-base { 22 | mkdir -p $DIR/xlm-roberta-base/ 23 | cd $DIR/xlm-roberta-base/ 24 | wget https://huggingface.co/xlm-roberta-base/resolve/main/pytorch_model.bin -q --show-progress 25 | wget https://huggingface.co/xlm-roberta-base/resolve/main/config.json -q --show-progress 26 | wget https://huggingface.co/xlm-roberta-base/resolve/main/sentencepiece.bpe.model -q --show-progress 27 | wget https://huggingface.co/xlm-roberta-base/resolve/main/tokenizer.json -q --show-progress 28 | echo "Successfully downloaded xlm-roberta-base at $DIR/xlm-roberta-base" >> $DIR/download_model.log 29 | } 30 | 31 | # download xlm-roberta-large 32 | function download_xlm-roberta-large { 33 | mkdir -p $DIR/xlm-roberta-large/ 34 | cd $DIR/xlm-roberta-large/ 35 | wget https://huggingface.co/xlm-roberta-large/resolve/main/pytorch_model.bin -q --show-progress 36 | wget https://huggingface.co/xlm-roberta-large/resolve/main/config.json -q --show-progress 37 | wget https://huggingface.co/xlm-roberta-large/resolve/main/sentencepiece.bpe.model -q --show-progress 38 | wget https://huggingface.co/xlm-roberta-large/resolve/main/tokenizer.json -q --show-progress 39 | echo "Successfully downloaded xlm-roberta-large at $DIR/xlm-roberta-large" >> $DIR/download_model.log 40 | } 41 | 42 | download_xlm-roberta-base 43 | download_xlm-roberta-large 44 | -------------------------------------------------------------------------------- /scripts/preprocess_panx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-bert-base-multilingual-cased} 18 | DATA_DIR=${2:-"$REPO/download/"} 19 | 20 | TASK='panx' 21 | MAXL=128 22 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu" 23 | LC="" 24 | if [ $MODEL == "bert-base-multilingual-cased" ]; then 25 | MODEL_TYPE="bert" 26 | elif [ $MODEL == "xlm-mlm-100-1280" ] || [ $MODEL == "xlm-mlm-tlm-xnli15-1024" ]; then 27 | MODEL_TYPE="xlm" 28 | LC=" --do_lower_case" 29 | elif [ $MODEL == "xlm-roberta-large" ] || [ $MODEL == "xlm-roberta-base" ]; then 30 | MODEL_TYPE="xlmr" 31 | fi 32 | SAVE_DIR="$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAXL}" 33 | mkdir -p $SAVE_DIR 34 | python3 $REPO/utils_preprocess.py \ 35 | --data_dir $DATA_DIR/$TASK/ \ 36 | --task panx_tokenize \ 37 | --model_name_or_path $MODEL \ 38 | --model_type $MODEL_TYPE \ 39 | --max_len $MAXL \ 40 | --output_dir $SAVE_DIR \ 41 | --languages $LANGS $LC >> $SAVE_DIR/preprocess.log 42 | if [ ! -f $SAVE_DIR/labels.txt ]; then 43 | cat $SAVE_DIR/*/*.${MODEL} | cut -f 2 | grep -v "^$" | sort | uniq > $SAVE_DIR/labels.txt 44 | fi 45 | -------------------------------------------------------------------------------- /scripts/preprocess_udpos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | REPO=$PWD 17 | MODEL=${1:-bert-base-multilingual-cased} 18 | DATA_DIR=${2:-"$REPO/download/"} 19 | 20 | TASK='udpos' 21 | MAXL=128 22 | LANGS='af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh' 23 | LC="" 24 | if [ $MODEL == "bert-base-multilingual-cased" ]; then 25 | MODEL_TYPE="bert" 26 | elif [ $MODEL == "xlm-mlm-100-1280" ] || [ $MODEL == "xlm-mlm-tlm-xnli15-1024" ]; then 27 | MODEL_TYPE="xlm" 28 | LC=" --do_lower_case" 29 | elif [ $MODEL == "xlm-roberta-large" ] || [ $MODEL == "xlm-roberta-base" ]; then 30 | MODEL_TYPE="xlmr" 31 | fi 32 | 33 | SAVE_DIR="$DATA_DIR/${TASK}/udpos_processed_maxlen${MAXL}" 34 | mkdir -p $SAVE_DIR 35 | python3 $REPO/utils_preprocess.py \ 36 | --data_dir $DATA_DIR/${TASK}/ \ 37 | --task udpos_tokenize \ 38 | --model_name_or_path $MODEL \ 39 | --model_type $MODEL_TYPE \ 40 | --max_len $MAXL \ 41 | --output_dir $SAVE_DIR \ 42 | --languages $LANGS $LC >> $SAVE_DIR/process.log 43 | if [ ! -f $SAVE_DIR/labels.txt ]; then 44 | echo "create label" 45 | cat $SAVE_DIR/*/*.${MODEL} | cut -f 2 | grep -v "^$" | sort | uniq > $SAVE_DIR/labels.txt 46 | fi 47 | -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | SETTING=${1:-cross-lingual-transfer} 18 | TASK=${2:-xnli} 19 | MODEL=${3:-"xlm-roberta-base"} 20 | STAGE=${4:-1} 21 | GPU=${5:-0} 22 | DATA_DIR=${6:-"$REPO/download/"} 23 | OUT_DIR=${7:-"$REPO/outputs/"} 24 | SEED=${8:-1} 25 | 26 | echo "Fine-tuning $MODEL on $TASK using GPU $GPU in STAGE $STAGE with SETTING $SETTING" 27 | echo "Load data from $DATA_DIR, and save models to $OUT_DIR" 28 | 29 | if [ $TASK == "udpos" ]; then 30 | bash $REPO/scripts/preprocess_udpos.sh $MODEL $DATA_DIR 31 | elif [ $TASK == "panx" ]; then 32 | bash $REPO/scripts/preprocess_panx.sh $MODEL $DATA_DIR 33 | fi 34 | 35 | bash $REPO/scripts/$SETTING/train_${TASK}.sh $MODEL $STAGE $GPU $DATA_DIR $OUT_DIR $SEED -------------------------------------------------------------------------------- /scripts/translate-train-all/train_mlqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | 27 | cp -r $DATA_DIR/squad/ $DATA_DIR/mlqa/squad1.1/ 28 | 29 | TASK='mlqa' 30 | 31 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/ 32 | MODEL_PATH=$DATA_DIR/$MODEL 33 | 34 | EPOCH=4 35 | MAXL=384 36 | LANGS="en,es,de,ar,hi,vi,zh" 37 | BSR=0.3 38 | SA=0.3 39 | SNBS=-1 40 | CSR=0.3 41 | R1_LAMBDA=5.0 42 | R2_LAMBDA=0.5 43 | if [ $MODEL == "xlm-roberta-large" ]; then 44 | BATCH_SIZE=4 45 | GRAD_ACC=8 46 | LR=1.5e-5 47 | else 48 | BATCH_SIZE=32 49 | GRAD_ACC=1 50 | LR=3e-5 51 | fi 52 | 53 | if [ $STAGE == 1 ]; then 54 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 55 | python ./src/run_qa.py --model_type xlmr \ 56 | --task_name $TASK \ 57 | --model_name_or_path $MODEL_PATH \ 58 | --do_train \ 59 | --do_eval \ 60 | --language $LANGS \ 61 | --train_language en \ 62 | --data_dir $DATA_DIR/$TASK/ \ 63 | --per_gpu_train_batch_size $BATCH_SIZE \ 64 | --gradient_accumulation_steps $GRAD_ACC \ 65 | --per_gpu_eval_batch_size 128 \ 66 | --learning_rate $LR \ 67 | --num_train_epochs $EPOCH \ 68 | --save_steps 0 \ 69 | --logging_each_epoch \ 70 | --max_seq_length $MAXL \ 71 | --doc_stride 128 \ 72 | --output_dir $OUTPUT_DIR \ 73 | --overwrite_output_dir \ 74 | --evaluate_during_training \ 75 | --logging_steps 50 \ 76 | --evaluate_steps 0 \ 77 | --seed $SEED \ 78 | --fp16 --fp16_opt_level O2 \ 79 | --warmup_steps -1 \ 80 | --enable_r1_loss \ 81 | --r1_lambda $R1_LAMBDA \ 82 | --original_loss \ 83 | --overall_ratio 1.0 \ 84 | --keep_boundary_unchanged \ 85 | --enable_code_switch \ 86 | --code_switch_ratio $CSR \ 87 | --dict_dir $DATA_DIR/dicts \ 88 | --dict_languages es,de,ar,hi,vi,zh \ 89 | --noised_max_seq_length $MAXL 90 | elif [ $STAGE == 2 ]; then 91 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 92 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 93 | python ./src/run_qa.py --model_type xlmr \ 94 | --task_name $TASK \ 95 | --model_name_or_path $MODEL_PATH \ 96 | --do_train \ 97 | --do_eval \ 98 | --language $LANGS \ 99 | --train_language en \ 100 | --data_dir $DATA_DIR/$TASK/ \ 101 | --per_gpu_train_batch_size $BATCH_SIZE \ 102 | --gradient_accumulation_steps $GRAD_ACC \ 103 | --per_gpu_eval_batch_size 128 \ 104 | --learning_rate $LR \ 105 | --num_train_epochs $EPOCH \ 106 | --save_steps 0 \ 107 | --logging_each_epoch \ 108 | --max_seq_length $MAXL \ 109 | --doc_stride 128 \ 110 | --output_dir $OUTPUT_DIR \ 111 | --overwrite_output_dir \ 112 | --evaluate_during_training \ 113 | --logging_steps 50 \ 114 | --evaluate_steps 0 \ 115 | --seed $SEED \ 116 | --fp16 --fp16_opt_level O2 \ 117 | --warmup_steps -1 \ 118 | --enable_r1_loss \ 119 | --r1_lambda $R1_LAMBDA \ 120 | --original_loss \ 121 | --overall_ratio 1.0 \ 122 | --keep_boundary_unchanged \ 123 | --enable_bpe_sampling \ 124 | --bpe_sampling_ratio $BSR \ 125 | --sampling_alpha $SA \ 126 | --sampling_nbest_size $SNBS \ 127 | --noised_max_seq_length $MAXL \ 128 | --enable_data_augmentation \ 129 | --augment_ratio 1.0 \ 130 
| --augment_method mt \ 131 | --translation_path $TRANSLATION_PATH \ 132 | --max_steps 24000 \ 133 | --r2_lambda $R2_LAMBDA \ 134 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 135 | fi 136 | 137 | 138 | -------------------------------------------------------------------------------- /scripts/translate-train-all/train_panx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='panx' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | EPOCH=10 29 | MAX_LENGTH=128 30 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu" 31 | EVALUATE_STEPS=1000 32 | BSR=0.3 33 | SA=0.3 34 | SNBS=-1 35 | R1_LAMBDA=5.0 36 | R2_LAMBDA=1.0 37 | if [ $MODEL == "xlm-roberta-large" ]; then 38 | BATCH_SIZE=32 39 | GRAD_ACC=1 40 | LR=7e-6 41 | else 42 | BATCH_SIZE=32 43 | GRAD_ACC=1 44 | LR=1e-5 45 | fi 46 | 47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.panx.txt 48 | 49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/ 50 | 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python src/run_tag.py --model_type xlmr \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --do_predict \ 59 | --do_predict_dev \ 60 | --predict_langs $LANGS \ 61 | --train_langs en \ 62 | --data_dir $DATA_DIR \ 63 | --labels $DATA_DIR/labels.txt \ 64 | --per_gpu_train_batch_size $BATCH_SIZE \ 65 | --gradient_accumulation_steps $GRAD_ACC \ 66 | --per_gpu_eval_batch_size 128 \ 67 | --learning_rate $LR \ 68 | --num_train_epochs $EPOCH \ 69 | --max_seq_length $MAX_LENGTH \ 70 | --noised_max_seq_length $MAX_LENGTH \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps $EVALUATE_STEPS \ 76 | --seed $SEED \ 77 | --warmup_steps -1 \ 78 | --save_only_best_checkpoint \ 79 | --eval_all_checkpoints \ 80 | --eval_patience -1 \ 81 | --fp16 --fp16_opt_level O2 \ 82 | --hidden_dropout_prob 0.1 \ 83 | --original_loss \ 84 | --enable_r1_loss \ 85 | --r1_lambda $R1_LAMBDA \ 86 | --use_token_label_probs \ 87 | --enable_bpe_sampling \ 88 | --bpe_sampling_ratio $BSR \ 89 | --sampling_alpha $SA \ 90 | --sampling_nbest_size $SNBS 91 | elif [ $STAGE == 2 ]; then 92 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 93 | 
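# Stage 2 starts again from the base model but passes the best stage-1
# checkpoint via --first_stage_model_path, and adds machine-translated
# training data (--enable_data_augmentation --augment_method mt) together
# with the R2 regularization term weighted by R2_LAMBDA (--r2_lambda).
# Note: this script defines MAX_LENGTH rather than MAXL, so the MaxLen${MAXL}
# part of the run-name paths expands to an empty string; stage 1 and stage 2
# use the same pattern, so the checkpoint path above still resolves.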
OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 94 | python src/run_tag.py --model_type xlmr \ 95 | --model_name_or_path $MODEL_PATH \ 96 | --do_train \ 97 | --do_eval \ 98 | --do_predict \ 99 | --do_predict_dev \ 100 | --predict_langs $LANGS \ 101 | --train_langs en \ 102 | --data_dir $DATA_DIR \ 103 | --labels $DATA_DIR/labels.txt \ 104 | --per_gpu_train_batch_size $BATCH_SIZE \ 105 | --gradient_accumulation_steps $GRAD_ACC \ 106 | --per_gpu_eval_batch_size 128 \ 107 | --learning_rate $LR \ 108 | --num_train_epochs $EPOCH \ 109 | --max_seq_length $MAX_LENGTH \ 110 | --noised_max_seq_length $MAX_LENGTH \ 111 | --output_dir $OUTPUT_DIR \ 112 | --overwrite_output_dir \ 113 | --evaluate_during_training \ 114 | --logging_steps 50 \ 115 | --evaluate_steps $EVALUATE_STEPS \ 116 | --seed $SEED \ 117 | --warmup_steps -1 \ 118 | --save_only_best_checkpoint \ 119 | --eval_all_checkpoints \ 120 | --eval_patience -1 \ 121 | --fp16 --fp16_opt_level O2 \ 122 | --hidden_dropout_prob 0.1 \ 123 | --original_loss \ 124 | --enable_r1_loss \ 125 | --r1_lambda $R1_LAMBDA \ 126 | --use_token_label_probs \ 127 | --enable_bpe_sampling \ 128 | --bpe_sampling_ratio $BSR \ 129 | --sampling_alpha $SA \ 130 | --sampling_nbest_size $SNBS \ 131 | --enable_data_augmentation \ 132 | --augment_ratio 1.0 \ 133 | --augment_method mt \ 134 | --translation_path $TRANSLATION_PATH \ 135 | --r2_lambda $R2_LAMBDA \ 136 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 137 | --use_hard_labels 138 | fi -------------------------------------------------------------------------------- /scripts/translate-train-all/train_pawsx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
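# Two-stage xTune fine-tuning for PAWS-X in the translate-train-all setting.
# Stage 1 trains with the R1 loss (--enable_r1_loss, weight R1_LAMBDA) over
# translated data (--enable_translate_data); stage 2 points
# --first_stage_model_path at the stage-1 checkpoint-best directory and adds
# machine-translation data augmentation with the R2 term (weight R2_LAMBDA).
# Arguments are positional with the defaults shown below, e.g.:
#   bash scripts/translate-train-all/train_pawsx.sh xlm-roberta-base 1 0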
15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='pawsx' 27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/PAWSX/ 28 | MODEL_PATH=$DATA_DIR/$MODEL 29 | EPOCH=10 30 | MAXL=256 31 | LANGS="de,en,es,fr,ja,ko,zh" 32 | EVALUATE_STEPS=1000 33 | R1_LAMBDA=5.0 34 | R2_LAMBDA=1.0 35 | if [ $MODEL == "xlm-roberta-large" ]; then 36 | BATCH_SIZE=16 37 | GRAD_ACC=2 38 | LR=1e-5 39 | else 40 | BATCH_SIZE=32 41 | GRAD_ACC=1 42 | LR=1e-5 43 | fi 44 | 45 | if [ $STAGE == 1 ]; then 46 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/" 47 | mkdir -p $OUTPUT_DIR 48 | python ./src/run_cls.py --model_type xlmr \ 49 | --model_name_or_path $MODEL_PATH \ 50 | --language $LANGS \ 51 | --train_language en \ 52 | --do_train \ 53 | --data_dir $DATA_DIR/$TASK/ \ 54 | --per_gpu_train_batch_size $BATCH_SIZE \ 55 | --gradient_accumulation_steps $GRAD_ACC \ 56 | --per_gpu_eval_batch_size 64 \ 57 | --learning_rate $LR \ 58 | --num_train_epochs $EPOCH \ 59 | --max_seq_length $MAXL \ 60 | --output_dir $OUTPUT_DIR \ 61 | --task_name $TASK \ 62 | --save_steps -1 \ 63 | --overwrite_output_dir \ 64 | --evaluate_during_training \ 65 | --evaluate_steps $EVALUATE_STEPS \ 66 | --logging_steps 50 \ 67 | --logging_steps_in_sample -1 \ 68 | --logging_each_epoch \ 69 | --gpu_id 0 \ 70 | --seed $SEED \ 71 | --fp16 --fp16_opt_level O2 \ 72 | --warmup_steps -1 \ 73 | --enable_r1_loss \ 74 | --r1_lambda $R1_LAMBDA \ 75 | --original_loss \ 76 | --enable_translate_data \ 77 | --translation_path $TRANSLATION_PATH 78 | elif [ $STAGE == 2 ]; then 79 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 80 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 81 | mkdir -p $OUTPUT_DIR 82 | python ./src/run_cls.py --model_type xlmr \ 83 | --model_name_or_path $MODEL_PATH \ 84 | --language $LANGS \ 85 | --train_language en \ 86 | --do_train \ 87 | --data_dir $DATA_DIR/$TASK/ \ 88 | --per_gpu_train_batch_size $BATCH_SIZE \ 89 | --gradient_accumulation_steps $GRAD_ACC \ 90 | --per_gpu_eval_batch_size 64 \ 91 | --learning_rate $LR \ 92 | --num_train_epochs $EPOCH \ 93 | --max_seq_length $MAXL \ 94 | --output_dir $OUTPUT_DIR \ 95 | --task_name $TASK \ 96 | --save_steps -1 \ 97 | --overwrite_output_dir \ 98 | --evaluate_during_training \ 99 | --evaluate_steps $EVALUATE_STEPS \ 100 | --logging_steps 50 \ 101 | --logging_steps_in_sample -1 \ 102 | --logging_each_epoch \ 103 | --gpu_id 0 \ 104 | --seed $SEED \ 105 | --fp16 --fp16_opt_level O2 \ 106 | --warmup_steps -1 \ 107 | --enable_r1_loss \ 108 | --r1_lambda $R1_LAMBDA \ 109 | --original_loss \ 110 | --enable_translate_data \ 111 | --translation_path $TRANSLATION_PATH \ 112 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 113 | --enable_data_augmentation \ 114 | --augment_ratio 1.0 \ 115 | --augment_method mt \ 116 | --r2_lambda $R2_LAMBDA 117 | fi 118 | 119 | 120 | -------------------------------------------------------------------------------- /scripts/translate-train-all/train_tydiqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='tydiqa' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/TyDiQA-GoldP/translate-train/ 29 | 30 | 31 | MAXL=384 32 | LANGS="en,ar,bn,fi,id,ko,ru,sw,te" 33 | BSR=0.3 34 | SA=0.3 35 | SNBS=-1 36 | R1_LAMBDA=5.0 37 | R2_LAMBDA=0.3 38 | if [ $MODEL == "xlm-roberta-large" ]; then 39 | BATCH_SIZE=4 40 | GRAD_ACC=8 41 | LR=1.5e-5 42 | EPOCH=10 43 | MAX_STEPS=2500 44 | else 45 | BATCH_SIZE=32 46 | GRAD_ACC=1 47 | LR=3e-5 48 | EPOCH=20 49 | MAX_STEPS=5000 50 | fi 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python ./src/run_qa.py --model_type xlmr \ 55 | --task_name $TASK \ 56 | --model_name_or_path $MODEL_PATH \ 57 | --do_train \ 58 | --do_eval \ 59 | --language $LANGS \ 60 | --train_language en \ 61 | --data_dir $DATA_DIR/$TASK/ \ 62 | --per_gpu_train_batch_size $BATCH_SIZE \ 63 | --gradient_accumulation_steps $GRAD_ACC \ 64 | --per_gpu_eval_batch_size 128 \ 65 | --learning_rate $LR \ 66 | --num_train_epochs $EPOCH \ 67 | --save_steps 0 \ 68 | --logging_each_epoch \ 69 | --max_seq_length $MAXL \ 70 | --doc_stride 128 \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps 0 \ 76 | --seed $SEED \ 77 | --fp16 --fp16_opt_level O2 \ 78 | --warmup_steps -1 \ 79 | --enable_r1_loss \ 80 | --r1_lambda $R1_LAMBDA \ 81 | --original_loss \ 82 | --overall_ratio 1.0 \ 83 | --keep_boundary_unchanged \ 84 | --enable_bpe_sampling \ 85 | --bpe_sampling_ratio $BSR \ 86 | --sampling_alpha $SA \ 87 | --sampling_nbest_size $SNBS \ 88 | --noised_max_seq_length $MAXL 89 | elif [ $STAGE == 2 ]; then 90 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 91 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 92 | python ./src/run_qa.py --model_type xlmr \ 93 | --task_name $TASK \ 94 | --model_name_or_path $MODEL_PATH \ 95 | --do_train \ 96 | --do_eval \ 97 | --language $LANGS \ 98 | --train_language en \ 99 | --data_dir $DATA_DIR/$TASK/ \ 100 | --per_gpu_train_batch_size $BATCH_SIZE \ 101 | --gradient_accumulation_steps $GRAD_ACC \ 102 | --per_gpu_eval_batch_size 128 \ 103 | --learning_rate $LR \ 104 | --num_train_epochs $EPOCH \ 105 | --save_steps 0 \ 106 | --logging_each_epoch \ 107 | --max_seq_length $MAXL \ 108 | --doc_stride 128 \ 109 | --output_dir $OUTPUT_DIR \ 110 | --overwrite_output_dir \ 111 | --evaluate_during_training \ 112 | 
--logging_steps 50 \ 113 | --evaluate_steps 0 \ 114 | --seed $SEED \ 115 | --fp16 --fp16_opt_level O2 \ 116 | --warmup_steps -1 \ 117 | --enable_r1_loss \ 118 | --r1_lambda $R1_LAMBDA \ 119 | --original_loss \ 120 | --overall_ratio 1.0 \ 121 | --keep_boundary_unchanged \ 122 | --enable_bpe_sampling \ 123 | --bpe_sampling_ratio $BSR \ 124 | --sampling_alpha $SA \ 125 | --sampling_nbest_size $SNBS \ 126 | --noised_max_seq_length $MAXL \ 127 | --enable_data_augmentation \ 128 | --augment_ratio 1.0 \ 129 | --augment_method mt \ 130 | --translation_path $TRANSLATION_PATH \ 131 | --max_steps $MAX_STEPS \ 132 | --r2_lambda $R2_LAMBDA \ 133 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 134 | fi 135 | 136 | 137 | -------------------------------------------------------------------------------- /scripts/translate-train-all/train_udpos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='udpos' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | EPOCH=10 29 | MAX_LENGTH=128 30 | LANGS="af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh" 31 | EVALUATE_STEPS=500 32 | BSR=0.5 33 | SA=0.3 34 | SNBS=-1 35 | R1_LAMBDA=5.0 36 | R2_LAMBDA=0.3 37 | if [ $MODEL == "xlm-roberta-large" ]; then 38 | BATCH_SIZE=32 39 | GRAD_ACC=1 40 | LR=5e-6 41 | else 42 | BATCH_SIZE=32 43 | GRAD_ACC=1 44 | LR=2e-5 45 | fi 46 | 47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.udpos.txt 48 | 49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/ 50 | 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python src/run_tag.py --model_type xlmr \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --do_predict \ 59 | --do_predict_dev \ 60 | --predict_langs $LANGS \ 61 | --train_langs en \ 62 | --data_dir $DATA_DIR \ 63 | --labels $DATA_DIR/labels.txt \ 64 | --per_gpu_train_batch_size $BATCH_SIZE \ 65 | --gradient_accumulation_steps $GRAD_ACC \ 66 | --per_gpu_eval_batch_size 128 \ 67 | --learning_rate $LR \ 68 | --num_train_epochs $EPOCH \ 69 | --max_seq_length $MAX_LENGTH \ 70 | --noised_max_seq_length $MAX_LENGTH \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps $EVALUATE_STEPS \ 76 | --seed $SEED \ 77 | --warmup_steps -1 \ 78 | --save_only_best_checkpoint \ 79 | --eval_all_checkpoints \ 80 | --eval_patience -1 \ 81 | --fp16 --fp16_opt_level O2 \ 82 | --hidden_dropout_prob 0.1 \ 83 | --original_loss \ 84 | 
--use_pooling_strategy \ 85 | --enable_r1_loss \ 86 | --r1_lambda $R1_LAMBDA \ 87 | --use_token_label_probs \ 88 | --enable_bpe_sampling \ 89 | --bpe_sampling_ratio $BSR \ 90 | --sampling_alpha $SA \ 91 | --sampling_nbest_size $SNBS 92 | elif [ $STAGE == 2 ]; then 93 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 94 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 95 | python src/run_tag.py --model_type xlmr \ 96 | --model_name_or_path $MODEL_PATH \ 97 | --do_train \ 98 | --do_eval \ 99 | --do_predict \ 100 | --do_predict_dev \ 101 | --predict_langs $LANGS \ 102 | --train_langs en \ 103 | --data_dir $DATA_DIR \ 104 | --labels $DATA_DIR/labels.txt \ 105 | --per_gpu_train_batch_size $BATCH_SIZE \ 106 | --gradient_accumulation_steps $GRAD_ACC \ 107 | --per_gpu_eval_batch_size 128 \ 108 | --learning_rate $LR \ 109 | --num_train_epochs $EPOCH \ 110 | --max_seq_length $MAX_LENGTH \ 111 | --noised_max_seq_length $MAX_LENGTH \ 112 | --output_dir $OUTPUT_DIR \ 113 | --overwrite_output_dir \ 114 | --evaluate_during_training \ 115 | --logging_steps 50 \ 116 | --evaluate_steps $EVALUATE_STEPS \ 117 | --seed $SEED \ 118 | --warmup_steps -1 \ 119 | --save_only_best_checkpoint \ 120 | --eval_all_checkpoints \ 121 | --eval_patience -1 \ 122 | --fp16 --fp16_opt_level O2 \ 123 | --hidden_dropout_prob 0.1 \ 124 | --original_loss \ 125 | --use_pooling_strategy \ 126 | --enable_r1_loss \ 127 | --r1_lambda $R1_LAMBDA \ 128 | --use_token_label_probs \ 129 | --enable_bpe_sampling \ 130 | --bpe_sampling_ratio $BSR \ 131 | --sampling_alpha $SA \ 132 | --sampling_nbest_size $SNBS \ 133 | --enable_data_augmentation \ 134 | --augment_ratio 1.0 \ 135 | --augment_method mt \ 136 | --translation_path $TRANSLATION_PATH \ 137 | --r2_lambda $R2_LAMBDA \ 138 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 139 | fi -------------------------------------------------------------------------------- /scripts/translate-train-all/train_xnli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
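# Two-stage xTune fine-tuning for XNLI in the translate-train-all setting.
# Stage 1 combines the original loss with the R1 loss (weight R1_LAMBDA) over
# translated pairs (--enable_translate_data); stage 2 reuses that setup,
# points --first_stage_model_path at the stage-1 checkpoint-best directory,
# and enables machine-translation data augmentation with the R2 term
# (weight R2_LAMBDA). Arguments are positional, e.g.:
#   bash scripts/translate-train-all/train_xnli.sh xlm-roberta-base 2 0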
15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='xnli' 27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/XNLI/ 28 | MODEL_PATH=$DATA_DIR/$MODEL 29 | EPOCH=10 30 | MAXL=256 31 | LANGS="ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh" 32 | EVALUATE_STEPS=5000 33 | R1_LAMBDA=5.0 34 | R2_LAMBDA=1.0 35 | if [ $MODEL == "xlm-roberta-large" ]; then 36 | BATCH_SIZE=16 37 | GRAD_ACC=2 38 | LR=5e-6 39 | else 40 | BATCH_SIZE=32 41 | GRAD_ACC=1 42 | LR=7e-6 43 | fi 44 | 45 | if [ $STAGE == 1 ]; then 46 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/" 47 | mkdir -p $OUTPUT_DIR 48 | python ./src/run_cls.py --model_type xlmr \ 49 | --model_name_or_path $MODEL_PATH \ 50 | --language $LANGS \ 51 | --train_language en \ 52 | --do_train \ 53 | --data_dir $DATA_DIR/$TASK/ \ 54 | --per_gpu_train_batch_size $BATCH_SIZE \ 55 | --gradient_accumulation_steps $GRAD_ACC \ 56 | --per_gpu_eval_batch_size 64 \ 57 | --learning_rate $LR \ 58 | --num_train_epochs $EPOCH \ 59 | --max_seq_length $MAXL \ 60 | --output_dir $OUTPUT_DIR \ 61 | --task_name $TASK \ 62 | --save_steps -1 \ 63 | --overwrite_output_dir \ 64 | --evaluate_during_training \ 65 | --evaluate_steps $EVALUATE_STEPS \ 66 | --logging_steps 50 \ 67 | --logging_steps_in_sample -1 \ 68 | --logging_each_epoch \ 69 | --gpu_id 0 \ 70 | --seed $SEED \ 71 | --fp16 --fp16_opt_level O2 \ 72 | --warmup_steps -1 \ 73 | --enable_r1_loss \ 74 | --r1_lambda $R1_LAMBDA \ 75 | --original_loss \ 76 | --enable_translate_data \ 77 | --translation_path $TRANSLATION_PATH 78 | elif [ $STAGE == 2 ]; then 79 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 80 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 81 | mkdir -p $OUTPUT_DIR 82 | python ./src/run_cls.py --model_type xlmr \ 83 | --model_name_or_path $MODEL_PATH \ 84 | --language $LANGS \ 85 | --train_language en \ 86 | --do_train \ 87 | --data_dir $DATA_DIR/$TASK/ \ 88 | --per_gpu_train_batch_size $BATCH_SIZE \ 89 | --gradient_accumulation_steps $GRAD_ACC \ 90 | --per_gpu_eval_batch_size 64 \ 91 | --learning_rate $LR \ 92 | --num_train_epochs $EPOCH \ 93 | --max_seq_length $MAXL \ 94 | --output_dir $OUTPUT_DIR \ 95 | --task_name $TASK \ 96 | --save_steps -1 \ 97 | --overwrite_output_dir \ 98 | --evaluate_during_training \ 99 | --evaluate_steps $EVALUATE_STEPS \ 100 | --logging_steps 50 \ 101 | --logging_steps_in_sample -1 \ 102 | --logging_each_epoch \ 103 | --gpu_id 0 \ 104 | --seed $SEED \ 105 | --fp16 --fp16_opt_level O2 \ 106 | --warmup_steps -1 \ 107 | --enable_r1_loss \ 108 | --r1_lambda $R1_LAMBDA \ 109 | --original_loss \ 110 | --enable_translate_data \ 111 | --translation_path $TRANSLATION_PATH \ 112 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 113 | --enable_data_augmentation \ 114 | --augment_ratio 1.0 \ 115 | --augment_method mt \ 116 | --r2_lambda $R2_LAMBDA 117 | fi -------------------------------------------------------------------------------- /scripts/translate-train-all/train_xquad.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | cp -r $DATA_DIR/squad/ $DATA_DIR/xquad/squad1.1/ 27 | 28 | TASK='xquad' 29 | MODEL_PATH=$DATA_DIR/$MODEL 30 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/ 31 | 32 | EPOCH=4 33 | MAXL=384 34 | LANGS="ar,de,el,en,es,hi,ru,th,tr,vi,zh" 35 | BSR=0.3 36 | SA=0.3 37 | SNBS=-1 38 | CSR=0.3 39 | R1_LAMBDA=5.0 40 | R2_LAMBDA=0.1 41 | if [ $MODEL == "xlm-roberta-large" ]; then 42 | BATCH_SIZE=4 43 | GRAD_ACC=8 44 | LR=1.5e-5 45 | else 46 | BATCH_SIZE=32 47 | GRAD_ACC=1 48 | LR=3e-5 49 | fi 50 | 51 | if [ $STAGE == 1 ]; then 52 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 53 | python ./src/run_qa.py --model_type xlmr \ 54 | --task_name $TASK \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --language $LANGS \ 59 | --train_language en \ 60 | --data_dir $DATA_DIR/$TASK/ \ 61 | --per_gpu_train_batch_size $BATCH_SIZE \ 62 | --gradient_accumulation_steps $GRAD_ACC \ 63 | --per_gpu_eval_batch_size 128 \ 64 | --learning_rate $LR \ 65 | --num_train_epochs $EPOCH \ 66 | --save_steps 0 \ 67 | --logging_each_epoch \ 68 | --max_seq_length $MAXL \ 69 | --doc_stride 128 \ 70 | --output_dir $OUTPUT_DIR \ 71 | --overwrite_output_dir \ 72 | --evaluate_during_training \ 73 | --logging_steps 50 \ 74 | --evaluate_steps 0 \ 75 | --seed $SEED \ 76 | --fp16 --fp16_opt_level O2 \ 77 | --warmup_steps -1 \ 78 | --enable_r1_loss \ 79 | --r1_lambda $R1_LAMBDA \ 80 | --original_loss \ 81 | --overall_ratio 1.0 \ 82 | --keep_boundary_unchanged \ 83 | --enable_code_switch \ 84 | --code_switch_ratio $CSR \ 85 | --dict_dir $DATA_DIR/dicts \ 86 | --dict_languages ar,de,el,es,hi,ru,th,tr,vi,zh \ 87 | --noised_max_seq_length $MAXL 88 | elif [ $STAGE == 2 ]; then 89 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 90 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 91 | python ./src/run_qa.py --model_type xlmr \ 92 | --task_name $TASK \ 93 | --model_name_or_path $MODEL_PATH \ 94 | --do_train \ 95 | --do_eval \ 96 | --language $LANGS \ 97 | --train_language en \ 98 | --data_dir $DATA_DIR/$TASK/ \ 99 | --per_gpu_train_batch_size $BATCH_SIZE \ 100 | --gradient_accumulation_steps $GRAD_ACC \ 101 | --per_gpu_eval_batch_size 128 \ 102 | --learning_rate $LR \ 103 | --num_train_epochs $EPOCH \ 104 | --save_steps 0 \ 105 | --logging_each_epoch \ 106 | --max_seq_length $MAXL \ 107 | --doc_stride 128 \ 108 | --output_dir $OUTPUT_DIR \ 109 | --overwrite_output_dir \ 110 | --evaluate_during_training \ 111 | 
--logging_steps 50 \ 112 | --evaluate_steps 0 \ 113 | --seed $SEED \ 114 | --fp16 --fp16_opt_level O2 \ 115 | --warmup_steps -1 \ 116 | --enable_r1_loss \ 117 | --r1_lambda $R1_LAMBDA \ 118 | --original_loss \ 119 | --overall_ratio 1.0 \ 120 | --keep_boundary_unchanged \ 121 | --enable_bpe_sampling \ 122 | --bpe_sampling_ratio $BSR \ 123 | --sampling_alpha $SA \ 124 | --sampling_nbest_size $SNBS \ 125 | --noised_max_seq_length $MAXL \ 126 | --enable_data_augmentation \ 127 | --augment_ratio 1.0 \ 128 | --augment_method mt \ 129 | --translation_path $TRANSLATION_PATH \ 130 | --max_steps 24000 \ 131 | --r2_lambda $R2_LAMBDA \ 132 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 133 | fi 134 | 135 | 136 | -------------------------------------------------------------------------------- /src/pequod/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/__init__.py -------------------------------------------------------------------------------- /src/pequod/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from transformers.data.processors.utils import InputFeatures 3 | 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def convert_examples_to_features( 9 | processor, examples, tokenizer, max_length, label_list, 10 | pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True): 11 | 12 | if label_list is None: label_list = processor.get_labels() 13 | 14 | label_map = {label: i for i, label in enumerate(label_list)} 15 | 16 | features = [] 17 | for ex_index, example in enumerate(examples): 18 | if ex_index % 10000 == 0: 19 | logger.info("Writing example %d" % ex_index) 20 | inputs = tokenizer.encode_plus( 21 | example.text_a, 22 | example.text_b, 23 | add_special_tokens=True, 24 | max_length=max_length) 25 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] 26 | 27 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 28 | 29 | padding_length = max_length - len(input_ids) 30 | input_ids = input_ids + ([pad_token] * padding_length) 31 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 32 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) 33 | 34 | assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) 35 | assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) 36 | assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) 37 | 38 | label = label_map[example.label] 39 | if ex_index < 3: 40 | logger.info("*** Example ***") 41 | logger.info("guid: %s" % (example.guid)) 42 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 43 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) 44 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) 45 | logger.info("label: %s (id = %d)" % 
(example.label, label)) 46 | 47 | features.append(InputFeatures( 48 | input_ids=input_ids, 49 | attention_mask=attention_mask, 50 | token_type_ids=token_type_ids, 51 | label=label)) 52 | 53 | return features -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/sampler.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/sampler.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/utils_squad.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/utils_squad.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/utils_squad_evaluate.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/utils_squad_evaluate.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/xdoc.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/xdoc.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/xqa.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/xqa.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/xretrieval.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/xretrieval.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/dataloader.py -------------------------------------------------------------------------------- /src/pequod/data/sampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.utils.data.sampler import Sampler 4 | 5 | 6 | class SubSampler(Sampler): 7 | 8 | def __init__(self, data_source, num_samples): 9 | self.data_source = data_source 10 | self.num_samples = num_samples 11 | 12 | def __len__(self): 13 | return self.num_samples 14 | 
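    # __iter__ below draws num_samples indices from data_source: a prefix of a
    # random permutation (no replacement) when num_samples <= len(data_source),
    # otherwise uniform indices drawn with replacement via torch.randint.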
15 | def __iter__(self): 16 | n = len(self.data_source) 17 | if self.num_samples <= n: 18 | return iter(torch.randperm(n).tolist()[:self.num_samples]) 19 | return iter(torch.randint(high=n, size=(self.num_samples,), dtype=torch.int64).tolist()) -------------------------------------------------------------------------------- /src/pequod/data/wili.py: -------------------------------------------------------------------------------- 1 | """Loading examples and features for WiLI-2018 dataset""" 2 | 3 | import logging 4 | import os 5 | import torch 6 | 7 | from transformers.data.processors.utils import (DataProcessor, 8 | InputExample, InputFeatures) 9 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 10 | TensorDataset) 11 | from src.data import convert_examples_to_features 12 | from src.io import lines_gen 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | _alias2lang = {} 19 | _lang2id = {} 20 | _langs = [] 21 | 22 | def get_alias2lang(data_dir): 23 | if len(_alias2lang) > 0: return _alias2lang, _lang2id, _langs 24 | for line, in lines_gen(os.path.join(data_dir, "labels-new")): 25 | value = None 26 | for alias in line.split(";"): 27 | alias = alias.strip() 28 | if alias == "": continue 29 | if value is None: value = alias 30 | _alias2lang[alias] = value 31 | _langs.append(value) 32 | for i, lang in enumerate(_langs): _lang2id[lang] = i 33 | return _alias2lang, _lang2id, _langs 34 | 35 | 36 | def load_and_cache_examples(args, data_dir, split, run_lang2id, tokenizer, key=""): 37 | cache_filename = os.path.join( 38 | data_dir, "cached_%s_%s" % (split, key)) 39 | 40 | if os.path.exists(cache_filename) and not args.overwrite_cache: 41 | logger.info("Loading features from cached file %s" % cache_filename) 42 | features = torch.load(cache_filename) 43 | else: 44 | processor = WiliProcessor() 45 | logger.info("Creating features from dataset file at %s" % data_dir) 46 | label_list = processor.get_labels(data_dir) 47 | examples = processor.get_examples(data_dir, split) 48 | logger.info("%d Examples loaded" % len(examples)) 49 | features = convert_examples_to_features( 50 | processor, examples, tokenizer, max_length=args.max_seq_length, 51 | label_list=label_list, pad_token_segment_id=0, 52 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]) 53 | logger.info("Saving features to cache file %s" % cache_filename) 54 | torch.save(features, cache_filename) 55 | 56 | # Cut dataset to test langs 57 | alias2lang, lang2id, _ = get_alias2lang(data_dir) 58 | test_lang_ids = {lang2id[alias2lang[lang]] for lang in run_lang2id.keys()} 59 | wili_id2run_langid = { 60 | lang2id[alias2lang[lang]]:val for lang, val in run_lang2id.items()} 61 | 62 | all_input_ids, all_attention_mask = [], [] 63 | all_token_type_ids, all_labels = [], [] 64 | for f in features: 65 | if f.label not in test_lang_ids: continue 66 | all_input_ids.append(f.input_ids) 67 | all_attention_mask.append(f.attention_mask) 68 | all_token_type_ids.append(f.token_type_ids) 69 | all_labels.append(wili_id2run_langid[f.label]) 70 | 71 | all_input_ids = torch.tensor(all_input_ids, dtype=torch.long) 72 | all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long) 73 | all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long) 74 | all_labels = torch.tensor(all_labels, dtype=torch.long) 75 | 76 | dataset = TensorDataset( 77 | all_input_ids, all_attention_mask, all_token_type_ids, all_labels) 78 | 79 | return dataset 80 | 81 | 82 | class WiliProcessor(DataProcessor): 83 | 84 | 
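    # Reads the parallel WiLI split files x_<split>.txt and y_<split>.txt:
    # each line of x_* is a text sample and the matching line of y_* is its
    # language label; get_examples wraps each pair in an InputExample.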
def get_examples(self, data_dir, split): 85 | examples = [] 86 | filename_x = os.path.join(data_dir, "x_%s.txt" % split) 87 | filename_y = os.path.join(data_dir, "y_%s.txt" % split) 88 | for i, (line_x, line_y) in enumerate(lines_gen(filename_x, filename_y)): 89 | guid = "%s-%s" % (split, i) 90 | examples.append( 91 | InputExample(guid=guid, text_a=line_x, text_b=None, label=line_y)) 92 | return examples 93 | 94 | def get_labels(self, data_dir): 95 | _, _, langs = get_alias2lang(data_dir) 96 | return langs 97 | -------------------------------------------------------------------------------- /src/pequod/data/xqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import torch 4 | 5 | from torch.utils.data import TensorDataset 6 | from src.pequod.data.utils_squad import (read_squad_examples, 7 | convert_examples_to_features) 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def load_and_cache_examples(args, split, lang, tokenizer, key="", evaluate=False): 14 | cache_filename = os.path.join( 15 | args.data_dir, "cached_%s_%s_%s" % (split, lang, key)) 16 | 17 | input_file = os.path.join(args.data_dir, "%s-%s.json" % (split, lang)) 18 | if os.path.exists(cache_filename): 19 | logger.info("Loading features from cached file %s", cache_filename) 20 | features = torch.load(cache_filename) 21 | if evaluate: 22 | examples = read_squad_examples(input_file=input_file, 23 | is_training=not evaluate, 24 | version_2_with_negative=args.version_2_with_negative) 25 | else: examples = None 26 | else: 27 | logger.info("Creating features from dataset file at %s", input_file) 28 | examples = read_squad_examples(input_file=input_file, 29 | is_training=not evaluate, 30 | version_2_with_negative=args.version_2_with_negative) 31 | features = convert_examples_to_features(examples=examples, 32 | tokenizer=tokenizer, max_seq_length=args.max_seq_length, 33 | doc_stride=args.doc_stride, max_query_length=args.max_query_length, 34 | is_training=not evaluate, cls_token=tokenizer.cls_token, 35 | sep_token=tokenizer.sep_token) 36 | logger.info("Saving features into cached file %s", cache_filename) 37 | torch.save(features, cache_filename) 38 | 39 | # Convert to Tensors and build dataset 40 | all_input_ids = torch.tensor( 41 | [f.input_ids for f in features], dtype=torch.long) 42 | all_input_mask = torch.tensor( 43 | [f.input_mask for f in features], dtype=torch.long) 44 | all_segment_ids = torch.tensor( 45 | [f.segment_ids for f in features], dtype=torch.long) 46 | all_cls_index = torch.tensor( 47 | [f.cls_index for f in features], dtype=torch.long) 48 | all_p_mask = torch.tensor( 49 | [f.p_mask for f in features], dtype=torch.float) 50 | if evaluate: 51 | all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) 52 | dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, 53 | all_example_index, all_cls_index, all_p_mask) 54 | else: 55 | all_start_positions = torch.tensor( 56 | [f.start_position for f in features], dtype=torch.long) 57 | all_end_positions = torch.tensor( 58 | [f.end_position for f in features], dtype=torch.long) 59 | dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, 60 | all_start_positions, all_end_positions, all_cls_index, all_p_mask) 61 | 62 | return dataset, examples, features 63 | -------------------------------------------------------------------------------- /src/pequod/data/xretrieval.py: 
-------------------------------------------------------------------------------- 1 | """Load examples from BUCC""" 2 | 3 | 4 | import logging 5 | import os 6 | import torch 7 | 8 | 9 | from transformers.data.processors.utils import ( 10 | DataProcessor, InputExample, InputFeatures) 11 | from torch.utils.data import ( 12 | DataLoader, RandomSampler, SequentialSampler, TensorDataset) 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def load_and_cache_examples(args, langpair, lang, tokenizer, key="", prefix="tatoeba"): 19 | 20 | cache_dir = os.path.join(args.data_dir, "pequod_cache") 21 | os.makedirs(cache_dir, exist_ok=True) 22 | cache_filename = os.path.join( 23 | cache_dir, "cached_%s_%s_%s" % (langpair, lang, key)) 24 | 25 | if os.path.exists(cache_filename) and not args.overwrite_cache: 26 | logger.info("Loading features from cached file %s" % cache_filename) 27 | features = torch.load(cache_filename) 28 | else: 29 | processer = TatoebaProcesser() 30 | logger.info("Creating features from dataset file at %s" % args.data_dir) 31 | examples = processer.get_examples(args.data_dir, langpair, lang, prefix) 32 | features = TatoebaProcesser.convert_examples_to_features( 33 | examples, tokenizer, args.max_seq_length, 0, 34 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],) 35 | #logger.info("Saving features to cache file %s" % cache_filename) 36 | #torch.save(features, cache_filename) 37 | 38 | all_input_ids = torch.tensor( 39 | [f.input_ids for f in features], dtype=torch.long) 40 | all_attention_mask = torch.tensor( 41 | [f.attention_mask for f in features], dtype=torch.long) 42 | all_token_type_ids = torch.tensor( 43 | [f.token_type_ids for f in features], dtype=torch.long) 44 | 45 | dataset = TensorDataset( 46 | all_input_ids, all_attention_mask, all_token_type_ids) 47 | 48 | return dataset 49 | 50 | class TatoebaProcesser(DataProcessor): 51 | 52 | @classmethod 53 | def convert_examples_to_features(cls, examples, tokenizer, max_length, pad_token_segment_id, pad_token, mask_padding_with_zero=True): 54 | 55 | features = [] 56 | for ex_index, example in enumerate(examples): 57 | inputs = tokenizer.encode_plus( 58 | example.text_a, 59 | None, 60 | add_special_tokens=True, 61 | max_length=max_length, 62 | ) 63 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] 64 | 65 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 66 | 67 | padding_length = max_length - len(input_ids) 68 | input_ids = input_ids + ([pad_token] * padding_length) 69 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 70 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) 71 | 72 | assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) 73 | assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) 74 | assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) 75 | 76 | if ex_index < 3: 77 | logger.info("*** Example ***") 78 | logger.info("guid: %s" % (example.guid)) 79 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 80 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) 81 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) 82 | 83 | features.append(InputFeatures( 84 | input_ids=input_ids, 85 | attention_mask=attention_mask, 86 | 
token_type_ids=token_type_ids, 87 | label=None, 88 | )) 89 | 90 | return features 91 | 92 | def get_examples(self, data_dir, langpair, lang, prefix="tatoeba"): 93 | examples = [] 94 | if prefix == "bucc": 95 | fn = os.path.join(data_dir, "%s.%s.txt" % (langpair, lang)) 96 | else: 97 | fn = os.path.join(data_dir, "%s.%s" % (langpair, lang)) 98 | #fn = os.path.join(data_dir, "%s.%s.%s" % (prefix, langpair, lang)) 99 | with open(fn, encoding='utf-8') as fp: 100 | for i, line in enumerate(fp): 101 | line = line.strip() 102 | examples.append(InputExample( 103 | guid="%s-%s-%d" % (langpair, lang, i), 104 | text_a=line, 105 | )) 106 | return examples 107 | -------------------------------------------------------------------------------- /src/pequod/eval/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import inspect 5 | 6 | 7 | from src.pequod.data.utils_squad import RawResult, write_predictions 8 | from src.pequod.data.utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad 9 | 10 | 11 | def to_list(tensor): 12 | return tensor.detach().cpu().tolist() 13 | 14 | 15 | def score_dict_to_string(score_dict): 16 | return " ".join([("%s:%.2f" % (k, v)) for k, v in score_dict.items()]) 17 | 18 | 19 | def score_dicts_to_latex(score_dicts): 20 | keys = [k for k in score_dicts[0]] 21 | return "\n".join([""] + [( 22 | " & ".join([key] + [("%.2f" % (sd[key])) for sd in score_dicts]) 23 | ) for key in keys]) 24 | 25 | 26 | def eval_classification(model, batch_dict_iter): 27 | model.eval() 28 | preds, labels = None, None 29 | for batch_dict in batch_dict_iter: 30 | label_id = batch_dict["labels"].detach().cpu().numpy() 31 | batch_dict.pop("labels") 32 | with torch.no_grad(): logits = model(**batch_dict)[0] 33 | pred = logits.detach().cpu().numpy() 34 | if preds is None: preds, labels = pred, label_id 35 | else: 36 | preds = np.append(preds, pred, axis=0) 37 | labels = np.append(labels, label_id) 38 | preds = np.argmax(preds, axis=1) 39 | result = (preds == labels).mean() 40 | return {"acc": result*100.0} 41 | 42 | 43 | def eval_qa(model, batch_dict_iter, prefix="", **kwargs): 44 | 45 | features = kwargs["all_features"] 46 | output_dir = kwargs["output_dir"] 47 | 48 | model.eval() 49 | all_results = [] 50 | for batch_dict, example_indices in batch_dict_iter: 51 | with torch.no_grad(): outputs = model(**batch_dict) 52 | 53 | for i, example_index in enumerate(example_indices): 54 | eval_feature = features[example_index.item()] 55 | unique_id = int(eval_feature.unique_id) 56 | result = RawResult(unique_id = unique_id, 57 | start_logits = to_list(outputs[0][i]), 58 | end_logits = to_list(outputs[1][i])) 59 | all_results.append(result) 60 | 61 | output_prediction_file = os.path.join( 62 | output_dir, "predictions_{}.json".format(prefix)) 63 | output_nbest_file = os.path.join( 64 | output_dir, "nbest_predictions_{}.json".format(prefix)) 65 | if kwargs["version_2_with_negative"]: 66 | output_null_log_odds_file = os.path.join( 67 | output_dir, "null_odds_{}.json".format(prefix)) 68 | else: output_null_log_odds_file = None 69 | 70 | wrt_pred_kwargs = { 71 | "all_results": all_results, 72 | "output_prediction_file": output_prediction_file, 73 | "output_nbest_file": output_nbest_file, 74 | "output_null_log_odds_file": output_null_log_odds_file} 75 | 76 | for key in inspect.getfullargspec(write_predictions).args: 77 | if key not in wrt_pred_kwargs: 78 | wrt_pred_kwargs[key] = kwargs[key] 79 | 80 | 
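    # Remaining write_predictions arguments are filled by name from kwargs via
    # the getfullargspec introspection above; the resulting prediction file is
    # then scored with the official SQuAD evaluation script (EVAL_OPTS below).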
write_predictions(**wrt_pred_kwargs) 81 | 82 | # Evaluate with the official SQuAD script 83 | evaluate_options = EVAL_OPTS( 84 | data_file=kwargs["predict_file"], 85 | pred_file=output_prediction_file, 86 | na_prob_file=output_null_log_odds_file, 87 | out_file="/dev/null") 88 | results = evaluate_on_squad(evaluate_options) 89 | return results 90 | -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/bretrieval.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/bretrieval.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/bucc_eval.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/bucc_eval.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/evaluator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/evaluator.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/utils_retrieve.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/utils_retrieve.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/xretrieval.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/xretrieval.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | 4 | from torch.utils.data import DataLoader 5 | from src.pequod.training.trainer import to_cuda 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class Evaluator(object): 12 | 13 | def __init__(self, args, model, tokenizer, **kwargs): 14 | self.args = args 15 | self.datasets = {} 16 | self.model = model 17 | self.tokenizer = tokenizer 18 | 19 | def _parse_batch(self, batch, has_label=True, **kwargs): 20 | _batch = to_cuda(batch) 21 | # _batch = batch 22 | ret = {"input_ids": _batch[0], 23 | "attention_mask": _batch[1], 24 | "token_type_ids": _batch[2] if self.args.model_type == "bert" else None,} 25 | if has_label: ret["labels"] = _batch[3] 26 | ret.update(**kwargs) 27 | return ret 28 | 29 | def run(self): 30 | raise NotImplementedError 31 | 32 | def 
get_dataset(self, *args, **kwargs): 33 | if args in self.datasets: return self.datasets[args] 34 | dataset = self.load_and_cache_examples(*args, **kwargs) 35 | self.datasets[args] = dataset 36 | return dataset 37 | 38 | def load_and_cache_examples(self, *args, **kwargs): 39 | raise NotImplementedError 40 | 41 | def get_dataloader(self, *args, **kwargs): 42 | logger.info("Getting dataloader - args: %s" % str(args)) 43 | dataset = kwargs.pop("dataset", self.get_dataset(*args, **kwargs)) 44 | dataloader = DataLoader(dataset, batch_size=self.args.eval_batch_size) 45 | return dataloader 46 | -------------------------------------------------------------------------------- /src/pequod/io.py: -------------------------------------------------------------------------------- 1 | """I/O""" 2 | 3 | def _lines_gen_from_single_file(filename): 4 | with open(filename) as fp: 5 | for line in fp: yield line.strip() 6 | 7 | 8 | def lines_gen(*filenames): 9 | for ret in zip(*map(_lines_gen_from_single_file, filenames)): yield ret -------------------------------------------------------------------------------- /src/pequod/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/model/__init__.py -------------------------------------------------------------------------------- /src/pequod/model/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/model/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/model/__pycache__/roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/model/__pycache__/roberta.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/model/roberta.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch import nn 4 | from torch.nn import CrossEntropyLoss 5 | from transformers.modeling_bert import BertPreTrainedModel, BertForQuestionAnswering 6 | from transformers.modeling_roberta import RobertaModel 7 | 8 | 9 | class RobertaForQuestionAnswering(BertPreTrainedModel): 10 | 11 | base_model_prefix = "roberta" 12 | def __init__(self, config): 13 | BertPreTrainedModel.__init__(self, config) 14 | self.num_labels = config.num_labels 15 | self.roberta = RobertaModel(config) 16 | self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) 17 | BertPreTrainedModel.init_weights(self) 18 | 19 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, start_positions=None, end_positions=None, **kwargs): 20 | 21 | outputs = self.roberta(input_ids, 22 | attention_mask=attention_mask, 23 | token_type_ids=token_type_ids, 24 | position_ids=position_ids, 25 | head_mask=head_mask, 26 | **kwargs) 27 | 28 | sequence_output = outputs[0] 29 | 30 | logits = self.qa_outputs(sequence_output) 31 | start_logits, end_logits = logits.split(1, dim=-1) 32 | start_logits = start_logits.squeeze(-1) 33 | end_logits = end_logits.squeeze(-1) 34 | 35 | outputs = (start_logits, end_logits,) + outputs[2:] 36 | if 
start_positions is not None and end_positions is not None: 37 | # If we are on multi-GPU, split add a dimension 38 | if len(start_positions.size()) > 1: 39 | start_positions = start_positions.squeeze(-1) 40 | if len(end_positions.size()) > 1: 41 | end_positions = end_positions.squeeze(-1) 42 | # sometimes the start/end positions are outside our model inputs, we ignore these terms 43 | ignored_index = start_logits.size(1) 44 | start_positions.clamp_(0, ignored_index) 45 | end_positions.clamp_(0, ignored_index) 46 | 47 | loss_fct = CrossEntropyLoss(ignore_index=ignored_index) 48 | start_loss = loss_fct(start_logits, start_positions) 49 | end_loss = loss_fct(end_logits, end_positions) 50 | total_loss = (start_loss + end_loss) / 2 51 | outputs = (total_loss,) + outputs 52 | 53 | return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) -------------------------------------------------------------------------------- /src/pequod/optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/optim/__init__.py -------------------------------------------------------------------------------- /src/pequod/optim/la.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import torch 4 | from torch.optim.optimizer import Optimizer 5 | 6 | 7 | class LookaheadWrapper(Optimizer): 8 | r"""Implements a Lookahead wrapper around a given optimizer 9 | """ 10 | 11 | def __init__(self, optimizer, la_steps, la_alpha=0.5): 12 | self.optimizer = optimizer 13 | self._la_step = 0 # counter for inner optimizer 14 | self.la_alpha = la_alpha 15 | self._total_la_steps = la_steps 16 | 17 | self.state = defaultdict(dict) 18 | 19 | # Cache the current optimizer parameters 20 | for group in optimizer.param_groups: 21 | for p in group['params']: 22 | param_state = self.state[p] 23 | param_state['cached_params'] = torch.zeros_like(p.data) 24 | param_state['cached_params'].copy_(p.data) 25 | 26 | def __getstate__(self): 27 | return self.optimizer.__getstate__() 28 | 29 | def __setstate__(self, state): 30 | self.optimizer.__setstate__(state) 31 | 32 | def zero_grad(self): 33 | self.optimizer.zero_grad() 34 | 35 | def state_dict(self): 36 | return self.optimizer.state_dict() 37 | 38 | def load_state_dict(self, state_dict): 39 | self.optimizer.load_state_dict(state_dict) 40 | 41 | @property 42 | def param_groups(self): 43 | return self.optimizer.param_groups 44 | 45 | def step(self, closure=None): 46 | """Performs a single Lookahead optimization step. 47 | Arguments: 48 | closure (callable, optional): A closure that reevaluates the model 49 | and returns the loss. 
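        Every la_steps inner steps, the parameters are reset to the
        interpolation la_alpha * fast + (1 - la_alpha) * cached between the
        current (fast) weights and the cached (slow) weights, and the cache is
        refreshed with the interpolated values.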
50 | """ 51 | loss = self.optimizer.step(closure) 52 | self._la_step += 1 53 | 54 | if self._la_step >= self._total_la_steps: 55 | self._la_step = 0 56 | # Lookahead and cache the current optimizer parameters 57 | for group in self.optimizer.param_groups: 58 | for p in group['params']: 59 | param_state = self.state[p] 60 | p.data.mul_(self.la_alpha).add_(1 - self.la_alpha, param_state['cached_params']) 61 | param_state['cached_params'].copy_(p.data) 62 | return loss 63 | -------------------------------------------------------------------------------- /src/pequod/optim/la0.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import torch 4 | from torch.optim.optimizer import Optimizer 5 | 6 | 7 | class Lookahead0Wrapper(Optimizer): 8 | r"""Implements a Lookahead wrapper around a given optimizer 9 | """ 10 | 11 | def __init__(self, optimizer, la_steps, la_alpha=0.5): 12 | self.optimizer = optimizer 13 | self._la_step = 0 # counter for inner optimizer 14 | self.la_alpha = la_alpha 15 | self._total_la_steps = la_steps 16 | 17 | self.state = defaultdict(dict) 18 | 19 | # Cache the current optimizer parameters 20 | for group in optimizer.param_groups: 21 | for p in group['params']: 22 | param_state = self.state[p] 23 | param_state['cached_params'] = torch.zeros_like(p.data) 24 | param_state['cached_params'].copy_(p.data) 25 | 26 | def __getstate__(self): 27 | return self.optimizer.__getstate__() 28 | 29 | def __setstate__(self, state): 30 | self.optimizer.__setstate__(state) 31 | 32 | def zero_grad(self): 33 | self.optimizer.zero_grad() 34 | 35 | def state_dict(self): 36 | return self.optimizer.state_dict() 37 | 38 | def load_state_dict(self, state_dict): 39 | self.optimizer.load_state_dict(state_dict) 40 | 41 | @property 42 | def param_groups(self): 43 | return self.optimizer.param_groups 44 | 45 | def step(self, closure=None): 46 | """Performs a single Lookahead optimization step. 47 | Arguments: 48 | closure (callable, optional): A closure that reevaluates the model 49 | and returns the loss. 
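        Note: unlike ``LookaheadWrapper`` in ``la.py``, this variant does not refresh the cached "slow" weights after synchronization (the ``copy_`` call below is commented out), so each interpolation pulls the parameters back toward the weights that were cached when the wrapper was constructed.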
50 | """ 51 | loss = self.optimizer.step(closure) 52 | self._la_step += 1 53 | 54 | if self._la_step >= self._total_la_steps: 55 | self._la_step = 0 56 | # Lookahead and cache the current optimizer parameters 57 | for group in self.optimizer.param_groups: 58 | for p in group['params']: 59 | param_state = self.state[p] 60 | p.data.mul_(self.la_alpha).add_(1 - self.la_alpha, param_state['cached_params']) 61 | # param_state['cached_params'].copy_(p.data) 62 | return loss 63 | -------------------------------------------------------------------------------- /src/pequod/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/text/__init__.py -------------------------------------------------------------------------------- /src/pequod/text/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/text/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/text/__pycache__/tokenization_sentencepiece.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/text/__pycache__/tokenization_sentencepiece.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/text/tokenization_sentencepiece.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import sentencepiece as spm 4 | from transformers.tokenization_utils import PreTrainedTokenizer 5 | 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class XLMRTokenizer(PreTrainedTokenizer): 11 | 12 | def __init__(self, bpe_file, dict_file, **kwargs): 13 | super(XLMRTokenizer, self).__init__( 14 | bos_token="<s>", 15 | eos_token="</s>", 16 | unk_token="<unk>", 17 | pad_token="<pad>", 18 | mask_token="<mask>", 19 | sep_token="</s>", 20 | cls_token="<s>", 21 | **kwargs) 22 | 23 | self.max_len_single_sentence = self.max_len - 2 24 | self.max_len_sentences_pair = self.max_len - 4 25 | 26 | self.sp = spm.SentencePieceProcessor() 27 | self.sp.Load(bpe_file) 28 | 29 | self.encoder = {} 30 | self.decoder = [] 31 | 32 | for token in [self.bos_token, self.pad_token, self.eos_token, self.unk_token]: 33 | self._add_token(token) 34 | 35 | with open(dict_file, encoding="utf-8") as fp: 36 | for line in fp: 37 | # NOTE DO NOT USE .split() 38 | tokens_cnt = line.rstrip().split(" ") 39 | try: 40 | assert len(tokens_cnt) >= 2, line 41 | except AssertionError: 42 | logger.error( 43 | "tokenizer line %s assert error, replaced as <unk> (index %d)" % ( 44 | line, len(self.decoder))) 45 | exit(0) 46 | self._add_token(" ".join(tokens_cnt[:-1])) 47 | 48 | def _add_token(self, token): 49 | idx = len(self.encoder) 50 | self.encoder[token] = idx 51 | self.decoder.append(token) 52 | 53 | def _tokenize(self, text): 54 | return self.sp.EncodeAsPieces(text) 55 | 56 | def _convert_id_to_token(self, index): 57 | return self.decoder[index] 58 | 59 | def _convert_token_to_id(self, token): 60 | return self.encoder.get(token, self.encoder.get(self.unk_token)) 61 | 62 | def convert_tokens_to_string(self, tokens): 63 | return "".join(tokens).replace('\u2581', ' ').strip() 64 | 65 | @classmethod 66 | def
from_pretrained(cls, model_path, **kwargs): 67 | bpe_file = os.path.join(model_path, "sentencepiece.bpe.model") 68 | dict_file = os.path.join(model_path, "dict.txt") 69 | tokenizer = cls(bpe_file, dict_file) 70 | return tokenizer 71 | 72 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): 73 | if token_ids_1 is None: 74 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 75 | cls = [self.cls_token_id] 76 | sep = [self.sep_token_id] 77 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep 78 | 79 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): 80 | if already_has_special_tokens: 81 | if token_ids_1 is not None: 82 | raise ValueError("You should not supply a second sequence if the provided sequence of ids is already formated with special tokens for the model.") 83 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 84 | 85 | if token_ids_1 is None: 86 | return [1] + ([0] * len(token_ids_0)) + [1] 87 | return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] 88 | 89 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): 90 | sep = [self.sep_token_id] 91 | cls = [self.cls_token_id] 92 | 93 | if token_ids_1 is None: 94 | return len(cls + token_ids_0 + sep) * [0] 95 | return len(cls + token_ids_0 + sep) * [0] + len(sep + token_ids_1 + sep) * [1] 96 | 97 | 98 | if __name__ == "__main__": 99 | tokenizer = XLMRTokenizer.from_pretrained("/home/v-zechi/data/unilm/zechi/exp/bert_data/xlmr-large") 100 | 101 | for text in ["Hello world!", "你好,世界", "नमस्ते दुनिया", "مرحبا بالعالم", "Bonjour le monde"]: 102 | print(tokenizer.tokenize(text)) 103 | print(tokenizer.encode_plus(text, text, add_special_tokens=True)) 104 | -------------------------------------------------------------------------------- /src/pequod/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/tools/__init__.py -------------------------------------------------------------------------------- /src/pequod/tools/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/tools/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/tools/__pycache__/convert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/tools/__pycache__/convert.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/training/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import os 4 | import random 5 | import torch 6 | import pickle 7 | import logging 8 | import numpy as np 9 | 10 | # from transformers import (WEIGHTS_NAME, 11 | # BertConfig, BertForSequenceClassification, BertTokenizer, 12 | # RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer, 13 | # RobertaModel, BertModel, XLMModel, 14 | # XLMConfig, XLMForSequenceClassification, XLMTokenizer, 15 | # XLNetConfig, XLNetForSequenceClassification, 
XLNetTokenizer, 16 | # DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer, 17 | # BertForQuestionAnswering) 18 | # 19 | # from src.pequod.model.roberta import RobertaForQuestionAnswering 20 | from transformers import XLMRobertaConfig, XLMRobertaForRetrieval, XLMRobertaTokenizer 21 | 22 | # ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ 23 | # for conf in (BertConfig, XLNetConfig, XLMConfig, 24 | # RobertaConfig, DistilBertConfig)), ()) 25 | 26 | ALL_MODELS = [] 27 | 28 | # # Model classes for classification 29 | # MODEL_CLASSES = { 30 | # 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), 31 | # 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 32 | # 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), 33 | # 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), 34 | # 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), 35 | # "xlmr": (RobertaConfig, RobertaForSequenceClassification, XLMRTokenizer) 36 | # } 37 | # 38 | # QA_MODELS = { 39 | # "bert": BertForQuestionAnswering, 40 | # "roberta": RobertaForQuestionAnswering, 41 | # "xlmr": RobertaForQuestionAnswering, 42 | # } 43 | 44 | BERT_CLASSES = { 45 | "xlmr": (XLMRobertaConfig, XLMRobertaForRetrieval, XLMRobertaTokenizer), 46 | } 47 | 48 | 49 | def to_cuda(tup): 50 | return tuple(t.cuda() for t in tup) 51 | 52 | 53 | def set_seed(args): 54 | random.seed(args.seed) 55 | np.random.seed(args.seed) 56 | torch.manual_seed(args.seed) 57 | #TODO multi gpu support 58 | # if args.n_gpu > 0: 59 | # torch.cuda.manual_seed_all(args.seed) 60 | 61 | 62 | def init_exp(args): 63 | # dump parameters 64 | set_dump_path(args) 65 | pickle.dump(args, open(os.path.join(args.dump_path, 'params.pkl'), 'wb')) 66 | 67 | # get running command 68 | command = ["python", sys.argv[0]] 69 | for x in sys.argv[1:]: 70 | if x.startswith('--'): 71 | assert '"' not in x and "'" not in x 72 | command.append(x) 73 | else: 74 | assert "'" not in x 75 | if re.match('^[a-zA-Z0-9_]+$', x): 76 | command.append("%s" % x) 77 | else: 78 | command.append("'%s'" % x) 79 | command = ' '.join(command) 80 | args.command = command + ' --exp_id "%s"' % args.exp_id 81 | 82 | # check experiment name 83 | assert len(args.exp_name.strip()) > 0 84 | 85 | logging.basicConfig( 86 | format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 87 | datefmt = '%m/%d/%Y %H:%M:%S', 88 | level = logging.INFO) 89 | logger = logging.getLogger(__name__) 90 | logger.info("\n".join( 91 | "%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) 92 | logger.info("The experiment will be stored in %s\n" % args.dump_path) 93 | logger.info("Running command: %s" % command) 94 | logger.info("") 95 | 96 | 97 | def set_dump_path(args, output_dir=None, exp_name=None): 98 | if output_dir is None: output_dir = args.output_dir 99 | if exp_name is None: exp_name = args.exp_name 100 | chars = 'abcdefghijklmnopqrstuvwxyz0123456789' 101 | while True: 102 | exp_id = ''.join(random.choice(chars) for _ in range(10)) 103 | if not os.path.isdir(os.path.join(output_dir, exp_name, exp_id)): 104 | break 105 | args.exp_id = exp_id 106 | dump_path = os.path.join(output_dir, exp_name, exp_id) 107 | os.makedirs(dump_path) 108 | args.dump_path = dump_path 109 | -------------------------------------------------------------------------------- /src/pequod/training/__pycache__/__init__.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/training/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/training/__pycache__/trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/training/__pycache__/trainer.cpython-37.pyc -------------------------------------------------------------------------------- /src/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/tools/__init__.py -------------------------------------------------------------------------------- /src/tools/check_many2many_alignment.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | if __name__ == "__main__": 4 | parser = argparse.ArgumentParser() 5 | 6 | # Required parameters 7 | parser.add_argument( 8 | "--translation_path", 9 | default=None, 10 | type=str, 11 | required=True, 12 | help="path to the translation file with tab-separated fields: src_sent, tgt_lang, tgt_sent[, alignment]", 13 | ) 14 | 15 | drop_languages = ["en", "zh-CN", "zh", "ja", "ko", "th", "my", "ml", "ta"] 16 | translate_languages = None 17 | args = parser.parse_args() 18 | src2tgt = {} 19 | print("Reading translation from {}".format(args.translation_path)) 20 | with open(args.translation_path, encoding="utf-8") as f: 21 | cnt = 0 22 | for line in f: 23 | cnt += 1 24 | if cnt % 10000 == 0: 25 | print("Reading lines {}".format(cnt)) 26 | items = line.split("\t") 27 | 28 | if len(items) == 3: 29 | src_sent, tgt_lang, tgt_sent = line.split("\t") 30 | alignment = []  # no alignment info on this line; the check below is skipped 31 | else: 32 | src_sent, tgt_lang, tgt_sent, alignment_str = line.split("\t") 33 | alignment = [] 34 | for x in alignment_str.split(" "): 35 | alignment.append((int(x.split("/")[0]), int(x.split("/")[1]))) 36 | 37 | if tgt_lang in drop_languages: 38 | continue 39 | if translate_languages is not None and tgt_lang not in translate_languages: 40 | continue 41 | 42 | cnt_src = {} 43 | cnt_tgt = {} 44 | for x in alignment: 45 | 46 | if x[0] not in cnt_src: 47 | cnt_src[x[0]] = 0 48 | cnt_src[x[0]] += 1 49 | 50 | if x[1] not in cnt_tgt: 51 | cnt_tgt[x[1]] = 0 52 | cnt_tgt[x[1]] += 1 53 | 54 | if not (cnt_src[x[0]] <= 1 or cnt_tgt[x[1]] <= 1): 55 | print(cnt_src, cnt_tgt) 56 | print(alignment) 57 | print(src_sent, tgt_sent) 58 | 59 | assert cnt_src[x[0]] <= 1 or cnt_tgt[x[1]] <= 1 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/tools/sample_xnli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | 5 | if __name__ == "__main__": 6 | parser = argparse.ArgumentParser() 7 | 8 | # Required parameters 9 | parser.add_argument( 10 | "--input_path", 11 | default=None, 12 | type=str, 13 | required=True, 14 | help="input xnli file", 15 | ) 16 | parser.add_argument( 17 | "--output_path", 18 | default=None, 19 | type=str, 20 | required=True, 21 | help="output xnli file", 22 | ) 23 | parser.add_argument( 24 | "--sample_ratio", 25 | default=None, 26 | type=float, 27 | required=True, 28 | help="sample ratio", 29 | ) 30 | 31 | args = parser.parse_args() 32 | lines = open(args.input_path, "r").readlines() 33 |
head = lines[0] 34 | lines = lines[1:] 35 | random.seed(0) 36 | random.shuffle(lines) 37 | 38 | n_lines = int(len(lines) * args.sample_ratio) 39 | 40 | fout = open(args.output_path, "w") 41 | fout.write(head) 42 | for i, line in enumerate(lines[:n_lines]): 43 | fout.write(line) -------------------------------------------------------------------------------- /src/transformers/activations.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def swish(x): 8 | return x * torch.sigmoid(x) 9 | 10 | 11 | def _gelu_python(x): 12 | """ Original Implementation of the gelu activation function in Google Bert repo when initially created. 13 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 14 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 15 | This is now written in C in torch.nn.functional 16 | Also see https://arxiv.org/abs/1606.08415 17 | """ 18 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 19 | 20 | 21 | if torch.__version__ < "1.4.0": 22 | gelu = _gelu_python 23 | else: 24 | gelu = F.gelu 25 | 26 | 27 | def gelu_new(x): 28 | """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 29 | Also see https://arxiv.org/abs/1606.08415 30 | """ 31 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 32 | 33 | 34 | ACT2FN = { 35 | "relu": F.relu, 36 | "swish": swish, 37 | "gelu": gelu, 38 | "tanh": F.tanh, 39 | "gelu_new": gelu_new, 40 | } 41 | 42 | 43 | def get_activation(activation_string): 44 | if activation_string in ACT2FN: 45 | return ACT2FN[activation_string] 46 | else: 47 | raise KeyError( 48 | "function {} not found in ACT2FN mapping {} or torch.nn.functional".format( 49 | activation_string, list(ACT2FN.keys()) 50 | ) 51 | ) 52 | -------------------------------------------------------------------------------- /src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /src/transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 
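    # Example invocation (sketch; assumes the package's standard ``transformers-cli``
    # console entry point is installed for this vendored copy):
    #
    #     transformers-cli download xlm-roberta-base --cache-dir ./models --force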
23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 | -------------------------------------------------------------------------------- /src/transformers/commands/env.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from argparse import ArgumentParser 3 | 4 | from transformers import __version__ as version 5 | from transformers import is_tf_available, is_torch_available 6 | from transformers.commands import BaseTransformersCLICommand 7 | 8 | 9 | def info_command_factory(_): 10 | return EnvironmentCommand() 11 | 12 | 13 | class EnvironmentCommand(BaseTransformersCLICommand): 14 | @staticmethod 15 | def register_subcommand(parser: ArgumentParser): 16 | download_parser = parser.add_parser("env") 17 | download_parser.set_defaults(func=info_command_factory) 18 | 19 | def run(self): 20 | pt_version = "not installed" 21 | pt_cuda_available = "NA" 22 | if is_torch_available(): 23 | import torch 24 | 25 | pt_version = torch.__version__ 26 | pt_cuda_available = torch.cuda.is_available() 27 | 28 | tf_version = "not installed" 29 | tf_cuda_available = "NA" 30 | if is_tf_available(): 31 | import tensorflow as tf 32 | 33 | tf_version = tf.__version__ 34 | try: 35 | # deprecated in v2.1 36 | tf_cuda_available = tf.test.is_gpu_available() 37 | except AttributeError: 38 | # returns list of devices, convert to bool 39 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) 40 | 41 | info = { 42 | "`transformers` version": version, 43 | "Platform": platform.platform(), 44 | "Python version": platform.python_version(), 45 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), 46 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), 47 | "Using GPU in script?": "", 48 | "Using distributed or parallel set-up in script?": "", 49 | } 50 | 51 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") 52 | print(self.format_dict(info)) 53 | 54 | return info 55 | 56 | @staticmethod 57 | def format_dict(d): 58 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" 59 | -------------------------------------------------------------------------------- /src/transformers/commands/run.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands import BaseTransformersCLICommand 5 | from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline 6 | 7 | 8 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 9 | 10 | 11 | def try_infer_format_from_ext(path: str): 12 | if not path: 13 | return "pipe" 14 | 15 | for ext in PipelineDataFormat.SUPPORTED_FORMATS: 16 | if path.endswith(ext): 17 | return ext 18 | 19 | raise Exception( 20 | "Unable to determine file format from file extension {}. 
" 21 | "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS) 22 | ) 23 | 24 | 25 | def run_command_factory(args): 26 | nlp = pipeline( 27 | task=args.task, 28 | model=args.model if args.model else None, 29 | config=args.config, 30 | tokenizer=args.tokenizer, 31 | device=args.device, 32 | ) 33 | format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format 34 | reader = PipelineDataFormat.from_str( 35 | format=format, 36 | output_path=args.output, 37 | input_path=args.input, 38 | column=args.column if args.column else nlp.default_input_names, 39 | overwrite=args.overwrite, 40 | ) 41 | return RunCommand(nlp, reader) 42 | 43 | 44 | class RunCommand(BaseTransformersCLICommand): 45 | def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): 46 | self._nlp = nlp 47 | self._reader = reader 48 | 49 | @staticmethod 50 | def register_subcommand(parser: ArgumentParser): 51 | run_parser = parser.add_parser("run", help="Run a pipeline through the CLI") 52 | run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run") 53 | run_parser.add_argument("--input", type=str, help="Path to the file to use for inference") 54 | run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.") 55 | run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.") 56 | run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.") 57 | run_parser.add_argument( 58 | "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)" 59 | ) 60 | run_parser.add_argument( 61 | "--column", 62 | type=str, 63 | help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)", 64 | ) 65 | run_parser.add_argument( 66 | "--format", 67 | type=str, 68 | default="infer", 69 | choices=PipelineDataFormat.SUPPORTED_FORMATS, 70 | help="Input format to read from", 71 | ) 72 | run_parser.add_argument( 73 | "--device", 74 | type=int, 75 | default=-1, 76 | help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", 77 | ) 78 | run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.") 79 | run_parser.set_defaults(func=run_command_factory) 80 | 81 | def run(self): 82 | nlp, outputs = self._nlp, [] 83 | 84 | for entry in self._reader: 85 | output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) 86 | if isinstance(output, dict): 87 | outputs.append(output) 88 | else: 89 | outputs += output 90 | 91 | # Saving data 92 | if self._nlp.binary_output: 93 | binary_path = self._reader.save_binary(outputs) 94 | logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path)) 95 | else: 96 | self._reader.save(outputs) 97 | -------------------------------------------------------------------------------- /src/transformers/configuration_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ BART configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "bart-large": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/config.json", 27 | "bart-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/config.json", 28 | "bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json", 29 | } 30 | 31 | 32 | class BartConfig(PretrainedConfig): 33 | r""" 34 | Configuration class for Bart. Parameters are renamed from the fairseq implementation 35 | """ 36 | model_type = "bart" 37 | pretrained_config_archive_map = BART_PRETRAINED_CONFIG_ARCHIVE_MAP 38 | 39 | def __init__( 40 | self, 41 | activation_dropout=0.0, 42 | vocab_size=50265, 43 | pad_token_id=1, 44 | eos_token_id=2, 45 | d_model=1024, 46 | encoder_ffn_dim=4096, 47 | encoder_layers=12, 48 | encoder_attention_heads=16, 49 | decoder_ffn_dim=4096, 50 | decoder_layers=12, 51 | decoder_attention_heads=16, 52 | encoder_layerdrop=0.0, 53 | decoder_layerdrop=0.0, 54 | attention_dropout=0.0, 55 | dropout=0.1, 56 | max_position_embeddings=1024, 57 | init_std=0.02, 58 | classifier_dropout=0.0, 59 | output_past=False, 60 | num_labels=3, 61 | bos_token_id=0, 62 | **common_kwargs 63 | ): 64 | r""" 65 | :class:`~transformers.BartConfig` is the configuration class for `BartModel`. 
66 | Examples: 67 | config = BartConfig.from_pretrained('bart-large') 68 | model = BartModel(config) 69 | """ 70 | super().__init__( 71 | num_labels=num_labels, 72 | output_past=output_past, 73 | pad_token_id=pad_token_id, 74 | bos_token_id=bos_token_id, 75 | **common_kwargs, 76 | ) 77 | self.vocab_size = vocab_size 78 | self.d_model = d_model # encoder_embed_dim and decoder_embed_dim 79 | self.eos_token_id = eos_token_id 80 | self.encoder_ffn_dim = encoder_ffn_dim 81 | self.encoder_layers = self.num_hidden_layers = encoder_layers 82 | self.encoder_attention_heads = encoder_attention_heads 83 | self.encoder_layerdrop = encoder_layerdrop 84 | self.decoder_layerdrop = decoder_layerdrop 85 | self.decoder_ffn_dim = decoder_ffn_dim 86 | self.decoder_layers = decoder_layers 87 | self.decoder_attention_heads = decoder_attention_heads 88 | self.max_position_embeddings = max_position_embeddings 89 | self.init_std = init_std # Normal(0, this parameter) 90 | 91 | # 3 Types of Dropout 92 | self.attention_dropout = attention_dropout 93 | self.activation_dropout = activation_dropout 94 | self.dropout = dropout 95 | 96 | # Classifier stuff 97 | self.classif_dropout = classifier_dropout 98 | 99 | @property 100 | def num_attention_heads(self): 101 | return self.encoder_attention_heads 102 | 103 | @property 104 | def hidden_size(self): 105 | return self.d_model 106 | -------------------------------------------------------------------------------- /src/transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", 29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", 30 | } 31 | 32 | 33 | class CamembertConfig(RobertaConfig): 34 | """ 35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 36 | superclass for the appropriate documentation alongside usage examples. 
37 | """ 38 | 39 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 40 | model_type = "camembert" 41 | -------------------------------------------------------------------------------- /src/transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | 19 | import logging 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class MMBTConfig(object): 26 | """Configuration class to store the configuration of a `MMBT Model`. 27 | 28 | Args: 29 | config (:obj:`~transformers.PreTrainedConfig`): 30 | Config of the underlying Transformer models. Its values are 31 | copied over to use a single config. 32 | num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, optional, defautls to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /src/transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_bert import BertConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 28 | "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 29 | "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 30 | "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", 31 | "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", 32 | "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", 33 | } 34 | 35 | 36 | class RobertaConfig(BertConfig): 37 | r""" 38 | This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`. 39 | It is used to instantiate an RoBERTa model according to the specified arguments, defining the model 40 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 41 | the BERT `bert-base-uncased `__ architecture. 42 | 43 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 44 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 45 | for more information. 46 | 47 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. 48 | It reuses the same defaults. Please check the parent class for more information. 49 | 50 | Example:: 51 | 52 | from transformers import RobertaConfig, RobertaModel 53 | 54 | # Initializing a RoBERTa configuration 55 | configuration = RobertaConfig() 56 | 57 | # Initializing a model from the configuration 58 | model = RobertaModel(configuration) 59 | 60 | # Accessing the model configuration 61 | configuration = model.config 62 | 63 | Attributes: 64 | pretrained_config_archive_map (Dict[str, str]): 65 | A dictionary containing all the available pre-trained checkpoints. 66 | """ 67 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 68 | model_type = "roberta" 69 | -------------------------------------------------------------------------------- /src/transformers/configuration_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2010, The T5 Authors and HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ T5 model configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", 27 | "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", 28 | "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", 29 | "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", 30 | "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", 31 | } 32 | 33 | 34 | class T5Config(PretrainedConfig): 35 | r""" 36 | :class:`~transformers.T5Config` is the configuration class to store the configuration of a 37 | `T5Model`. 38 | 39 | 40 | Arguments: 41 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`. 42 | hidden_size: Size of the encoder layers and the pooler layer. 43 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 44 | num_attention_heads: Number of attention heads for each attention layer in 45 | the Transformer encoder. 46 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 47 | layer in the Transformer encoder. 48 | hidden_act: The non-linear activation function (function or string) in the 49 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 50 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 51 | layers in the embeddings, encoder, and pooler. 52 | attention_probs_dropout_prob: The dropout ratio for the attention 53 | probabilities. 54 | max_position_embeddings: The maximum sequence length that this model might 55 | ever be used with. Typically set this to something large just in case 56 | (e.g., 512 or 1024 or 2048). 57 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 58 | `T5Model`. 59 | initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing). 60 | layer_norm_eps: The epsilon used by LayerNorm. 
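        Example (illustrative sketch; the keyword values simply restate the defaults defined below):

            from transformers.configuration_t5 import T5Config

            config = T5Config(d_model=512, d_ff=2048, num_layers=6, num_heads=8)
            assert config.hidden_size == config.d_model  # exposed through the property below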
61 | """ 62 | pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP 63 | model_type = "t5" 64 | 65 | def __init__( 66 | self, 67 | vocab_size=32128, 68 | n_positions=512, 69 | d_model=512, 70 | d_kv=64, 71 | d_ff=2048, 72 | num_layers=6, 73 | num_heads=8, 74 | relative_attention_num_buckets=32, 75 | dropout_rate=0.1, 76 | layer_norm_epsilon=1e-6, 77 | initializer_factor=1.0, 78 | **kwargs 79 | ): 80 | super().__init__(**kwargs) 81 | self.vocab_size = vocab_size 82 | self.n_positions = n_positions 83 | self.d_model = d_model 84 | self.d_kv = d_kv 85 | self.d_ff = d_ff 86 | self.num_layers = num_layers 87 | self.num_heads = num_heads 88 | self.relative_attention_num_buckets = relative_attention_num_buckets 89 | self.dropout_rate = dropout_rate 90 | self.layer_norm_epsilon = layer_norm_epsilon 91 | self.initializer_factor = initializer_factor 92 | 93 | @property 94 | def max_position_embeddings(self): 95 | return self.n_positions 96 | 97 | @property 98 | def hidden_size(self): 99 | return self.d_model 100 | 101 | @property 102 | def num_attention_heads(self): 103 | return self.num_heads 104 | 105 | @property 106 | def num_hidden_layers(self): 107 | return self.num_layers 108 | -------------------------------------------------------------------------------- /src/transformers/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XLM-RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", 28 | "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", 29 | "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", 30 | "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", 31 | "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", 32 | "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", 33 | } 34 | 35 | 36 | class XLMRobertaConfig(RobertaConfig): 37 | """ 38 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 39 | superclass for the appropriate documentation alongside usage examples. 
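        Example (illustrative sketch; ``xlm-roberta-base`` is one of the checkpoints listed in the archive map above):

            from transformers.configuration_xlm_roberta import XLMRobertaConfig

            config = XLMRobertaConfig.from_pretrained("xlm-roberta-base")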
40 | """ 41 | 42 | pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 43 | model_type = "xlm-roberta" 44 | -------------------------------------------------------------------------------- /src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForMaskedLM(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BART checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | from pathlib import Path 21 | 22 | import fairseq 23 | import torch 24 | from packaging import version 25 | 26 | from transformers import BartConfig, BartForMaskedLM, BartForSequenceClassification, BartModel, BartTokenizer 27 | 28 | 29 | FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn"] 30 | 31 | if version.parse(fairseq.__version__) < version.parse("0.9.0"): 32 | raise Exception("requires fairseq >= 0.9.0") 33 | 34 | 35 | logging.basicConfig(level=logging.INFO) 36 | logger = logging.getLogger(__name__) 37 | 38 | SAMPLE_TEXT = " Hello world! cécé herlolip" 39 | 40 | rename_keys = [ 41 | ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), 42 | ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), 43 | ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), 44 | ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), 45 | ] 46 | IGNORE_KEYS = ["encoder.version", "decoder.version", "model.encoder.version", "model.decoder.version", "_float_tensor"] 47 | 48 | 49 | def rename_key(dct, old, new): 50 | val = dct.pop(old) 51 | dct[new] = val 52 | 53 | 54 | def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path): 55 | """ 56 | Copy/paste/tweak model's weights to our BERT structure. 57 | """ 58 | bart = torch.hub.load("pytorch/fairseq", checkpoint_path) 59 | bart.eval() # disable dropout 60 | bart.model.upgrade_state_dict(bart.model.state_dict()) 61 | hf_model_name = checkpoint_path.replace(".", "-") 62 | config = BartConfig.from_pretrained(hf_model_name) 63 | tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) 64 | tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) 65 | assert torch.eq(tokens, tokens2).all() 66 | 67 | if checkpoint_path in ["bart.large", "bart.large.cnn"]: 68 | state_dict = bart.model.state_dict() 69 | for k in IGNORE_KEYS: 70 | state_dict.pop(k, None) 71 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] 72 | model = BartModel(config) 73 | their_output = bart.extract_features(tokens) 74 | else: # MNLI Case 75 | state_dict = bart.state_dict() 76 | for k in IGNORE_KEYS: 77 | state_dict.pop(k, None) 78 | state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] 79 | for src, dest in rename_keys: 80 | rename_key(state_dict, src, dest) 81 | model = BartForSequenceClassification(config) 82 | their_output = bart.predict("mnli", tokens, return_logits=True) 83 | 84 | # Load state dict 85 | model.load_state_dict(state_dict) 86 | model.eval() 87 | # Check results 88 | 89 | if checkpoint_path == "bart.large.cnn": # generate doesnt work yet 90 | model = BartForMaskedLM(config, base_model=model) 91 | assert "lm_head.weight" in model.state_dict() 92 | assert model.lm_head.out_features == config.max_position_embeddings 93 | model.eval() 94 | our_outputs = model.model.forward(tokens)[0] 95 | else: 96 | our_outputs = model.forward(tokens)[0] 97 | assert their_output.shape == our_outputs.shape 98 | assert (their_output == our_outputs).all().item() 99 | Path(pytorch_dump_folder_path).mkdir(exist_ok=True) 100 | model.save_pretrained(pytorch_dump_folder_path) 101 | 102 | 103 | if __name__ == "__main__": 104 | parser = 
argparse.ArgumentParser() 105 | # Required parameters 106 | parser.add_argument("fairseq_path", choices=FAIRSEQ_MODELS, type=str, help="") 107 | 108 | parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") 109 | args = parser.parse_args() 110 | convert_bart_checkpoint( 111 | args.fairseq_path, args.pytorch_dump_folder_path, 112 | ) 113 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import argparse 19 | import os 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | import torch 24 | 25 | from transformers import BertModel 26 | 27 | 28 | def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): 29 | 30 | """ 31 | :param model:BertModel Pytorch model instance to be converted 32 | :param ckpt_dir: Tensorflow model directory 33 | :param model_name: model name 34 | :return: 35 | 36 | Currently supported HF models: 37 | Y BertModel 38 | N BertForMaskedLM 39 | N BertForPreTraining 40 | N BertForMultipleChoice 41 | N BertForNextSentencePrediction 42 | N BertForSequenceClassification 43 | N BertForQuestionAnswering 44 | """ 45 | 46 | tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") 47 | 48 | var_map = ( 49 | ("layer.", "layer_"), 50 | ("word_embeddings.weight", "word_embeddings"), 51 | ("position_embeddings.weight", "position_embeddings"), 52 | ("token_type_embeddings.weight", "token_type_embeddings"), 53 | (".", "/"), 54 | ("LayerNorm/weight", "LayerNorm/gamma"), 55 | ("LayerNorm/bias", "LayerNorm/beta"), 56 | ("weight", "kernel"), 57 | ) 58 | 59 | if not os.path.isdir(ckpt_dir): 60 | os.makedirs(ckpt_dir) 61 | 62 | state_dict = model.state_dict() 63 | 64 | def to_tf_var_name(name: str): 65 | for patt, repl in iter(var_map): 66 | name = name.replace(patt, repl) 67 | return "bert/{}".format(name) 68 | 69 | def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): 70 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 71 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 72 | session.run(tf.variables_initializer([tf_var])) 73 | session.run(tf_var) 74 | return tf_var 75 | 76 | tf.reset_default_graph() 77 | with tf.Session() as session: 78 | for var_name in state_dict: 79 | tf_name = to_tf_var_name(var_name) 80 | torch_tensor = state_dict[var_name].numpy() 81 | if any([x in var_name for x in tensors_to_transpose]): 82 | torch_tensor = torch_tensor.T 83 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 84 | tf.keras.backend.set_value(tf_var, torch_tensor) 85 | tf_weight = session.run(tf_var) 86 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 87 | 88 | saver = tf.train.Saver(tf.trainable_variables()) 89 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 90 | 91 | 92 | def main(raw_args=None): 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument("--model_name", type=str, required=True, help="model name e.g. 
bert-base-uncased") 95 | parser.add_argument( 96 | "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" 97 | ) 98 | parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") 99 | parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") 100 | args = parser.parse_args(raw_args) 101 | 102 | model = BertModel.from_pretrained( 103 | pretrained_model_name_or_path=args.model_name, 104 | state_dict=torch.load(args.pytorch_model_path), 105 | cache_dir=args.cache_dir, 106 | ) 107 | 108 | convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if gpt2_config_file == "": 32 | config = GPT2Config() 33 | else: 34 | config = GPT2Config.from_json_file(gpt2_config_file) 35 | model = GPT2Model(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 55 | ) 56 | parser.add_argument( 57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 58 | ) 59 | parser.add_argument( 60 | "--gpt2_config_file", 61 | default="", 62 | type=str, 63 | help="An optional config json file corresponding to the pre-trained OpenAI model. 
\n" 64 | "This specifies the model architecture.", 65 | ) 66 | args = parser.parse_args() 67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 68 | -------------------------------------------------------------------------------- /src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if openai_config_file == "": 32 | config = OpenAIGPTConfig() 33 | else: 34 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 35 | model = OpenAIGPTModel(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--openai_checkpoint_folder_path", 55 | default=None, 56 | type=str, 57 | required=True, 58 | help="Path to the TensorFlow checkpoint path.", 59 | ) 60 | parser.add_argument( 61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 62 | ) 63 | parser.add_argument( 64 | "--openai_config_file", 65 | default="", 66 | type=str, 67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.", 69 | ) 70 | args = parser.parse_args() 71 | convert_openai_checkpoint_to_pytorch( 72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 73 | ) 74 | -------------------------------------------------------------------------------- /src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import T5Config, T5Model, load_tf_weights_in_t5 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = T5Config.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = T5Model(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained T5 model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
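# Illustrative programmatic use of the XLM converter defined in this file. This is a
# sketch only: both paths are placeholders, and the import assumes the bundled
# src/transformers package is importable as `transformers`.
#
#     from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import (
#         convert_xlm_checkpoint_to_pytorch,
#     )
#
#     # Reads the original XLM dump and writes the weights, config and vocab files
#     # (WEIGHTS_NAME, CONFIG_NAME, VOCAB_FILES_NAMES["vocab_file"]) into the folder.
#     convert_xlm_checkpoint_to_pytorch(
#         "/path/to/xlm_checkpoint.pth",  # placeholder: original XLM PyTorch checkpoint
#         "/path/to/output_dir",          # placeholder: output folder
#     )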
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import json 20 | import logging 21 | 22 | import numpy 23 | import torch 24 | 25 | from transformers import CONFIG_NAME, WEIGHTS_NAME 26 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES 27 | 28 | 29 | logging.basicConfig(level=logging.INFO) 30 | 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") 35 | 36 | state_dict = chkpt["model"] 37 | 38 | # We have the base model one level deeper than the original XLM repository 39 | two_levels_state_dict = {} 40 | for k, v in state_dict.items(): 41 | if "pred_layer" in k: 42 | two_levels_state_dict[k] = v 43 | else: 44 | two_levels_state_dict["transformer." + k] = v 45 | 46 | config = chkpt["params"] 47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 48 | 49 | vocab = chkpt["dico_word2id"] 50 | vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) 51 | 52 | # Save pytorch-model 53 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 54 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] 56 | 57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 59 | 60 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 62 | f.write(json.dumps(config, indent=2) + "\n") 63 | 64 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 66 | f.write(json.dumps(vocab, indent=2) + "\n") 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | # Required parameters 72 | parser.add_argument( 73 | "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." 74 | ) 75 | parser.add_argument( 76 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 77 | ) 78 | args = parser.parse_args() 79 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 80 | -------------------------------------------------------------------------------- /src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
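# A minimal sketch of calling the XLNet converter defined in this file from Python
# rather than the command line. Paths are placeholders, and the import assumes the
# bundled src/transformers package is importable as `transformers`.
#
#     from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import (
#         convert_xlnet_checkpoint_to_pytorch,
#     )
#
#     convert_xlnet_checkpoint_to_pytorch(
#         "/path/to/xlnet_model.ckpt",   # placeholder: TF checkpoint prefix
#         "/path/to/xlnet_config.json",  # placeholder: XLNet config file
#         "/path/to/output_dir",         # placeholder: PyTorch dump folder
#         finetuning_task="sts-b",       # optional: selects the sequence-classification head below
#     )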
15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | import os 21 | 22 | import torch 23 | 24 | from transformers import ( 25 | CONFIG_NAME, 26 | WEIGHTS_NAME, 27 | XLNetConfig, 28 | XLNetForQuestionAnswering, 29 | XLNetForSequenceClassification, 30 | XLNetLMHeadModel, 31 | load_tf_weights_in_xlnet, 32 | ) 33 | 34 | 35 | GLUE_TASKS_NUM_LABELS = { 36 | "cola": 2, 37 | "mnli": 3, 38 | "mrpc": 2, 39 | "sst-2": 2, 40 | "sts-b": 1, 41 | "qqp": 2, 42 | "qnli": 2, 43 | "rte": 2, 44 | "wnli": 2, 45 | } 46 | 47 | 48 | logging.basicConfig(level=logging.INFO) 49 | 50 | 51 | def convert_xlnet_checkpoint_to_pytorch( 52 | tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None 53 | ): 54 | # Initialise PyTorch model 55 | config = XLNetConfig.from_json_file(bert_config_file) 56 | 57 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 58 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 59 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 60 | config.finetuning_task = finetuning_task 61 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 62 | model = XLNetForSequenceClassification(config) 63 | elif "squad" in finetuning_task: 64 | config.finetuning_task = finetuning_task 65 | model = XLNetForQuestionAnswering(config) 66 | else: 67 | model = XLNetLMHeadModel(config) 68 | 69 | # Load weights from tf checkpoint 70 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 71 | 72 | # Save pytorch-model 73 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 74 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 75 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 76 | torch.save(model.state_dict(), pytorch_weights_dump_path) 77 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 78 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 79 | f.write(config.to_json_string()) 80 | 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser() 84 | # Required parameters 85 | parser.add_argument( 86 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 87 | ) 88 | parser.add_argument( 89 | "--xlnet_config_file", 90 | default=None, 91 | type=str, 92 | required=True, 93 | help="The config json file corresponding to the pre-trained XLNet model. \n" 94 | "This specifies the model architecture.", 95 | ) 96 | parser.add_argument( 97 | "--pytorch_dump_folder_path", 98 | default=None, 99 | type=str, 100 | required=True, 101 | help="Path to the folder to store the PyTorch model or dataset/vocab.", 102 | ) 103 | parser.add_argument( 104 | "--finetuning_task", 105 | default=None, 106 | type=str, 107 | help="Name of a task on which the XLNet TensorFloaw model was fine-tuned", 108 | ) 109 | args = parser.parse_args() 110 | print(args) 111 | 112 | convert_xlnet_checkpoint_to_pytorch( 113 | args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task 114 | ) 115 | -------------------------------------------------------------------------------- /src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. 
So, don't check this module at all. 4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | 20 | xglue_convert_examples_to_features, 21 | xglue_convert_examples_to_vat_features, 22 | xglue_output_modes, 23 | xglue_processors, 24 | xglue_tasks_num_labels, 25 | 26 | xtreme_convert_examples_to_features, 27 | xtreme_output_modes, 28 | xtreme_processors, 29 | xtreme_tasks_num_labels, 30 | 31 | squad_convert_examples_to_features, 32 | xnli_output_modes, 33 | xnli_processors, 34 | xnli_tasks_num_labels, 35 | ) 36 | 37 | 38 | if is_sklearn_available(): 39 | from .metrics import glue_compute_metrics, xnli_compute_metrics, xglue_compute_metrics, xtreme_compute_metrics 40 | -------------------------------------------------------------------------------- /src/transformers/data/metrics/evaluate_squad.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Based on the SQuAD evaluation script from: 3 | # https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py 16 | """ Official evaluation script for v1.1 of the SQuAD dataset. 
""" 17 | from __future__ import print_function 18 | from collections import Counter 19 | import string 20 | import re 21 | import argparse 22 | import json 23 | import sys 24 | 25 | 26 | def normalize_answer(s): 27 | """Lower text and remove punctuation, articles and extra whitespace.""" 28 | def remove_articles(text): 29 | return re.sub(r'\b(a|an|the)\b', ' ', text) 30 | 31 | def white_space_fix(text): 32 | return ' '.join(text.split()) 33 | 34 | def remove_punc(text): 35 | exclude = set(string.punctuation) 36 | return ''.join(ch for ch in text if ch not in exclude) 37 | 38 | def lower(text): 39 | return text.lower() 40 | 41 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 42 | 43 | 44 | def f1_score(prediction, ground_truth): 45 | prediction_tokens = normalize_answer(prediction).split() 46 | ground_truth_tokens = normalize_answer(ground_truth).split() 47 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 48 | num_same = sum(common.values()) 49 | if num_same == 0: 50 | return 0 51 | precision = 1.0 * num_same / len(prediction_tokens) 52 | recall = 1.0 * num_same / len(ground_truth_tokens) 53 | f1 = (2 * precision * recall) / (precision + recall) 54 | return f1 55 | 56 | 57 | def exact_match_score(prediction, ground_truth): 58 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 59 | 60 | 61 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 62 | scores_for_ground_truths = [] 63 | for ground_truth in ground_truths: 64 | score = metric_fn(prediction, ground_truth) 65 | scores_for_ground_truths.append(score) 66 | return max(scores_for_ground_truths) 67 | 68 | 69 | def evaluate(dataset, predictions): 70 | f1 = exact_match = total = 0 71 | for article in dataset: 72 | for paragraph in article['paragraphs']: 73 | for qa in paragraph['qas']: 74 | total += 1 75 | if qa['id'] not in predictions: 76 | message = 'Unanswered question ' + qa['id'] + \ 77 | ' will receive score 0.' 
78 | print(message, file=sys.stderr) 79 | continue 80 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 81 | prediction = predictions[qa['id']] 82 | exact_match += metric_max_over_ground_truths( 83 | exact_match_score, prediction, ground_truths) 84 | f1 += metric_max_over_ground_truths( 85 | f1_score, prediction, ground_truths) 86 | 87 | exact_match = 100.0 * exact_match / total 88 | f1 = 100.0 * f1 / total 89 | 90 | return {'exact_match': exact_match, 'f1': f1} 91 | 92 | 93 | def evaluate_with_path(dataset_file, prediction_file): 94 | with open(dataset_file) as dataset_file_reader: 95 | dataset_json = json.load(dataset_file_reader) 96 | dataset = dataset_json['data'] 97 | with open(prediction_file) as prediction_file_reader: 98 | predictions = json.load(prediction_file_reader) 99 | return evaluate(dataset, predictions) 100 | 101 | if __name__ == '__main__': 102 | expected_version = '1.1' 103 | parser = argparse.ArgumentParser( 104 | description='Evaluation for SQuAD ' + expected_version) 105 | parser.add_argument('dataset_file', help='Dataset file') 106 | parser.add_argument('prediction_file', help='Prediction File') 107 | args = parser.parse_args() 108 | with open(args.dataset_file) as dataset_file: 109 | dataset_json = json.load(dataset_file) 110 | if (dataset_json['version'] != expected_version): 111 | print('Evaluation expects v-' + expected_version + 112 | ', but got dataset with v-' + dataset_json['version'], 113 | file=sys.stderr) 114 | dataset = dataset_json['data'] 115 | with open(args.prediction_file) as prediction_file: 116 | predictions = json.load(prediction_file) 117 | print(json.dumps(evaluate(dataset, predictions))) -------------------------------------------------------------------------------- /src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .xglue import xglue_convert_examples_to_features, xglue_output_modes, xglue_processors, xglue_tasks_num_labels 6 | from .xtreme import xtreme_convert_examples_to_features, xtreme_output_modes, xtreme_processors, xtreme_tasks_num_labels 7 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 8 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 9 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 10 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 11 | from .xglue import xglue_convert_examples_to_vat_features 12 | -------------------------------------------------------------------------------- /src/transformers/data/processors/xnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XNLI utils (dataset loading and evaluation) """ 17 | 18 | 19 | import logging 20 | import os 21 | 22 | from .utils import DataProcessor, InputExample 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class XnliProcessor(DataProcessor): 29 | """Processor for the XNLI dataset. 30 | Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" 31 | 32 | def __init__(self, language, train_language=None): 33 | self.language = language 34 | self.train_language = train_language 35 | 36 | def get_train_examples(self, data_dir): 37 | """See base class.""" 38 | lg = self.language if self.train_language is None else self.train_language 39 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg))) 40 | examples = [] 41 | for (i, line) in enumerate(lines): 42 | if i == 0: 43 | continue 44 | guid = "%s-%s" % ("train", i) 45 | text_a = line[0] 46 | text_b = line[1] 47 | label = "contradiction" if line[2] == "contradictory" else line[2] 48 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 49 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 50 | return examples 51 | 52 | def get_test_examples(self, data_dir): 53 | """See base class.""" 54 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) 55 | examples = [] 56 | for (i, line) in enumerate(lines): 57 | if i == 0: 58 | continue 59 | language = line[0] 60 | if language != self.language: 61 | continue 62 | guid = "%s-%s" % ("test", i) 63 | text_a = line[6] 64 | text_b = line[7] 65 | label = line[1] 66 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 67 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 68 | return examples 69 | 70 | def get_labels(self): 71 | """See base class.""" 72 | return ["contradiction", "entailment", "neutral"] 73 | 74 | 75 | xnli_processors = { 76 | "xnli": XnliProcessor, 77 | } 78 | 79 | xnli_output_modes = { 80 | "xnli": "classification", 81 | } 82 | 83 | xnli_tasks_num_labels = { 84 | "xnli": 3, 85 | } 86 | -------------------------------------------------------------------------------- /src/transformers/modeling_tf_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ TF 2.0 RoBERTa model. """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_camembert import CamembertConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_tf_roberta import ( 24 | TFRobertaForMaskedLM, 25 | TFRobertaForSequenceClassification, 26 | TFRobertaForTokenClassification, 27 | TFRobertaModel, 28 | ) 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {} 34 | 35 | 36 | CAMEMBERT_START_DOCSTRING = r""" 37 | 38 | .. note:: 39 | 40 | TF 2.0 models accepts two formats as inputs: 41 | 42 | - having all inputs as keyword arguments (like PyTorch models), or 43 | - having all inputs as a list, tuple or dict in the first positional arguments. 44 | 45 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having 46 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 47 | 48 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors 49 | in the first positional argument : 50 | 51 | - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` 52 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: 53 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` 54 | - a dictionary with one or several input Tensors associated to the input names given in the docstring: 55 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` 56 | 57 | Parameters: 58 | config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the 59 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 60 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 61 | """ 62 | 63 | 64 | @add_start_docstrings( 65 | "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", 66 | CAMEMBERT_START_DOCSTRING, 67 | ) 68 | class TFCamembertModel(TFRobertaModel): 69 | """ 70 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the 71 | superclass for the appropriate documentation alongside usage examples. 72 | """ 73 | 74 | config_class = CamembertConfig 75 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 76 | 77 | 78 | @add_start_docstrings( 79 | """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, 80 | ) 81 | class TFCamembertForMaskedLM(TFRobertaForMaskedLM): 82 | """ 83 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the 84 | superclass for the appropriate documentation alongside usage examples. 85 | """ 86 | 87 | config_class = CamembertConfig 88 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 89 | 90 | 91 | @add_start_docstrings( 92 | """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer 93 | on top of the pooled output) e.g. for GLUE tasks. """, 94 | CAMEMBERT_START_DOCSTRING, 95 | ) 96 | class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification): 97 | """ 98 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. 
Please check the 99 | superclass for the appropriate documentation alongside usage examples. 100 | """ 101 | 102 | config_class = CamembertConfig 103 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 104 | 105 | 106 | @add_start_docstrings( 107 | """CamemBERT Model with a token classification head on top (a linear layer on top of 108 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 109 | CAMEMBERT_START_DOCSTRING, 110 | ) 111 | class TFCamembertForTokenClassification(TFRobertaForTokenClassification): 112 | """ 113 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the 114 | superclass for the appropriate documentation alongside usage examples. 115 | """ 116 | 117 | config_class = CamembertConfig 118 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 119 | -------------------------------------------------------------------------------- /src/transformers/modeling_tf_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ TF 2.0 XLM-RoBERTa model. """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_xlm_roberta import XLMRobertaConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_tf_roberta import ( 24 | TFRobertaForMaskedLM, 25 | TFRobertaForSequenceClassification, 26 | TFRobertaForTokenClassification, 27 | TFRobertaModel, 28 | ) 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {} 34 | 35 | 36 | XLM_ROBERTA_START_DOCSTRING = r""" 37 | 38 | .. note:: 39 | 40 | TF 2.0 models accepts two formats as inputs: 41 | 42 | - having all inputs as keyword arguments (like PyTorch models), or 43 | - having all inputs as a list, tuple or dict in the first positional arguments. 44 | 45 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having 46 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 
47 | 48 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors 49 | in the first positional argument : 50 | 51 | - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` 52 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: 53 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` 54 | - a dictionary with one or several input Tensors associated to the input names given in the docstring: 55 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` 56 | 57 | Parameters: 58 | config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the 59 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 60 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 61 | """ 62 | 63 | 64 | @add_start_docstrings( 65 | "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", 66 | XLM_ROBERTA_START_DOCSTRING, 67 | ) 68 | class TFXLMRobertaModel(TFRobertaModel): 69 | """ 70 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the 71 | superclass for the appropriate documentation alongside usage examples. 72 | """ 73 | 74 | config_class = XLMRobertaConfig 75 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 76 | 77 | 78 | @add_start_docstrings( 79 | """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, 80 | ) 81 | class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM): 82 | """ 83 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the 84 | superclass for the appropriate documentation alongside usage examples. 85 | """ 86 | 87 | config_class = XLMRobertaConfig 88 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 89 | 90 | 91 | @add_start_docstrings( 92 | """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 93 | on top of the pooled output) e.g. for GLUE tasks. """, 94 | XLM_ROBERTA_START_DOCSTRING, 95 | ) 96 | class TFXLMRobertaForSequenceClassification(TFRobertaForSequenceClassification): 97 | """ 98 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the 99 | superclass for the appropriate documentation alongside usage examples. 100 | """ 101 | 102 | config_class = XLMRobertaConfig 103 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 104 | 105 | 106 | @add_start_docstrings( 107 | """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of 108 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 109 | XLM_ROBERTA_START_DOCSTRING, 110 | ) 111 | class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification): 112 | """ 113 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the 114 | superclass for the appropriate documentation alongside usage examples. 
115 | """ 116 | 117 | config_class = XLMRobertaConfig 118 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 119 | -------------------------------------------------------------------------------- /src/transformers/tokenization_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_roberta import RobertaTokenizer 17 | 18 | 19 | # vocab and merges same as roberta 20 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" 21 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" 22 | _all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"] 23 | 24 | 25 | class BartTokenizer(RobertaTokenizer): 26 | # merges and vocab same as Roberta 27 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 28 | pretrained_vocab_files_map = { 29 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 30 | "merges_file": {m: merges_url for m in _all_bart_models}, 31 | } 32 | -------------------------------------------------------------------------------- /src/transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
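# A short, illustrative usage sketch for the DistilBERT tokenizer declared below.
# The checkpoint name is one of the entries in PRETRAINED_VOCAB_FILES_MAP; the
# sentence is an arbitrary example.
#
#     from transformers import DistilBertTokenizer
#
#     tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
#     tokens = tokenizer.tokenize("Hello, cross-lingual world!")                    # wordpiece tokens
#     ids = tokenizer.encode("Hello, cross-lingual world!", add_special_tokens=True)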
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | 18 | import logging 19 | 20 | from .tokenization_bert import BertTokenizer, BertTokenizerFast 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 30 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 31 | "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 32 | "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 33 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", 34 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 35 | } 36 | } 37 | 38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 39 | "distilbert-base-uncased": 512, 40 | "distilbert-base-uncased-distilled-squad": 512, 41 | "distilbert-base-cased": 512, 42 | "distilbert-base-cased-distilled-squad": 512, 43 | "distilbert-base-german-cased": 512, 44 | "distilbert-base-multilingual-cased": 512, 45 | } 46 | 47 | 48 | PRETRAINED_INIT_CONFIGURATION = { 49 | "distilbert-base-uncased": {"do_lower_case": True}, 50 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, 51 | "distilbert-base-cased": {"do_lower_case": False}, 52 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, 53 | "distilbert-base-german-cased": {"do_lower_case": False}, 54 | "distilbert-base-multilingual-cased": {"do_lower_case": False}, 55 | } 56 | 57 | 58 | class DistilBertTokenizer(BertTokenizer): 59 | r""" 60 | Constructs a DistilBertTokenizer. 61 | :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 62 | tokenization: punctuation splitting + wordpiece. 63 | 64 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 65 | parameters. 66 | """ 67 | 68 | vocab_files_names = VOCAB_FILES_NAMES 69 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 70 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 71 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 72 | 73 | 74 | class DistilBertTokenizerFast(BertTokenizerFast): 75 | vocab_files_names = VOCAB_FILES_NAMES 76 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 77 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 78 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 79 | -------------------------------------------------------------------------------- /src/transformers/utils_encoder_decoder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Classes to support Encoder-Decoder architectures """ 16 | 17 | 18 | def prepare_encoder_decoder_model_kwargs(**kwargs): 19 | """ Prepare the encoder and decoder's keyword arguments. 20 | 21 | Keyword arguments come in 3 flavors: 22 | - encoder-specific (prefixed by `encoder_`) 23 | - decoder-specific (prefixed by `decoder_`) 24 | - those that apply to the model as whole. 25 | 26 | We let the specific kwargs override the common ones in case of 27 | conflict. 28 | """ 29 | 30 | kwargs_common = { 31 | argument: value 32 | for argument, value in kwargs.items() 33 | if not argument.startswith("encoder_") and not argument.startswith("decoder_") 34 | } 35 | if "input_ids" in kwargs_common: 36 | kwargs["encoder_input_ids"] = kwargs_common.pop("input_ids") 37 | 38 | decoder_kwargs = kwargs_common.copy() 39 | encoder_kwargs = kwargs_common.copy() 40 | encoder_kwargs.update( 41 | {argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")} 42 | ) 43 | decoder_kwargs.update( 44 | {argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")} 45 | ) 46 | decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None) 47 | return encoder_kwargs, decoder_kwargs 48 | -------------------------------------------------------------------------------- /src/ud-conversion-tools/conllu_to_conll.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import islice 3 | from pathlib import Path 4 | import argparse 5 | import sys, copy 6 | 7 | from lib.conll import CoNLLReader 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser(description="""Convert conllu to conll format""") 11 | parser.add_argument('input', help="conllu file") 12 | parser.add_argument('output', help="target file", type=Path) 13 | parser.add_argument('--replace_subtokens_with_fused_forms', help="By default removes fused tokens", default=False, action="store_true") 14 | parser.add_argument('--remove_deprel_suffixes', help="Restrict deprels to the common universal subset, e.g. 
nmod:tmod becomes nmod", default=False, action="store_true") 15 | parser.add_argument('--remove_node_properties', help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats", choices=['form', 'lemma', 'cpostag','postag','feats'], metavar='prop', type=str, nargs='+') 16 | parser.add_argument('--lang', help="specify a language 2-letter code", default="default") 17 | parser.add_argument('--output_format', choices=['conll2006', 'conll2009', 'conllu'], default="conll2006") 18 | parser.add_argument('--remove_arabic_diacritics', help="remove Arabic short vowels", default=False, action="store_true") 19 | parser.add_argument('--print_comments',default=False,action="store_true") 20 | parser.add_argument('--print_fused_forms',default=False,action="store_true") 21 | 22 | args = parser.parse_args() 23 | 24 | if sys.version_info < (3,0): 25 | print("Sorry, requires Python 3.x.") #suggestion: install anaconda python 26 | sys.exit(1) 27 | 28 | POSRANKPRECEDENCEDICT = defaultdict(list) 29 | POSRANKPRECEDENCEDICT["default"] = "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CCONJ SCONJ X PUNCT ".split(" ") 30 | # POSRANKPRECEDENCEDICT["de"] = "PROPN ADP DET ".split(" ") 31 | POSRANKPRECEDENCEDICT["es"] = "VERB AUX PRON ADP DET".split(" ") 32 | POSRANKPRECEDENCEDICT["fr"] = "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(" ") 33 | POSRANKPRECEDENCEDICT["it"] = "VERB AUX ADV PRON ADP DET INTJ".split(" ") 34 | 35 | if args.lang in POSRANKPRECEDENCEDICT: 36 | current_pos_precedence_list = POSRANKPRECEDENCEDICT[args.lang] 37 | else: 38 | current_pos_precedence_list = POSRANKPRECEDENCEDICT["default"] 39 | 40 | cio = CoNLLReader() 41 | orig_treebank = cio.read_conll_u(args.input)#, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT) 42 | modif_treebank = copy.copy(orig_treebank) 43 | 44 | # As per Dec 2015 the args.lang variable is redundant once you have current_pos_precedence_list 45 | # We keep it for future modifications, i.e. 
any language-specific modules 46 | for s in modif_treebank: 47 | # print('sentence', s.get_sentence_as_string(printid=True)) 48 | s.filter_sentence_content(args.replace_subtokens_with_fused_forms, args.lang, current_pos_precedence_list,args.remove_node_properties,args.remove_deprel_suffixes,args.remove_arabic_diacritics) 49 | 50 | cio.write_conll(modif_treebank,args.output, args.output_format,print_fused_forms=args.print_fused_forms, print_comments=args.print_comments) 51 | 52 | if __name__ == "__main__": 53 | main() -------------------------------------------------------------------------------- /src/ud-conversion-tools/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/ud-conversion-tools/lib/__init__.py -------------------------------------------------------------------------------- /transformers-cli: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.convert import ConvertCommand 5 | from transformers.commands.download import DownloadCommand 6 | from transformers.commands.env import EnvironmentCommand 7 | from transformers.commands.run import RunCommand 8 | from transformers.commands.serving import ServeCommand 9 | from transformers.commands.user import UserCommands 10 | 11 | if __name__ == '__main__': 12 | parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]') 13 | commands_parser = parser.add_subparsers(help='transformers-cli command helpers') 14 | 15 | # Register commands 16 | ConvertCommand.register_subcommand(commands_parser) 17 | DownloadCommand.register_subcommand(commands_parser) 18 | EnvironmentCommand.register_subcommand(commands_parser) 19 | RunCommand.register_subcommand(commands_parser) 20 | ServeCommand.register_subcommand(commands_parser) 21 | UserCommands.register_subcommand(commands_parser) 22 | 23 | # Let's go 24 | args = parser.parse_args() 25 | 26 | if not hasattr(args, 'func'): 27 | parser.print_help() 28 | exit(1) 29 | 30 | # Run 31 | service = args.func(args) 32 | service.run() 33 | --------------------------------------------------------------------------------
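The conversion scripts above all end by writing a plain PyTorch state dict (and, for several of them, a JSON config). As a minimal illustration, a converted BERT dump can be loaded back with the classes used in convert_bert_original_tf_checkpoint_to_pytorch.py; the file paths below are placeholders, not artifacts shipped with this repository.

import torch

from transformers import BertConfig, BertForPreTraining

# Rebuild the model skeleton from the config that was used during conversion (placeholder path).
config = BertConfig.from_json_file("/path/to/bert_config.json")
model = BertForPreTraining(config)

# Load the weights written by convert_tf_checkpoint_to_pytorch (placeholder path).
state_dict = torch.load("/path/to/pytorch_model.bin", map_location="cpu")
model.load_state_dict(state_dict)
model.eval()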