├── .idea ├── .gitignore ├── deployment.xml ├── encodings.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── remote-mappings.xml ├── sshConfigs.xml ├── vcs.xml ├── webServers.xml └── xTune.iml ├── README.md ├── scripts ├── cross-lingual-transfer │ ├── train_mlqa.sh │ ├── train_panx.sh │ ├── train_pawsx.sh │ ├── train_tydiqa.sh │ ├── train_udpos.sh │ ├── train_xnli.sh │ └── train_xquad.sh ├── download_data.sh ├── download_model.sh ├── preprocess_panx.sh ├── preprocess_udpos.sh ├── train.sh └── translate-train-all │ ├── train_mlqa.sh │ ├── train_panx.sh │ ├── train_pawsx.sh │ ├── train_tydiqa.sh │ ├── train_udpos.sh │ ├── train_xnli.sh │ └── train_xquad.sh ├── setup.py ├── src ├── pequod │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-37.pyc │ ├── data │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── sampler.cpython-37.pyc │ │ │ ├── utils_squad.cpython-37.pyc │ │ │ ├── utils_squad_evaluate.cpython-37.pyc │ │ │ ├── xdoc.cpython-37.pyc │ │ │ ├── xqa.cpython-37.pyc │ │ │ └── xretrieval.cpython-37.pyc │ │ ├── dataloader.py │ │ ├── sampler.py │ │ ├── utils_squad.py │ │ ├── utils_squad_evaluate.py │ │ ├── wili.py │ │ ├── xdoc.py │ │ ├── xqa.py │ │ └── xretrieval.py │ ├── eval │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── bretrieval.cpython-37.pyc │ │ │ ├── bucc_eval.cpython-37.pyc │ │ │ ├── evaluator.cpython-37.pyc │ │ │ ├── utils_retrieve.cpython-37.pyc │ │ │ └── xretrieval.cpython-37.pyc │ │ ├── bretrieval.py │ │ ├── evaluator.py │ │ ├── utils_retrieve.py │ │ └── xretrieval.py │ ├── io.py │ ├── model │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── roberta.cpython-37.pyc │ │ └── roberta.py │ ├── optim │ │ ├── __init__.py │ │ ├── la.py │ │ └── la0.py │ ├── text │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── tokenization_sentencepiece.cpython-37.pyc │ │ └── tokenization_sentencepiece.py │ ├── tools │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── convert.cpython-37.pyc │ │ └── convert.py │ └── training │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── trainer.cpython-37.pyc │ │ ├── trainer.py │ │ └── xtrainer.py ├── run_cls.py ├── run_qa.py ├── run_tag.py ├── tools │ ├── __init__.py │ ├── check_many2many_alignment.py │ ├── dump_hf_state_dict.py │ ├── get_eval_results.py │ ├── sample_xnli.py │ └── xnli_sampling_statistics.py ├── transformers │ ├── __init__.py │ ├── activations.py │ ├── commands │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── download.py │ │ ├── env.py │ │ ├── run.py │ │ ├── serving.py │ │ ├── train.py │ │ └── user.py │ ├── configuration_albert.py │ ├── configuration_auto.py │ ├── configuration_bart.py │ ├── configuration_bert.py │ ├── configuration_camembert.py │ ├── configuration_ctrl.py │ ├── configuration_distilbert.py │ ├── configuration_flaubert.py │ ├── configuration_gpt2.py │ ├── configuration_mmbt.py │ ├── configuration_openai.py │ ├── configuration_roberta.py │ ├── configuration_t5.py │ ├── configuration_transfo_xl.py │ ├── configuration_utils.py │ ├── configuration_xlm.py │ ├── configuration_xlm_roberta.py │ ├── configuration_xlnet.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ 
├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── convert_pytorch_checkpoint_to_tf2.py │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── data │ │ ├── __init__.py │ │ ├── metrics │ │ │ ├── __init__.py │ │ │ ├── evaluate_mlqa.py │ │ │ ├── evaluate_squad.py │ │ │ ├── mlqa_evaluation_v1.py │ │ │ └── squad_metrics.py │ │ └── processors │ │ │ ├── __init__.py │ │ │ ├── glue.py │ │ │ ├── squad.py │ │ │ ├── utils.py │ │ │ ├── xglue.py │ │ │ ├── xnli.py │ │ │ └── xtreme.py │ ├── file_utils.py │ ├── hf_api.py │ ├── modelcard.py │ ├── modeling_albert.py │ ├── modeling_auto.py │ ├── modeling_bart.py │ ├── modeling_bert.py │ ├── modeling_camembert.py │ ├── modeling_ctrl.py │ ├── modeling_distilbert.py │ ├── modeling_encoder_decoder.py │ ├── modeling_flaubert.py │ ├── modeling_gpt2.py │ ├── modeling_mmbt.py │ ├── modeling_openai.py │ ├── modeling_roberta.py │ ├── modeling_t5.py │ ├── modeling_tf_albert.py │ ├── modeling_tf_auto.py │ ├── modeling_tf_bert.py │ ├── modeling_tf_camembert.py │ ├── modeling_tf_ctrl.py │ ├── modeling_tf_distilbert.py │ ├── modeling_tf_gpt2.py │ ├── modeling_tf_openai.py │ ├── modeling_tf_pytorch_utils.py │ ├── modeling_tf_roberta.py │ ├── modeling_tf_t5.py │ ├── modeling_tf_transfo_xl.py │ ├── modeling_tf_transfo_xl_utilities.py │ ├── modeling_tf_utils.py │ ├── modeling_tf_xlm.py │ ├── modeling_tf_xlm_roberta.py │ ├── modeling_tf_xlnet.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── modeling_utils.py │ ├── modeling_xlm.py │ ├── modeling_xlm_roberta.py │ ├── modeling_xlnet.py │ ├── optimization.py │ ├── optimization_tf.py │ ├── pipelines.py │ ├── tokenization_albert.py │ ├── tokenization_auto.py │ ├── tokenization_bart.py │ ├── tokenization_bert.py │ ├── tokenization_bert_japanese.py │ ├── tokenization_camembert.py │ ├── tokenization_ctrl.py │ ├── tokenization_distilbert.py │ ├── tokenization_flaubert.py │ ├── tokenization_gpt2.py │ ├── tokenization_openai.py │ ├── tokenization_roberta.py │ ├── tokenization_t5.py │ ├── tokenization_transfo_xl.py │ ├── tokenization_utils.py │ ├── tokenization_xlm.py │ ├── tokenization_xlm_roberta.py │ ├── tokenization_xlnet.py │ └── utils_encoder_decoder.py ├── ud-conversion-tools │ ├── conllu_to_conll.py │ └── lib │ │ ├── __init__.py │ │ └── conll.py └── utils_tag.py ├── transformers-cli └── utils_preprocess.py /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /../../../../../../:\Users\v-zhebo\OneDrive - Microsoft\stabletune\.idea/dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml, /.idea/misc.xml, /.idea/modules.xml, /.idea/remote-mappings.xml, /.idea/sshConfigs.xml, /.idea/vcs.xml, /.idea/webServers.xml, /.idea/xTune.iml: (IDE project configuration files; their XML content was not preserved in this dump)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # xTune
2 |
3 | Code for the ACL 2021 paper [Consistency Regularization for Cross-Lingual Fine-Tuning](https://arxiv.org/pdf/2106.08226.pdf).
4 | ## Environment
5 |
6 | Docker image: `dancingsoul/pytorch:xTune`
7 |
8 | Install the fine-tuning code: `pip install --user .`
9 |
10 | ## Data & Model Preparation
11 |
12 | ### XTREME Datasets
13 |
14 | 1) Create a download folder with `mkdir -p download` in the root of this project.
15 | 2) Manually download `panx_dataset` (for NER) from [here][2] (note that it downloads as `AmazonPhotos.zip`) to the download directory.
16 | 3) Run the following command to download the remaining datasets: `bash scripts/download_data.sh`
17 | The download code for the XTREME datasets is taken from the [XTREME official repo][1].
18 |
19 | Note that we keep the labels in the test sets for easier evaluation. To prevent accidental evaluation on the test sets while running experiments, the [XTREME official repo][1] removes the test labels during pre-processing and changes the order of the test sentences for cross-lingual sentence retrieval.
20 | If you use the XTREME official repo, replace `csv.writer(fout, delimiter='\t')` with `csv.writer(fout, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='')` in utils_preprocess.py (see the snippet below).
21 |
22 | ### Translations
23 |
24 | XTREME provides translations for SQuAD v1.1 (train and dev only), MLQA, PAWS-X, TyDiQA-GoldP, XNLI, and XQuAD, which can be downloaded from [here][3]. The `xtreme_translations` folder should be moved to the download directory.
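For reference, here is a minimal illustration of the quoting change mentioned in the note above. The file name and example row are hypothetical; only the `csv.writer(...)` arguments come from the note itself.

```python
import csv

# Hypothetical stand-in for the test-set write loop in utils_preprocess.py.
# QUOTE_NONE with an empty quotechar writes the tab-separated fields verbatim,
# so the kept test-set labels are not altered by quoting.
rows = [["This is a premise .", "This is a hypothesis .", "neutral"]]
with open("test-en.tsv", "w", encoding="utf-8", newline="") as fout:
    writer = csv.writer(fout, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
    for row in rows:
        writer.writerow(row)
```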
25 |
26 | The target-language translations for panx and udpos are obtained with Google Translate, since they are not provided by XTREME. Our processed version can be downloaded from [here][4]. It should be merged with the `xtreme_translations` folder above.
27 |
28 | ### Bi-lingual dictionaries
29 |
30 | We obtain the bi-lingual dictionaries from the [MUSE][6] repo. For convenience, you can download them from [here][7] and move them to the download directory, i.e., `./download/dicts`.
31 |
32 | ### Models
33 |
34 | XLM-RoBERTa is supported. We use the [huggingface][5] format, which can be downloaded with `bash scripts/download_model.sh`.
35 |
36 | ## Fine-tuning Usage
37 |
38 | Our default settings use Nvidia V100-32GB GPU cards. If you run into out-of-memory errors, reduce `per_gpu_train_batch_size` while increasing `gradient_accumulation_steps`, or use multi-GPU training.
39 |
40 | xTune consists of a two-stage training process:
41 | - Stage 1: fine-tuning with example consistency on the English training set.
42 | - Stage 2: fine-tuning with example consistency on the augmented training set, while regularizing model consistency with the model from Stage 1.
43 |
44 | We recommend using both Stage 1 and Stage 2 for token-level tasks such as sequence labeling and question answering. For text classification, you can use only Stage 1 if the computation budget is limited.
45 |
46 | ```bash
47 | bash ./scripts/train.sh [setting] [dataset] [model] [stage] [gpu] [data_dir] [output_dir]
48 | ```
49 | where the options are described as follows:
50 | - `[setting]`: `translate-train-all` (use translated training data for the languages other than English) or `cross-lingual-transfer` (use only English training data for zero-shot cross-lingual transfer)
51 | - `[dataset]`: dataset names in XTREME, i.e., `xnli`, `panx`, `pawsx`, `udpos`, `mlqa`, `tydiqa`, `xquad`
52 | - `[model]`: `xlm-roberta-base`, `xlm-roberta-large`
53 | - `[stage]`: `1` (first stage), `2` (second stage)
54 | - `[gpu]`: used to set the environment variable `CUDA_VISIBLE_DEVICES`
55 | - `[data_dir]`: folder of the training data
56 | - `[output_dir]`: folder of the fine-tuning output
57 |
58 | ## Examples: XTREME Tasks
59 |
60 | ### XNLI fine-tuning on the English training set and translated training sets (`translate-train-all`)
61 |
62 | ```bash
63 | # run stage 1 of xTune
64 | bash ./scripts/train.sh translate-train-all xnli xlm-roberta-base 1
65 | # run stage 2 of xTune (optional)
66 | bash ./scripts/train.sh translate-train-all xnli xlm-roberta-base 2
67 | ```
68 |
69 | ### XNLI fine-tuning on the English training set (`cross-lingual-transfer`)
70 |
71 | ```bash
72 | # run stage 1 of xTune
73 | bash ./scripts/train.sh cross-lingual-transfer xnli xlm-roberta-base 1
74 | # run stage 2 of xTune (optional)
75 | bash ./scripts/train.sh cross-lingual-transfer xnli xlm-roberta-base 2
76 | ```
77 |
78 | ## Paper
79 | Please cite our paper `\cite{bo2021xtune}` if you find the resources in this repository useful.
80 |
81 | ```
82 | @inproceedings{bo2021xtune,
83 | author = {Bo Zheng and Li Dong and Shaohan Huang and Wenhui Wang and Zewen Chi and Saksham Singhal and Wanxiang Che and Ting Liu and Xia Song and Furu Wei},
84 | booktitle = {Proceedings of ACL 2021},
85 | title = {{Consistency Regularization for Cross-Lingual Fine-Tuning}},
86 | year = {2021}
87 | }
88 | ```
89 |
90 | ## Reference
91 |
92 | 1. https://github.com/google-research/xtreme
93 | 2.
https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN?_encoding=UTF8&%2AVersion%2A=1&%2Aentries%2A=0&mgh=1 94 | 3. https://console.cloud.google.com/storage/browser/xtreme_translations 95 | 4. https://drive.google.com/drive/folders/1Rdbc0Us_4I5MpRCwLASxBwqSW8_dlF87?usp=sharing 96 | 5. https://github.com/huggingface/transformers/ 97 | 6. https://github.com/facebookresearch/MUSE 98 | 7. https://drive.google.com/drive/folders/1k9rQinwUXicglA5oyzo9xtgqiuUVDkjT?usp=sharing 99 | -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_mlqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | 27 | cp -r $DATA_DIR/squad/ $DATA_DIR/mlqa/squad1.1/ 28 | 29 | TASK='mlqa' 30 | 31 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/ 32 | MODEL_PATH=$DATA_DIR/$MODEL 33 | 34 | EPOCH=4 35 | MAXL=384 36 | LANGS="en,es,de,ar,hi,vi,zh" 37 | BSR=0.3 38 | SA=0.3 39 | SNBS=-1 40 | CSR=0.3 41 | R1_LAMBDA=5.0 42 | R2_LAMBDA=5.0 43 | if [ $MODEL == "xlm-roberta-large" ]; then 44 | BATCH_SIZE=4 45 | GRAD_ACC=8 46 | LR=1.5e-5 47 | else 48 | BATCH_SIZE=32 49 | GRAD_ACC=1 50 | LR=3e-5 51 | fi 52 | 53 | if [ $STAGE == 1 ]; then 54 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 55 | python ./src/run_qa.py --model_type xlmr \ 56 | --task_name $TASK \ 57 | --model_name_or_path $MODEL_PATH \ 58 | --do_train \ 59 | --do_eval \ 60 | --language $LANGS \ 61 | --train_language en \ 62 | --data_dir $DATA_DIR/$TASK/ \ 63 | --per_gpu_train_batch_size $BATCH_SIZE \ 64 | --gradient_accumulation_steps $GRAD_ACC \ 65 | --per_gpu_eval_batch_size 128 \ 66 | --learning_rate $LR \ 67 | --num_train_epochs $EPOCH \ 68 | --save_steps 0 \ 69 | --logging_each_epoch \ 70 | --max_seq_length $MAXL \ 71 | --doc_stride 128 \ 72 | --output_dir $OUTPUT_DIR \ 73 | --overwrite_output_dir \ 74 | --evaluate_during_training \ 75 | --logging_steps 50 \ 76 | --evaluate_steps 0 \ 77 | --seed $SEED \ 78 | --fp16 --fp16_opt_level O2 \ 79 | --warmup_steps -1 \ 80 | --enable_r1_loss \ 81 | --r1_lambda $R1_LAMBDA \ 82 | --original_loss \ 83 | --overall_ratio 1.0 \ 84 | --keep_boundary_unchanged \ 85 | --enable_code_switch \ 86 | --code_switch_ratio $CSR \ 87 | --dict_dir $DATA_DIR/dicts \ 88 | --dict_languages es,de,ar,hi,vi,zh \ 89 | --noised_max_seq_length $MAXL 90 | elif [ $STAGE == 2 ]; then 91 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 92 | 
OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/" 93 | python ./src/run_qa.py --model_type xlmr \ 94 | --task_name $TASK \ 95 | --model_name_or_path $MODEL_PATH \ 96 | --do_train \ 97 | --do_eval \ 98 | --language $LANGS \ 99 | --train_language en \ 100 | --data_dir $DATA_DIR/$TASK/ \ 101 | --per_gpu_train_batch_size $BATCH_SIZE \ 102 | --gradient_accumulation_steps $GRAD_ACC \ 103 | --per_gpu_eval_batch_size 128 \ 104 | --learning_rate $LR \ 105 | --num_train_epochs $EPOCH \ 106 | --save_steps 0 \ 107 | --logging_each_epoch \ 108 | --max_seq_length $MAXL \ 109 | --doc_stride 128 \ 110 | --output_dir $OUTPUT_DIR \ 111 | --overwrite_output_dir \ 112 | --evaluate_during_training \ 113 | --logging_steps 50 \ 114 | --evaluate_steps 0 \ 115 | --seed $SEED \ 116 | --fp16 --fp16_opt_level O2 \ 117 | --warmup_steps -1 \ 118 | --enable_r1_loss \ 119 | --r1_lambda $R1_LAMBDA \ 120 | --original_loss \ 121 | --overall_ratio 1.0 \ 122 | --keep_boundary_unchanged \ 123 | --enable_bpe_sampling \ 124 | --bpe_sampling_ratio $BSR \ 125 | --sampling_alpha $SA \ 126 | --sampling_nbest_size $SNBS \ 127 | --noised_max_seq_length $MAXL \ 128 | --enable_data_augmentation \ 129 | --augment_ratio 1.0 \ 130 | --augment_method ss \ 131 | --max_steps 24000 \ 132 | --r2_lambda $R2_LAMBDA \ 133 | --first_stage_model_path $FIRST_MODEL_PATH 134 | fi 135 | 136 | 137 | -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_panx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='panx' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | EPOCH=10 29 | MAX_LENGTH=128 30 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu" 31 | EVALUATE_STEPS=1000 32 | BSR=0.3 33 | SA=0.3 34 | SNBS=-1 35 | R1_LAMBDA=5.0 36 | R2_LAMBDA=5.0 37 | if [ $MODEL == "xlm-roberta-large" ]; then 38 | BATCH_SIZE=32 39 | GRAD_ACC=1 40 | LR=7e-6 41 | else 42 | BATCH_SIZE=32 43 | GRAD_ACC=1 44 | LR=1e-5 45 | fi 46 | 47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.panx.txt 48 | 49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/ 50 | 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python src/run_tag.py --model_type xlmr \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --do_predict \ 59 | --do_predict_dev \ 60 | --predict_langs $LANGS \ 61 | --train_langs en \ 62 | --data_dir $DATA_DIR \ 63 | --labels $DATA_DIR/labels.txt \ 64 | --per_gpu_train_batch_size $BATCH_SIZE \ 65 | --gradient_accumulation_steps $GRAD_ACC \ 66 | --per_gpu_eval_batch_size 128 \ 67 | --learning_rate $LR \ 68 | --num_train_epochs $EPOCH \ 69 | --max_seq_length $MAX_LENGTH \ 70 | --noised_max_seq_length $MAX_LENGTH \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps $EVALUATE_STEPS \ 76 | --seed $SEED \ 77 | --warmup_steps -1 \ 78 | --save_only_best_checkpoint \ 79 | --eval_all_checkpoints \ 80 | --eval_patience -1 \ 81 | --fp16 --fp16_opt_level O2 \ 82 | --hidden_dropout_prob 0.1 \ 83 | --original_loss \ 84 | --enable_r1_loss \ 85 | --r1_lambda $R1_LAMBDA \ 86 | --use_token_label_probs \ 87 | --enable_bpe_sampling \ 88 | --bpe_sampling_ratio $BSR \ 89 | --sampling_alpha $SA \ 90 | --sampling_nbest_size $SNBS 91 | elif [ $STAGE == 2 ]; then 92 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 93 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/" 94 | python src/run_tag.py --model_type xlmr \ 95 | --model_name_or_path $MODEL_PATH \ 96 | --do_train \ 97 | --do_eval \ 98 | --do_predict \ 99 | --do_predict_dev \ 100 | --predict_langs $LANGS \ 101 | --train_langs en \ 102 | --data_dir $DATA_DIR \ 103 | --labels $DATA_DIR/labels.txt \ 104 | --per_gpu_train_batch_size $BATCH_SIZE \ 105 | --gradient_accumulation_steps $GRAD_ACC \ 106 | --per_gpu_eval_batch_size 128 \ 107 | --learning_rate $LR \ 108 | --num_train_epochs $EPOCH \ 109 | --max_seq_length $MAX_LENGTH \ 110 | --noised_max_seq_length $MAX_LENGTH \ 111 | --output_dir $OUTPUT_DIR \ 112 | --overwrite_output_dir \ 113 | --evaluate_during_training \ 114 | --logging_steps 50 \ 115 | --evaluate_steps $EVALUATE_STEPS \ 116 | --seed $SEED \ 117 | --warmup_steps -1 \ 118 | --save_only_best_checkpoint \ 119 | --eval_all_checkpoints \ 120 | --eval_patience -1 \ 121 | --fp16 --fp16_opt_level O2 \ 122 | --hidden_dropout_prob 0.1 \ 123 | --original_loss \ 124 | --enable_r1_loss \ 
125 | --r1_lambda $R1_LAMBDA \ 126 | --use_token_label_probs \ 127 | --enable_bpe_sampling \ 128 | --bpe_sampling_ratio $BSR \ 129 | --sampling_alpha $SA \ 130 | --sampling_nbest_size $SNBS \ 131 | --enable_data_augmentation \ 132 | --augment_ratio 1.0 \ 133 | --augment_method ss \ 134 | --r2_lambda $R2_LAMBDA \ 135 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 136 | --use_hard_labels 137 | fi -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_pawsx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='pawsx' 27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/PAWSX/ 28 | MODEL_PATH=$DATA_DIR/$MODEL 29 | EPOCH=10 30 | MAXL=256 31 | LANGS="de,en,es,fr,ja,ko,zh" 32 | EVALUATE_STEPS=1000 33 | CSR=0.5 34 | R1_LAMBDA=5.0 35 | R2_LAMBDA=2.0 36 | if [ $MODEL == "xlm-roberta-large" ]; then 37 | BATCH_SIZE=16 38 | GRAD_ACC=2 39 | LR=1e-5 40 | else 41 | BATCH_SIZE=32 42 | GRAD_ACC=1 43 | LR=1e-5 44 | fi 45 | 46 | if [ $STAGE == 1 ]; then 47 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 48 | mkdir -p $OUTPUT_DIR 49 | python ./src/run_cls.py --model_type xlmr \ 50 | --model_name_or_path $MODEL_PATH \ 51 | --language $LANGS \ 52 | --train_language en \ 53 | --do_train \ 54 | --data_dir $DATA_DIR/$TASK/ \ 55 | --per_gpu_train_batch_size $BATCH_SIZE \ 56 | --gradient_accumulation_steps $GRAD_ACC \ 57 | --per_gpu_eval_batch_size 64 \ 58 | --learning_rate $LR \ 59 | --num_train_epochs $EPOCH \ 60 | --max_seq_length $MAXL \ 61 | --output_dir $OUTPUT_DIR \ 62 | --task_name $TASK \ 63 | --save_steps -1 \ 64 | --overwrite_output_dir \ 65 | --evaluate_during_training \ 66 | --evaluate_steps $EVALUATE_STEPS \ 67 | --logging_steps 50 \ 68 | --logging_steps_in_sample -1 \ 69 | --logging_each_epoch \ 70 | --gpu_id 0 \ 71 | --seed $SEED \ 72 | --fp16 --fp16_opt_level O2 \ 73 | --warmup_steps -1 \ 74 | --enable_r1_loss \ 75 | --r1_lambda $R1_LAMBDA \ 76 | --original_loss \ 77 | --overall_ratio 1.0 \ 78 | --enable_code_switch \ 79 | --code_switch_ratio $CSR \ 80 | --dict_dir $DATA_DIR/dicts \ 81 | --dict_languages de,es,fr,ja,ko,zh 82 | elif [ $STAGE == 2 ]; then 83 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 84 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_Lambda${R1_LAMBDA}-Aug1.0-CS-R2_Lambda${R2_LAMBDA}/" 85 | mkdir -p $OUTPUT_DIR 86 | python ./src/run_cls.py --model_type xlmr \ 87 | --model_name_or_path $MODEL_PATH \ 88 
| --language $LANGS \ 89 | --train_language en \ 90 | --do_train \ 91 | --data_dir $DATA_DIR/$TASK/ \ 92 | --per_gpu_train_batch_size $BATCH_SIZE \ 93 | --gradient_accumulation_steps $GRAD_ACC \ 94 | --per_gpu_eval_batch_size 64 \ 95 | --learning_rate $LR \ 96 | --num_train_epochs $EPOCH \ 97 | --max_seq_length $MAXL \ 98 | --output_dir $OUTPUT_DIR \ 99 | --task_name $TASK \ 100 | --save_steps -1 \ 101 | --overwrite_output_dir \ 102 | --evaluate_during_training \ 103 | --evaluate_steps $EVALUATE_STEPS \ 104 | --logging_steps 50 \ 105 | --logging_steps_in_sample -1 \ 106 | --logging_each_epoch \ 107 | --gpu_id 0 \ 108 | --seed $SEED \ 109 | --fp16 --fp16_opt_level O2 \ 110 | --warmup_steps -1 \ 111 | --enable_r1_loss \ 112 | --r1_lambda $R1_LAMBDA \ 113 | --original_loss \ 114 | --overall_ratio 1.0 \ 115 | --enable_code_switch \ 116 | --code_switch_ratio $CSR \ 117 | --dict_dir $DATA_DIR/dicts \ 118 | --dict_languages de,es,fr,ja,ko,zh \ 119 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 120 | --enable_data_augmentation \ 121 | --augment_ratio 1.0 \ 122 | --augment_method cs \ 123 | --r2_lambda $R2_LAMBDA 124 | fi -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_tydiqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='tydiqa' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/TyDiQA-GoldP/translate-train/ 29 | 30 | 31 | MAXL=384 32 | LANGS="en,ar,bn,fi,id,ko,ru,sw,te" 33 | BSR=0.3 34 | SA=0.3 35 | SNBS=-1 36 | R1_LAMBDA=5.0 37 | R2_LAMBDA=5.0 38 | if [ $MODEL == "xlm-roberta-large" ]; then 39 | BATCH_SIZE=4 40 | GRAD_ACC=8 41 | LR=1.5e-5 42 | EPOCH=10 43 | MAX_STEPS=2500 44 | else 45 | BATCH_SIZE=32 46 | GRAD_ACC=1 47 | LR=3e-5 48 | EPOCH=20 49 | MAX_STEPS=5000 50 | fi 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python ./src/run_qa.py --model_type xlmr \ 55 | --task_name $TASK \ 56 | --model_name_or_path $MODEL_PATH \ 57 | --do_train \ 58 | --do_eval \ 59 | --language $LANGS \ 60 | --train_language en \ 61 | --data_dir $DATA_DIR/$TASK/ \ 62 | --per_gpu_train_batch_size $BATCH_SIZE \ 63 | --gradient_accumulation_steps $GRAD_ACC \ 64 | --per_gpu_eval_batch_size 128 \ 65 | --learning_rate $LR \ 66 | --num_train_epochs $EPOCH \ 67 | --save_steps 0 \ 68 | --logging_each_epoch \ 69 | --max_seq_length $MAXL \ 70 | --doc_stride 128 \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps 0 \ 76 | --seed $SEED \ 77 | --fp16 --fp16_opt_level O2 \ 78 | --warmup_steps -1 \ 79 | --enable_r1_loss \ 80 | --r1_lambda $R1_LAMBDA \ 81 | --original_loss \ 82 | --overall_ratio 1.0 \ 83 | --keep_boundary_unchanged \ 84 | --enable_bpe_sampling \ 85 | --bpe_sampling_ratio $BSR \ 86 | --sampling_alpha $SA \ 87 | --sampling_nbest_size $SNBS \ 88 | --noised_max_seq_length $MAXL 89 | elif [ $STAGE == 2 ]; then 90 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 91 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/" 92 | python ./src/run_qa.py --model_type xlmr \ 93 | --task_name $TASK \ 94 | --model_name_or_path $MODEL_PATH \ 95 | --do_train \ 96 | --do_eval \ 97 | --language $LANGS \ 98 | --train_language en \ 99 | --data_dir $DATA_DIR/$TASK/ \ 100 | --per_gpu_train_batch_size $BATCH_SIZE \ 101 | --gradient_accumulation_steps $GRAD_ACC \ 102 | --per_gpu_eval_batch_size 128 \ 103 | --learning_rate $LR \ 104 | --num_train_epochs $EPOCH \ 105 | --save_steps 0 \ 106 | --logging_each_epoch \ 107 | --max_seq_length $MAXL \ 108 | --doc_stride 128 \ 109 | --output_dir $OUTPUT_DIR \ 110 | --overwrite_output_dir \ 111 | --evaluate_during_training \ 112 | --logging_steps 50 \ 113 | --evaluate_steps 0 \ 114 | --seed $SEED \ 115 | --fp16 --fp16_opt_level O2 \ 116 | --warmup_steps -1 \ 117 | --enable_r1_loss \ 118 | --r1_lambda $R1_LAMBDA \ 119 | --original_loss \ 120 | --overall_ratio 1.0 \ 121 | --keep_boundary_unchanged \ 122 | --enable_bpe_sampling \ 123 | --bpe_sampling_ratio $BSR \ 124 | --sampling_alpha $SA \ 125 | --sampling_nbest_size $SNBS \ 126 | --noised_max_seq_length $MAXL \ 127 | --enable_data_augmentation \ 128 | --augment_ratio 1.0 \ 129 | --augment_method ss \ 130 | --max_steps $MAX_STEPS \ 131 | --r2_lambda $R2_LAMBDA \ 132 
| --first_stage_model_path $FIRST_STAGE_MODEL_PATH 133 | fi 134 | 135 | 136 | -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_udpos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='udpos' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | EPOCH=10 29 | MAX_LENGTH=128 30 | LANGS="af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh" 31 | EVALUATE_STEPS=500 32 | BSR=0.5 33 | SA=0.3 34 | SNBS=-1 35 | R1_LAMBDA=5.0 36 | R2_LAMBDA=0.3 37 | if [ $MODEL == "xlm-roberta-large" ]; then 38 | BATCH_SIZE=32 39 | GRAD_ACC=1 40 | LR=5e-6 41 | else 42 | BATCH_SIZE=32 43 | GRAD_ACC=1 44 | LR=2e-5 45 | fi 46 | 47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.udpos.txt 48 | 49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/ 50 | 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python src/run_tag.py --model_type xlmr \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --do_predict \ 59 | --do_predict_dev \ 60 | --predict_langs $LANGS \ 61 | --train_langs en \ 62 | --data_dir $DATA_DIR \ 63 | --labels $DATA_DIR/labels.txt \ 64 | --per_gpu_train_batch_size $BATCH_SIZE \ 65 | --gradient_accumulation_steps $GRAD_ACC \ 66 | --per_gpu_eval_batch_size 128 \ 67 | --learning_rate $LR \ 68 | --num_train_epochs $EPOCH \ 69 | --max_seq_length $MAX_LENGTH \ 70 | --noised_max_seq_length $MAX_LENGTH \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps $EVALUATE_STEPS \ 76 | --seed $SEED \ 77 | --warmup_steps -1 \ 78 | --save_only_best_checkpoint \ 79 | --eval_all_checkpoints \ 80 | --eval_patience -1 \ 81 | --fp16 --fp16_opt_level O2 \ 82 | --hidden_dropout_prob 0.1 \ 83 | --original_loss \ 84 | --use_pooling_strategy \ 85 | --enable_r1_loss \ 86 | --r1_lambda $R1_LAMBDA \ 87 | --use_token_label_probs \ 88 | --enable_bpe_sampling \ 89 | --bpe_sampling_ratio $BSR \ 90 | --sampling_alpha $SA \ 91 | --sampling_nbest_size $SNBS 92 | elif [ $STAGE == 2 ]; then 93 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 94 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/" 95 | python src/run_tag.py --model_type 
xlmr \ 96 | --model_name_or_path $MODEL_PATH \ 97 | --do_train \ 98 | --do_eval \ 99 | --do_predict \ 100 | --do_predict_dev \ 101 | --predict_langs $LANGS \ 102 | --train_langs en \ 103 | --data_dir $DATA_DIR \ 104 | --labels $DATA_DIR/labels.txt \ 105 | --per_gpu_train_batch_size $BATCH_SIZE \ 106 | --gradient_accumulation_steps $GRAD_ACC \ 107 | --per_gpu_eval_batch_size 128 \ 108 | --learning_rate $LR \ 109 | --num_train_epochs $EPOCH \ 110 | --max_seq_length $MAX_LENGTH \ 111 | --noised_max_seq_length $MAX_LENGTH \ 112 | --output_dir $OUTPUT_DIR \ 113 | --overwrite_output_dir \ 114 | --evaluate_during_training \ 115 | --logging_steps 50 \ 116 | --evaluate_steps $EVALUATE_STEPS \ 117 | --seed $SEED \ 118 | --warmup_steps -1 \ 119 | --save_only_best_checkpoint \ 120 | --eval_all_checkpoints \ 121 | --eval_patience -1 \ 122 | --fp16 --fp16_opt_level O2 \ 123 | --hidden_dropout_prob 0.1 \ 124 | --original_loss \ 125 | --use_pooling_strategy \ 126 | --enable_r1_loss \ 127 | --r1_lambda $R1_LAMBDA \ 128 | --use_token_label_probs \ 129 | --enable_bpe_sampling \ 130 | --bpe_sampling_ratio $BSR \ 131 | --sampling_alpha $SA \ 132 | --sampling_nbest_size $SNBS \ 133 | --enable_data_augmentation \ 134 | --augment_ratio 1.0 \ 135 | --augment_method ss \ 136 | --r2_lambda $R2_LAMBDA \ 137 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 138 | fi -------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_xnli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='xnli' 27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/XNLI/ 28 | MODEL_PATH=$DATA_DIR/$MODEL 29 | EPOCH=10 30 | MAXL=256 31 | LANGS="ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh" 32 | EVALUATE_STEPS=5000 33 | CSR=0.3 34 | R1_LAMBDA=5.0 35 | R2_LAMBDA=5.0 36 | if [ $MODEL == "xlm-roberta-large" ]; then 37 | BATCH_SIZE=16 38 | GRAD_ACC=2 39 | LR=5e-6 40 | else 41 | BATCH_SIZE=32 42 | GRAD_ACC=1 43 | LR=7e-6 44 | fi 45 | 46 | if [ $STAGE == 1 ]; then 47 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 48 | mkdir -p $OUTPUT_DIR 49 | python ./src/run_cls.py --model_type xlmr \ 50 | --model_name_or_path $MODEL_PATH \ 51 | --language $LANGS \ 52 | --train_language en \ 53 | --do_train \ 54 | --data_dir $DATA_DIR/$TASK/ \ 55 | --per_gpu_train_batch_size $BATCH_SIZE \ 56 | --gradient_accumulation_steps $GRAD_ACC \ 57 | --per_gpu_eval_batch_size 64 \ 58 | --learning_rate $LR \ 59 | --num_train_epochs $EPOCH \ 60 | --max_seq_length $MAXL \ 61 | --output_dir $OUTPUT_DIR \ 62 | --task_name $TASK \ 63 | --save_steps -1 \ 64 | --overwrite_output_dir \ 65 | --evaluate_during_training \ 66 | --evaluate_steps $EVALUATE_STEPS \ 67 | --logging_steps 50 \ 68 | --logging_steps_in_sample -1 \ 69 | --logging_each_epoch \ 70 | --gpu_id 0 \ 71 | --seed $SEED \ 72 | --fp16 --fp16_opt_level O2 \ 73 | --warmup_steps -1 \ 74 | --enable_r1_loss \ 75 | --r1_lambda $R1_LAMBDA \ 76 | --original_loss \ 77 | --overall_ratio 1.0 \ 78 | --enable_code_switch \ 79 | --code_switch_ratio $CSR \ 80 | --dict_dir $DATA_DIR/dicts \ 81 | --dict_languages ar,bg,de,el,es,fr,hi,ru,sw,th,tr,ur,vi,zh 82 | elif [ $STAGE == 2 ]; then 83 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 84 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_Lambda${R1_LAMBDA}-Aug1.0-CS-R2_Lambda${R2_LAMBDA}/" 85 | mkdir -p $OUTPUT_DIR 86 | python ./src/run_cls.py --model_type xlmr \ 87 | --model_name_or_path $MODEL_PATH \ 88 | --language $LANGS \ 89 | --train_language en \ 90 | --do_train \ 91 | --data_dir $DATA_DIR/$TASK/ \ 92 | --per_gpu_train_batch_size $BATCH_SIZE \ 93 | --gradient_accumulation_steps $GRAD_ACC \ 94 | --per_gpu_eval_batch_size 64 \ 95 | --learning_rate $LR \ 96 | --num_train_epochs $EPOCH \ 97 | --max_seq_length $MAXL \ 98 | --output_dir $OUTPUT_DIR \ 99 | --task_name $TASK \ 100 | --save_steps -1 \ 101 | --overwrite_output_dir \ 102 | --evaluate_during_training \ 103 | --evaluate_steps $EVALUATE_STEPS \ 104 | --logging_steps 50 \ 105 | --logging_steps_in_sample -1 \ 106 | --logging_each_epoch \ 107 | --gpu_id 0 \ 108 | --seed $SEED \ 109 | --fp16 --fp16_opt_level O2 \ 110 | --warmup_steps -1 \ 111 | --enable_r1_loss \ 112 | --r1_lambda $R1_LAMBDA \ 113 | --original_loss \ 114 | --overall_ratio 1.0 \ 115 | --enable_code_switch \ 116 | --code_switch_ratio $CSR \ 117 | --dict_dir $DATA_DIR/dicts \ 118 | --dict_languages ar,bg,de,el,es,fr,hi,ru,sw,th,tr,ur,vi,zh \ 119 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 120 | --enable_data_augmentation \ 121 | --augment_ratio 1.0 \ 122 | --augment_method cs \ 123 | --r2_lambda $R2_LAMBDA 124 | fi 
-------------------------------------------------------------------------------- /scripts/cross-lingual-transfer/train_xquad.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | cp -r $DATA_DIR/squad/ $DATA_DIR/xquad/squad1.1/ 27 | 28 | TASK='xquad' 29 | MODEL_PATH=$DATA_DIR/$MODEL 30 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/ 31 | 32 | EPOCH=4 33 | MAXL=384 34 | LANGS="ar,de,el,en,es,hi,ru,th,tr,vi,zh" 35 | BSR=0.3 36 | SA=0.3 37 | SNBS=-1 38 | CSR=0.3 39 | R1_LAMBDA=5.0 40 | R2_LAMBDA=5.0 41 | if [ $MODEL == "xlm-roberta-large" ]; then 42 | BATCH_SIZE=4 43 | GRAD_ACC=8 44 | LR=1.5e-5 45 | else 46 | BATCH_SIZE=32 47 | GRAD_ACC=1 48 | LR=3e-5 49 | fi 50 | 51 | if [ $STAGE == 1 ]; then 52 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 53 | python ./src/run_qa.py --model_type xlmr \ 54 | --task_name $TASK \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --language $LANGS \ 59 | --train_language en \ 60 | --data_dir $DATA_DIR/$TASK/ \ 61 | --per_gpu_train_batch_size $BATCH_SIZE \ 62 | --gradient_accumulation_steps $GRAD_ACC \ 63 | --per_gpu_eval_batch_size 128 \ 64 | --learning_rate $LR \ 65 | --num_train_epochs $EPOCH \ 66 | --save_steps 0 \ 67 | --logging_each_epoch \ 68 | --max_seq_length $MAXL \ 69 | --doc_stride 128 \ 70 | --output_dir $OUTPUT_DIR \ 71 | --overwrite_output_dir \ 72 | --evaluate_during_training \ 73 | --logging_steps 50 \ 74 | --evaluate_steps 0 \ 75 | --seed $SEED \ 76 | --fp16 --fp16_opt_level O2 \ 77 | --warmup_steps -1 \ 78 | --enable_r1_loss \ 79 | --r1_lambda $R1_LAMBDA \ 80 | --original_loss \ 81 | --overall_ratio 1.0 \ 82 | --keep_boundary_unchanged \ 83 | --enable_code_switch \ 84 | --code_switch_ratio $CSR \ 85 | --dict_dir $DATA_DIR/dicts \ 86 | --dict_languages ar,de,el,es,hi,ru,th,tr,vi,zh \ 87 | --noised_max_seq_length $MAXL 88 | elif [ $STAGE == 2 ]; then 89 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 90 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/" 91 | python ./src/run_qa.py --model_type xlmr \ 92 | --task_name $TASK \ 93 | --model_name_or_path $MODEL_PATH \ 94 | --do_train \ 95 | --do_eval \ 96 | --language $LANGS \ 97 | --train_language en \ 98 | --data_dir $DATA_DIR/$TASK/ \ 99 | --per_gpu_train_batch_size $BATCH_SIZE \ 100 | --gradient_accumulation_steps $GRAD_ACC \ 101 | --per_gpu_eval_batch_size 128 \ 102 | 
--learning_rate $LR \ 103 | --num_train_epochs $EPOCH \ 104 | --save_steps 0 \ 105 | --logging_each_epoch \ 106 | --max_seq_length $MAXL \ 107 | --doc_stride 128 \ 108 | --output_dir $OUTPUT_DIR \ 109 | --overwrite_output_dir \ 110 | --evaluate_during_training \ 111 | --logging_steps 50 \ 112 | --evaluate_steps 0 \ 113 | --seed $SEED \ 114 | --fp16 --fp16_opt_level O2 \ 115 | --warmup_steps -1 \ 116 | --enable_r1_loss \ 117 | --r1_lambda $R1_LAMBDA \ 118 | --original_loss \ 119 | --overall_ratio 1.0 \ 120 | --keep_boundary_unchanged \ 121 | --enable_bpe_sampling \ 122 | --bpe_sampling_ratio $BSR \ 123 | --sampling_alpha $SA \ 124 | --sampling_nbest_size $SNBS \ 125 | --noised_max_seq_length $MAXL \ 126 | --enable_data_augmentation \ 127 | --augment_ratio 1.0 \ 128 | --augment_method ss \ 129 | --max_steps 24000 \ 130 | --r2_lambda $R2_LAMBDA \ 131 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 132 | fi 133 | 134 | 135 | -------------------------------------------------------------------------------- /scripts/download_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | DIR=$REPO/download/ 18 | mkdir -p $DIR 19 | 20 | # download xlm-roberta-base 21 | function download_xlm-roberta-base { 22 | mkdir -p $DIR/xlm-roberta-base/ 23 | cd $DIR/xlm-roberta-base/ 24 | wget https://huggingface.co/xlm-roberta-base/resolve/main/pytorch_model.bin -q --show-progress 25 | wget https://huggingface.co/xlm-roberta-base/resolve/main/config.json -q --show-progress 26 | wget https://huggingface.co/xlm-roberta-base/resolve/main/sentencepiece.bpe.model -q --show-progress 27 | wget https://huggingface.co/xlm-roberta-base/resolve/main/tokenizer.json -q --show-progress 28 | echo "Successfully downloaded xlm-roberta-base at $DIR/xlm-roberta-base" >> $DIR/download_model.log 29 | } 30 | 31 | # download xlm-roberta-large 32 | function download_xlm-roberta-large { 33 | mkdir -p $DIR/xlm-roberta-large/ 34 | cd $DIR/xlm-roberta-large/ 35 | wget https://huggingface.co/xlm-roberta-large/resolve/main/pytorch_model.bin -q --show-progress 36 | wget https://huggingface.co/xlm-roberta-large/resolve/main/config.json -q --show-progress 37 | wget https://huggingface.co/xlm-roberta-large/resolve/main/sentencepiece.bpe.model -q --show-progress 38 | wget https://huggingface.co/xlm-roberta-large/resolve/main/tokenizer.json -q --show-progress 39 | echo "Successfully downloaded xlm-roberta-large at $DIR/xlm-roberta-large" >> $DIR/download_model.log 40 | } 41 | 42 | download_xlm-roberta-base 43 | download_xlm-roberta-large 44 | -------------------------------------------------------------------------------- /scripts/preprocess_panx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-bert-base-multilingual-cased} 18 | DATA_DIR=${2:-"$REPO/download/"} 19 | 20 | TASK='panx' 21 | MAXL=128 22 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu" 23 | LC="" 24 | if [ $MODEL == "bert-base-multilingual-cased" ]; then 25 | MODEL_TYPE="bert" 26 | elif [ $MODEL == "xlm-mlm-100-1280" ] || [ $MODEL == "xlm-mlm-tlm-xnli15-1024" ]; then 27 | MODEL_TYPE="xlm" 28 | LC=" --do_lower_case" 29 | elif [ $MODEL == "xlm-roberta-large" ] || [ $MODEL == "xlm-roberta-base" ]; then 30 | MODEL_TYPE="xlmr" 31 | fi 32 | SAVE_DIR="$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAXL}" 33 | mkdir -p $SAVE_DIR 34 | python3 $REPO/utils_preprocess.py \ 35 | --data_dir $DATA_DIR/$TASK/ \ 36 | --task panx_tokenize \ 37 | --model_name_or_path $MODEL \ 38 | --model_type $MODEL_TYPE \ 39 | --max_len $MAXL \ 40 | --output_dir $SAVE_DIR \ 41 | --languages $LANGS $LC >> $SAVE_DIR/preprocess.log 42 | if [ ! -f $SAVE_DIR/labels.txt ]; then 43 | cat $SAVE_DIR/*/*.${MODEL} | cut -f 2 | grep -v "^$" | sort | uniq > $SAVE_DIR/labels.txt 44 | fi 45 | -------------------------------------------------------------------------------- /scripts/preprocess_udpos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | REPO=$PWD 17 | MODEL=${1:-bert-base-multilingual-cased} 18 | DATA_DIR=${2:-"$REPO/download/"} 19 | 20 | TASK='udpos' 21 | MAXL=128 22 | LANGS='af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh' 23 | LC="" 24 | if [ $MODEL == "bert-base-multilingual-cased" ]; then 25 | MODEL_TYPE="bert" 26 | elif [ $MODEL == "xlm-mlm-100-1280" ] || [ $MODEL == "xlm-mlm-tlm-xnli15-1024" ]; then 27 | MODEL_TYPE="xlm" 28 | LC=" --do_lower_case" 29 | elif [ $MODEL == "xlm-roberta-large" ] || [ $MODEL == "xlm-roberta-base" ]; then 30 | MODEL_TYPE="xlmr" 31 | fi 32 | 33 | SAVE_DIR="$DATA_DIR/${TASK}/udpos_processed_maxlen${MAXL}" 34 | mkdir -p $SAVE_DIR 35 | python3 $REPO/utils_preprocess.py \ 36 | --data_dir $DATA_DIR/${TASK}/ \ 37 | --task udpos_tokenize \ 38 | --model_name_or_path $MODEL \ 39 | --model_type $MODEL_TYPE \ 40 | --max_len $MAXL \ 41 | --output_dir $SAVE_DIR \ 42 | --languages $LANGS $LC >> $SAVE_DIR/process.log 43 | if [ ! -f $SAVE_DIR/labels.txt ]; then 44 | echo "create label" 45 | cat $SAVE_DIR/*/*.${MODEL} | cut -f 2 | grep -v "^$" | sort | uniq > $SAVE_DIR/labels.txt 46 | fi 47 | -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | SETTING=${1:-cross-lingual-transfer} 18 | TASK=${2:-xnli} 19 | MODEL=${3:-"xlm-roberta-base"} 20 | STAGE=${4:-1} 21 | GPU=${5:-0} 22 | DATA_DIR=${6:-"$REPO/download/"} 23 | OUT_DIR=${7:-"$REPO/outputs/"} 24 | SEED=${8:-1} 25 | 26 | echo "Fine-tuning $MODEL on $TASK using GPU $GPU in STAGE $STAGE with SETTING $SETTING" 27 | echo "Load data from $DATA_DIR, and save models to $OUT_DIR" 28 | 29 | if [ $TASK == "udpos" ]; then 30 | bash $REPO/scripts/preprocess_udpos.sh $MODEL $DATA_DIR 31 | elif [ $TASK == "panx" ]; then 32 | bash $REPO/scripts/preprocess_panx.sh $MODEL $DATA_DIR 33 | fi 34 | 35 | bash $REPO/scripts/$SETTING/train_${TASK}.sh $MODEL $STAGE $GPU $DATA_DIR $OUT_DIR $SEED -------------------------------------------------------------------------------- /scripts/translate-train-all/train_mlqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | 27 | cp -r $DATA_DIR/squad/ $DATA_DIR/mlqa/squad1.1/ 28 | 29 | TASK='mlqa' 30 | 31 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/ 32 | MODEL_PATH=$DATA_DIR/$MODEL 33 | 34 | EPOCH=4 35 | MAXL=384 36 | LANGS="en,es,de,ar,hi,vi,zh" 37 | BSR=0.3 38 | SA=0.3 39 | SNBS=-1 40 | CSR=0.3 41 | R1_LAMBDA=5.0 42 | R2_LAMBDA=0.5 43 | if [ $MODEL == "xlm-roberta-large" ]; then 44 | BATCH_SIZE=4 45 | GRAD_ACC=8 46 | LR=1.5e-5 47 | else 48 | BATCH_SIZE=32 49 | GRAD_ACC=1 50 | LR=3e-5 51 | fi 52 | 53 | if [ $STAGE == 1 ]; then 54 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 55 | python ./src/run_qa.py --model_type xlmr \ 56 | --task_name $TASK \ 57 | --model_name_or_path $MODEL_PATH \ 58 | --do_train \ 59 | --do_eval \ 60 | --language $LANGS \ 61 | --train_language en \ 62 | --data_dir $DATA_DIR/$TASK/ \ 63 | --per_gpu_train_batch_size $BATCH_SIZE \ 64 | --gradient_accumulation_steps $GRAD_ACC \ 65 | --per_gpu_eval_batch_size 128 \ 66 | --learning_rate $LR \ 67 | --num_train_epochs $EPOCH \ 68 | --save_steps 0 \ 69 | --logging_each_epoch \ 70 | --max_seq_length $MAXL \ 71 | --doc_stride 128 \ 72 | --output_dir $OUTPUT_DIR \ 73 | --overwrite_output_dir \ 74 | --evaluate_during_training \ 75 | --logging_steps 50 \ 76 | --evaluate_steps 0 \ 77 | --seed $SEED \ 78 | --fp16 --fp16_opt_level O2 \ 79 | --warmup_steps -1 \ 80 | --enable_r1_loss \ 81 | --r1_lambda $R1_LAMBDA \ 82 | --original_loss \ 83 | --overall_ratio 1.0 \ 84 | --keep_boundary_unchanged \ 85 | --enable_code_switch \ 86 | --code_switch_ratio $CSR \ 87 | --dict_dir $DATA_DIR/dicts \ 88 | --dict_languages es,de,ar,hi,vi,zh \ 89 | --noised_max_seq_length $MAXL 90 | elif [ $STAGE == 2 ]; then 91 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 92 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 93 | python ./src/run_qa.py --model_type xlmr \ 94 | --task_name $TASK \ 95 | --model_name_or_path $MODEL_PATH \ 96 | --do_train \ 97 | --do_eval \ 98 | --language $LANGS \ 99 | --train_language en \ 100 | --data_dir $DATA_DIR/$TASK/ \ 101 | --per_gpu_train_batch_size $BATCH_SIZE \ 102 | --gradient_accumulation_steps $GRAD_ACC \ 103 | --per_gpu_eval_batch_size 128 \ 104 | --learning_rate $LR \ 105 | --num_train_epochs $EPOCH \ 106 | --save_steps 0 \ 107 | --logging_each_epoch \ 108 | --max_seq_length $MAXL \ 109 | --doc_stride 128 \ 110 | --output_dir $OUTPUT_DIR \ 111 | --overwrite_output_dir \ 112 | --evaluate_during_training \ 113 | --logging_steps 50 \ 114 | --evaluate_steps 0 \ 115 | --seed $SEED \ 116 | --fp16 --fp16_opt_level O2 \ 117 | --warmup_steps -1 \ 118 | --enable_r1_loss \ 119 | --r1_lambda $R1_LAMBDA \ 120 | --original_loss \ 121 | --overall_ratio 1.0 \ 122 | --keep_boundary_unchanged \ 123 | --enable_bpe_sampling \ 124 | --bpe_sampling_ratio $BSR \ 125 | --sampling_alpha $SA \ 126 | --sampling_nbest_size $SNBS \ 127 | --noised_max_seq_length $MAXL \ 128 | --enable_data_augmentation \ 129 | --augment_ratio 1.0 \ 130 
| --augment_method mt \ 131 | --translation_path $TRANSLATION_PATH \ 132 | --max_steps 24000 \ 133 | --r2_lambda $R2_LAMBDA \ 134 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 135 | fi 136 | 137 | 138 | -------------------------------------------------------------------------------- /scripts/translate-train-all/train_panx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='panx' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | EPOCH=10 29 | MAX_LENGTH=128 30 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu" 31 | EVALUATE_STEPS=1000 32 | BSR=0.3 33 | SA=0.3 34 | SNBS=-1 35 | R1_LAMBDA=5.0 36 | R2_LAMBDA=1.0 37 | if [ $MODEL == "xlm-roberta-large" ]; then 38 | BATCH_SIZE=32 39 | GRAD_ACC=1 40 | LR=7e-6 41 | else 42 | BATCH_SIZE=32 43 | GRAD_ACC=1 44 | LR=1e-5 45 | fi 46 | 47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.panx.txt 48 | 49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/ 50 | 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python src/run_tag.py --model_type xlmr \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --do_predict \ 59 | --do_predict_dev \ 60 | --predict_langs $LANGS \ 61 | --train_langs en \ 62 | --data_dir $DATA_DIR \ 63 | --labels $DATA_DIR/labels.txt \ 64 | --per_gpu_train_batch_size $BATCH_SIZE \ 65 | --gradient_accumulation_steps $GRAD_ACC \ 66 | --per_gpu_eval_batch_size 128 \ 67 | --learning_rate $LR \ 68 | --num_train_epochs $EPOCH \ 69 | --max_seq_length $MAX_LENGTH \ 70 | --noised_max_seq_length $MAX_LENGTH \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps $EVALUATE_STEPS \ 76 | --seed $SEED \ 77 | --warmup_steps -1 \ 78 | --save_only_best_checkpoint \ 79 | --eval_all_checkpoints \ 80 | --eval_patience -1 \ 81 | --fp16 --fp16_opt_level O2 \ 82 | --hidden_dropout_prob 0.1 \ 83 | --original_loss \ 84 | --enable_r1_loss \ 85 | --r1_lambda $R1_LAMBDA \ 86 | --use_token_label_probs \ 87 | --enable_bpe_sampling \ 88 | --bpe_sampling_ratio $BSR \ 89 | --sampling_alpha $SA \ 90 | --sampling_nbest_size $SNBS 91 | elif [ $STAGE == 2 ]; then 92 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 93 | 
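# Stage 2 starts again from the base model but passes the best stage-1
# checkpoint via --first_stage_model_path, and adds machine-translated
# training data (--enable_data_augmentation --augment_method mt) together
# with the R2 regularization term weighted by R2_LAMBDA (--r2_lambda).
# Note: this script defines MAX_LENGTH rather than MAXL, so the MaxLen${MAXL}
# part of the run-name paths expands to an empty string; stage 1 and stage 2
# use the same pattern, so the checkpoint path above still resolves.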
OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 94 | python src/run_tag.py --model_type xlmr \ 95 | --model_name_or_path $MODEL_PATH \ 96 | --do_train \ 97 | --do_eval \ 98 | --do_predict \ 99 | --do_predict_dev \ 100 | --predict_langs $LANGS \ 101 | --train_langs en \ 102 | --data_dir $DATA_DIR \ 103 | --labels $DATA_DIR/labels.txt \ 104 | --per_gpu_train_batch_size $BATCH_SIZE \ 105 | --gradient_accumulation_steps $GRAD_ACC \ 106 | --per_gpu_eval_batch_size 128 \ 107 | --learning_rate $LR \ 108 | --num_train_epochs $EPOCH \ 109 | --max_seq_length $MAX_LENGTH \ 110 | --noised_max_seq_length $MAX_LENGTH \ 111 | --output_dir $OUTPUT_DIR \ 112 | --overwrite_output_dir \ 113 | --evaluate_during_training \ 114 | --logging_steps 50 \ 115 | --evaluate_steps $EVALUATE_STEPS \ 116 | --seed $SEED \ 117 | --warmup_steps -1 \ 118 | --save_only_best_checkpoint \ 119 | --eval_all_checkpoints \ 120 | --eval_patience -1 \ 121 | --fp16 --fp16_opt_level O2 \ 122 | --hidden_dropout_prob 0.1 \ 123 | --original_loss \ 124 | --enable_r1_loss \ 125 | --r1_lambda $R1_LAMBDA \ 126 | --use_token_label_probs \ 127 | --enable_bpe_sampling \ 128 | --bpe_sampling_ratio $BSR \ 129 | --sampling_alpha $SA \ 130 | --sampling_nbest_size $SNBS \ 131 | --enable_data_augmentation \ 132 | --augment_ratio 1.0 \ 133 | --augment_method mt \ 134 | --translation_path $TRANSLATION_PATH \ 135 | --r2_lambda $R2_LAMBDA \ 136 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 137 | --use_hard_labels 138 | fi -------------------------------------------------------------------------------- /scripts/translate-train-all/train_pawsx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
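# Two-stage xTune fine-tuning for PAWS-X in the translate-train-all setting.
# Stage 1 trains with the R1 loss (--enable_r1_loss, weight R1_LAMBDA) over
# translated data (--enable_translate_data); stage 2 points
# --first_stage_model_path at the stage-1 checkpoint-best directory and adds
# machine-translation data augmentation with the R2 term (weight R2_LAMBDA).
# Arguments are positional with the defaults shown below, e.g.:
#   bash scripts/translate-train-all/train_pawsx.sh xlm-roberta-base 1 0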
15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='pawsx' 27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/PAWSX/ 28 | MODEL_PATH=$DATA_DIR/$MODEL 29 | EPOCH=10 30 | MAXL=256 31 | LANGS="de,en,es,fr,ja,ko,zh" 32 | EVALUATE_STEPS=1000 33 | R1_LAMBDA=5.0 34 | R2_LAMBDA=1.0 35 | if [ $MODEL == "xlm-roberta-large" ]; then 36 | BATCH_SIZE=16 37 | GRAD_ACC=2 38 | LR=1e-5 39 | else 40 | BATCH_SIZE=32 41 | GRAD_ACC=1 42 | LR=1e-5 43 | fi 44 | 45 | if [ $STAGE == 1 ]; then 46 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/" 47 | mkdir -p $OUTPUT_DIR 48 | python ./src/run_cls.py --model_type xlmr \ 49 | --model_name_or_path $MODEL_PATH \ 50 | --language $LANGS \ 51 | --train_language en \ 52 | --do_train \ 53 | --data_dir $DATA_DIR/$TASK/ \ 54 | --per_gpu_train_batch_size $BATCH_SIZE \ 55 | --gradient_accumulation_steps $GRAD_ACC \ 56 | --per_gpu_eval_batch_size 64 \ 57 | --learning_rate $LR \ 58 | --num_train_epochs $EPOCH \ 59 | --max_seq_length $MAXL \ 60 | --output_dir $OUTPUT_DIR \ 61 | --task_name $TASK \ 62 | --save_steps -1 \ 63 | --overwrite_output_dir \ 64 | --evaluate_during_training \ 65 | --evaluate_steps $EVALUATE_STEPS \ 66 | --logging_steps 50 \ 67 | --logging_steps_in_sample -1 \ 68 | --logging_each_epoch \ 69 | --gpu_id 0 \ 70 | --seed $SEED \ 71 | --fp16 --fp16_opt_level O2 \ 72 | --warmup_steps -1 \ 73 | --enable_r1_loss \ 74 | --r1_lambda $R1_LAMBDA \ 75 | --original_loss \ 76 | --enable_translate_data \ 77 | --translation_path $TRANSLATION_PATH 78 | elif [ $STAGE == 2 ]; then 79 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 80 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 81 | mkdir -p $OUTPUT_DIR 82 | python ./src/run_cls.py --model_type xlmr \ 83 | --model_name_or_path $MODEL_PATH \ 84 | --language $LANGS \ 85 | --train_language en \ 86 | --do_train \ 87 | --data_dir $DATA_DIR/$TASK/ \ 88 | --per_gpu_train_batch_size $BATCH_SIZE \ 89 | --gradient_accumulation_steps $GRAD_ACC \ 90 | --per_gpu_eval_batch_size 64 \ 91 | --learning_rate $LR \ 92 | --num_train_epochs $EPOCH \ 93 | --max_seq_length $MAXL \ 94 | --output_dir $OUTPUT_DIR \ 95 | --task_name $TASK \ 96 | --save_steps -1 \ 97 | --overwrite_output_dir \ 98 | --evaluate_during_training \ 99 | --evaluate_steps $EVALUATE_STEPS \ 100 | --logging_steps 50 \ 101 | --logging_steps_in_sample -1 \ 102 | --logging_each_epoch \ 103 | --gpu_id 0 \ 104 | --seed $SEED \ 105 | --fp16 --fp16_opt_level O2 \ 106 | --warmup_steps -1 \ 107 | --enable_r1_loss \ 108 | --r1_lambda $R1_LAMBDA \ 109 | --original_loss \ 110 | --enable_translate_data \ 111 | --translation_path $TRANSLATION_PATH \ 112 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 113 | --enable_data_augmentation \ 114 | --augment_ratio 1.0 \ 115 | --augment_method mt \ 116 | --r2_lambda $R2_LAMBDA 117 | fi 118 | 119 | 120 | -------------------------------------------------------------------------------- /scripts/translate-train-all/train_tydiqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='tydiqa' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/TyDiQA-GoldP/translate-train/ 29 | 30 | 31 | MAXL=384 32 | LANGS="en,ar,bn,fi,id,ko,ru,sw,te" 33 | BSR=0.3 34 | SA=0.3 35 | SNBS=-1 36 | R1_LAMBDA=5.0 37 | R2_LAMBDA=0.3 38 | if [ $MODEL == "xlm-roberta-large" ]; then 39 | BATCH_SIZE=4 40 | GRAD_ACC=8 41 | LR=1.5e-5 42 | EPOCH=10 43 | MAX_STEPS=2500 44 | else 45 | BATCH_SIZE=32 46 | GRAD_ACC=1 47 | LR=3e-5 48 | EPOCH=20 49 | MAX_STEPS=5000 50 | fi 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python ./src/run_qa.py --model_type xlmr \ 55 | --task_name $TASK \ 56 | --model_name_or_path $MODEL_PATH \ 57 | --do_train \ 58 | --do_eval \ 59 | --language $LANGS \ 60 | --train_language en \ 61 | --data_dir $DATA_DIR/$TASK/ \ 62 | --per_gpu_train_batch_size $BATCH_SIZE \ 63 | --gradient_accumulation_steps $GRAD_ACC \ 64 | --per_gpu_eval_batch_size 128 \ 65 | --learning_rate $LR \ 66 | --num_train_epochs $EPOCH \ 67 | --save_steps 0 \ 68 | --logging_each_epoch \ 69 | --max_seq_length $MAXL \ 70 | --doc_stride 128 \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps 0 \ 76 | --seed $SEED \ 77 | --fp16 --fp16_opt_level O2 \ 78 | --warmup_steps -1 \ 79 | --enable_r1_loss \ 80 | --r1_lambda $R1_LAMBDA \ 81 | --original_loss \ 82 | --overall_ratio 1.0 \ 83 | --keep_boundary_unchanged \ 84 | --enable_bpe_sampling \ 85 | --bpe_sampling_ratio $BSR \ 86 | --sampling_alpha $SA \ 87 | --sampling_nbest_size $SNBS \ 88 | --noised_max_seq_length $MAXL 89 | elif [ $STAGE == 2 ]; then 90 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 91 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 92 | python ./src/run_qa.py --model_type xlmr \ 93 | --task_name $TASK \ 94 | --model_name_or_path $MODEL_PATH \ 95 | --do_train \ 96 | --do_eval \ 97 | --language $LANGS \ 98 | --train_language en \ 99 | --data_dir $DATA_DIR/$TASK/ \ 100 | --per_gpu_train_batch_size $BATCH_SIZE \ 101 | --gradient_accumulation_steps $GRAD_ACC \ 102 | --per_gpu_eval_batch_size 128 \ 103 | --learning_rate $LR \ 104 | --num_train_epochs $EPOCH \ 105 | --save_steps 0 \ 106 | --logging_each_epoch \ 107 | --max_seq_length $MAXL \ 108 | --doc_stride 128 \ 109 | --output_dir $OUTPUT_DIR \ 110 | --overwrite_output_dir \ 111 | --evaluate_during_training \ 112 | 
--logging_steps 50 \ 113 | --evaluate_steps 0 \ 114 | --seed $SEED \ 115 | --fp16 --fp16_opt_level O2 \ 116 | --warmup_steps -1 \ 117 | --enable_r1_loss \ 118 | --r1_lambda $R1_LAMBDA \ 119 | --original_loss \ 120 | --overall_ratio 1.0 \ 121 | --keep_boundary_unchanged \ 122 | --enable_bpe_sampling \ 123 | --bpe_sampling_ratio $BSR \ 124 | --sampling_alpha $SA \ 125 | --sampling_nbest_size $SNBS \ 126 | --noised_max_seq_length $MAXL \ 127 | --enable_data_augmentation \ 128 | --augment_ratio 1.0 \ 129 | --augment_method mt \ 130 | --translation_path $TRANSLATION_PATH \ 131 | --max_steps $MAX_STEPS \ 132 | --r2_lambda $R2_LAMBDA \ 133 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 134 | fi 135 | 136 | 137 | -------------------------------------------------------------------------------- /scripts/translate-train-all/train_udpos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='udpos' 27 | MODEL_PATH=$DATA_DIR/$MODEL 28 | EPOCH=10 29 | MAX_LENGTH=128 30 | LANGS="af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh" 31 | EVALUATE_STEPS=500 32 | BSR=0.5 33 | SA=0.3 34 | SNBS=-1 35 | R1_LAMBDA=5.0 36 | R2_LAMBDA=0.3 37 | if [ $MODEL == "xlm-roberta-large" ]; then 38 | BATCH_SIZE=32 39 | GRAD_ACC=1 40 | LR=5e-6 41 | else 42 | BATCH_SIZE=32 43 | GRAD_ACC=1 44 | LR=2e-5 45 | fi 46 | 47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.udpos.txt 48 | 49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/ 50 | 51 | 52 | if [ $STAGE == 1 ]; then 53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/" 54 | python src/run_tag.py --model_type xlmr \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --do_predict \ 59 | --do_predict_dev \ 60 | --predict_langs $LANGS \ 61 | --train_langs en \ 62 | --data_dir $DATA_DIR \ 63 | --labels $DATA_DIR/labels.txt \ 64 | --per_gpu_train_batch_size $BATCH_SIZE \ 65 | --gradient_accumulation_steps $GRAD_ACC \ 66 | --per_gpu_eval_batch_size 128 \ 67 | --learning_rate $LR \ 68 | --num_train_epochs $EPOCH \ 69 | --max_seq_length $MAX_LENGTH \ 70 | --noised_max_seq_length $MAX_LENGTH \ 71 | --output_dir $OUTPUT_DIR \ 72 | --overwrite_output_dir \ 73 | --evaluate_during_training \ 74 | --logging_steps 50 \ 75 | --evaluate_steps $EVALUATE_STEPS \ 76 | --seed $SEED \ 77 | --warmup_steps -1 \ 78 | --save_only_best_checkpoint \ 79 | --eval_all_checkpoints \ 80 | --eval_patience -1 \ 81 | --fp16 --fp16_opt_level O2 \ 82 | --hidden_dropout_prob 0.1 \ 83 | --original_loss \ 84 | 
--use_pooling_strategy \ 85 | --enable_r1_loss \ 86 | --r1_lambda $R1_LAMBDA \ 87 | --use_token_label_probs \ 88 | --enable_bpe_sampling \ 89 | --bpe_sampling_ratio $BSR \ 90 | --sampling_alpha $SA \ 91 | --sampling_nbest_size $SNBS 92 | elif [ $STAGE == 2 ]; then 93 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 94 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 95 | python src/run_tag.py --model_type xlmr \ 96 | --model_name_or_path $MODEL_PATH \ 97 | --do_train \ 98 | --do_eval \ 99 | --do_predict \ 100 | --do_predict_dev \ 101 | --predict_langs $LANGS \ 102 | --train_langs en \ 103 | --data_dir $DATA_DIR \ 104 | --labels $DATA_DIR/labels.txt \ 105 | --per_gpu_train_batch_size $BATCH_SIZE \ 106 | --gradient_accumulation_steps $GRAD_ACC \ 107 | --per_gpu_eval_batch_size 128 \ 108 | --learning_rate $LR \ 109 | --num_train_epochs $EPOCH \ 110 | --max_seq_length $MAX_LENGTH \ 111 | --noised_max_seq_length $MAX_LENGTH \ 112 | --output_dir $OUTPUT_DIR \ 113 | --overwrite_output_dir \ 114 | --evaluate_during_training \ 115 | --logging_steps 50 \ 116 | --evaluate_steps $EVALUATE_STEPS \ 117 | --seed $SEED \ 118 | --warmup_steps -1 \ 119 | --save_only_best_checkpoint \ 120 | --eval_all_checkpoints \ 121 | --eval_patience -1 \ 122 | --fp16 --fp16_opt_level O2 \ 123 | --hidden_dropout_prob 0.1 \ 124 | --original_loss \ 125 | --use_pooling_strategy \ 126 | --enable_r1_loss \ 127 | --r1_lambda $R1_LAMBDA \ 128 | --use_token_label_probs \ 129 | --enable_bpe_sampling \ 130 | --bpe_sampling_ratio $BSR \ 131 | --sampling_alpha $SA \ 132 | --sampling_nbest_size $SNBS \ 133 | --enable_data_augmentation \ 134 | --augment_ratio 1.0 \ 135 | --augment_method mt \ 136 | --translation_path $TRANSLATION_PATH \ 137 | --r2_lambda $R2_LAMBDA \ 138 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 139 | fi -------------------------------------------------------------------------------- /scripts/translate-train-all/train_xnli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
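# Two-stage xTune fine-tuning for XNLI in the translate-train-all setting.
# Stage 1 combines the original loss with the R1 loss (weight R1_LAMBDA) over
# translated pairs (--enable_translate_data); stage 2 reuses that setup,
# points --first_stage_model_path at the stage-1 checkpoint-best directory,
# and enables machine-translation data augmentation with the R2 term
# (weight R2_LAMBDA). Arguments are positional, e.g.:
#   bash scripts/translate-train-all/train_xnli.sh xlm-roberta-base 2 0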
15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | TASK='xnli' 27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/XNLI/ 28 | MODEL_PATH=$DATA_DIR/$MODEL 29 | EPOCH=10 30 | MAXL=256 31 | LANGS="ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh" 32 | EVALUATE_STEPS=5000 33 | R1_LAMBDA=5.0 34 | R2_LAMBDA=1.0 35 | if [ $MODEL == "xlm-roberta-large" ]; then 36 | BATCH_SIZE=16 37 | GRAD_ACC=2 38 | LR=5e-6 39 | else 40 | BATCH_SIZE=32 41 | GRAD_ACC=1 42 | LR=7e-6 43 | fi 44 | 45 | if [ $STAGE == 1 ]; then 46 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/" 47 | mkdir -p $OUTPUT_DIR 48 | python ./src/run_cls.py --model_type xlmr \ 49 | --model_name_or_path $MODEL_PATH \ 50 | --language $LANGS \ 51 | --train_language en \ 52 | --do_train \ 53 | --data_dir $DATA_DIR/$TASK/ \ 54 | --per_gpu_train_batch_size $BATCH_SIZE \ 55 | --gradient_accumulation_steps $GRAD_ACC \ 56 | --per_gpu_eval_batch_size 64 \ 57 | --learning_rate $LR \ 58 | --num_train_epochs $EPOCH \ 59 | --max_seq_length $MAXL \ 60 | --output_dir $OUTPUT_DIR \ 61 | --task_name $TASK \ 62 | --save_steps -1 \ 63 | --overwrite_output_dir \ 64 | --evaluate_during_training \ 65 | --evaluate_steps $EVALUATE_STEPS \ 66 | --logging_steps 50 \ 67 | --logging_steps_in_sample -1 \ 68 | --logging_each_epoch \ 69 | --gpu_id 0 \ 70 | --seed $SEED \ 71 | --fp16 --fp16_opt_level O2 \ 72 | --warmup_steps -1 \ 73 | --enable_r1_loss \ 74 | --r1_lambda $R1_LAMBDA \ 75 | --original_loss \ 76 | --enable_translate_data \ 77 | --translation_path $TRANSLATION_PATH 78 | elif [ $STAGE == 2 ]; then 79 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/checkpoint-best" 80 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 81 | mkdir -p $OUTPUT_DIR 82 | python ./src/run_cls.py --model_type xlmr \ 83 | --model_name_or_path $MODEL_PATH \ 84 | --language $LANGS \ 85 | --train_language en \ 86 | --do_train \ 87 | --data_dir $DATA_DIR/$TASK/ \ 88 | --per_gpu_train_batch_size $BATCH_SIZE \ 89 | --gradient_accumulation_steps $GRAD_ACC \ 90 | --per_gpu_eval_batch_size 64 \ 91 | --learning_rate $LR \ 92 | --num_train_epochs $EPOCH \ 93 | --max_seq_length $MAXL \ 94 | --output_dir $OUTPUT_DIR \ 95 | --task_name $TASK \ 96 | --save_steps -1 \ 97 | --overwrite_output_dir \ 98 | --evaluate_during_training \ 99 | --evaluate_steps $EVALUATE_STEPS \ 100 | --logging_steps 50 \ 101 | --logging_steps_in_sample -1 \ 102 | --logging_each_epoch \ 103 | --gpu_id 0 \ 104 | --seed $SEED \ 105 | --fp16 --fp16_opt_level O2 \ 106 | --warmup_steps -1 \ 107 | --enable_r1_loss \ 108 | --r1_lambda $R1_LAMBDA \ 109 | --original_loss \ 110 | --enable_translate_data \ 111 | --translation_path $TRANSLATION_PATH \ 112 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \ 113 | --enable_data_augmentation \ 114 | --augment_ratio 1.0 \ 115 | --augment_method mt \ 116 | --r2_lambda $R2_LAMBDA 117 | fi -------------------------------------------------------------------------------- /scripts/translate-train-all/train_xquad.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google and DeepMind. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | REPO=$PWD 17 | MODEL=${1:-"xlm-roberta-base"} 18 | STAGE=${2:-1} 19 | GPU=${3:-0} 20 | DATA_DIR=${4:-"$REPO/download/"} 21 | OUT_DIR=${5:-"$REPO/outputs/"} 22 | SEED=${6:-1} 23 | 24 | export CUDA_VISIBLE_DEVICES=$GPU 25 | 26 | cp -r $DATA_DIR/squad/ $DATA_DIR/xquad/squad1.1/ 27 | 28 | TASK='xquad' 29 | MODEL_PATH=$DATA_DIR/$MODEL 30 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/ 31 | 32 | EPOCH=4 33 | MAXL=384 34 | LANGS="ar,de,el,en,es,hi,ru,th,tr,vi,zh" 35 | BSR=0.3 36 | SA=0.3 37 | SNBS=-1 38 | CSR=0.3 39 | R1_LAMBDA=5.0 40 | R2_LAMBDA=0.1 41 | if [ $MODEL == "xlm-roberta-large" ]; then 42 | BATCH_SIZE=4 43 | GRAD_ACC=8 44 | LR=1.5e-5 45 | else 46 | BATCH_SIZE=32 47 | GRAD_ACC=1 48 | LR=3e-5 49 | fi 50 | 51 | if [ $STAGE == 1 ]; then 52 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 53 | python ./src/run_qa.py --model_type xlmr \ 54 | --task_name $TASK \ 55 | --model_name_or_path $MODEL_PATH \ 56 | --do_train \ 57 | --do_eval \ 58 | --language $LANGS \ 59 | --train_language en \ 60 | --data_dir $DATA_DIR/$TASK/ \ 61 | --per_gpu_train_batch_size $BATCH_SIZE \ 62 | --gradient_accumulation_steps $GRAD_ACC \ 63 | --per_gpu_eval_batch_size 128 \ 64 | --learning_rate $LR \ 65 | --num_train_epochs $EPOCH \ 66 | --save_steps 0 \ 67 | --logging_each_epoch \ 68 | --max_seq_length $MAXL \ 69 | --doc_stride 128 \ 70 | --output_dir $OUTPUT_DIR \ 71 | --overwrite_output_dir \ 72 | --evaluate_during_training \ 73 | --logging_steps 50 \ 74 | --evaluate_steps 0 \ 75 | --seed $SEED \ 76 | --fp16 --fp16_opt_level O2 \ 77 | --warmup_steps -1 \ 78 | --enable_r1_loss \ 79 | --r1_lambda $R1_LAMBDA \ 80 | --original_loss \ 81 | --overall_ratio 1.0 \ 82 | --keep_boundary_unchanged \ 83 | --enable_code_switch \ 84 | --code_switch_ratio $CSR \ 85 | --dict_dir $DATA_DIR/dicts \ 86 | --dict_languages ar,de,el,es,hi,ru,th,tr,vi,zh \ 87 | --noised_max_seq_length $MAXL 88 | elif [ $STAGE == 2 ]; then 89 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/" 90 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/" 91 | python ./src/run_qa.py --model_type xlmr \ 92 | --task_name $TASK \ 93 | --model_name_or_path $MODEL_PATH \ 94 | --do_train \ 95 | --do_eval \ 96 | --language $LANGS \ 97 | --train_language en \ 98 | --data_dir $DATA_DIR/$TASK/ \ 99 | --per_gpu_train_batch_size $BATCH_SIZE \ 100 | --gradient_accumulation_steps $GRAD_ACC \ 101 | --per_gpu_eval_batch_size 128 \ 102 | --learning_rate $LR \ 103 | --num_train_epochs $EPOCH \ 104 | --save_steps 0 \ 105 | --logging_each_epoch \ 106 | --max_seq_length $MAXL \ 107 | --doc_stride 128 \ 108 | --output_dir $OUTPUT_DIR \ 109 | --overwrite_output_dir \ 110 | --evaluate_during_training \ 111 | 
--logging_steps 50 \ 112 | --evaluate_steps 0 \ 113 | --seed $SEED \ 114 | --fp16 --fp16_opt_level O2 \ 115 | --warmup_steps -1 \ 116 | --enable_r1_loss \ 117 | --r1_lambda $R1_LAMBDA \ 118 | --original_loss \ 119 | --overall_ratio 1.0 \ 120 | --keep_boundary_unchanged \ 121 | --enable_bpe_sampling \ 122 | --bpe_sampling_ratio $BSR \ 123 | --sampling_alpha $SA \ 124 | --sampling_nbest_size $SNBS \ 125 | --noised_max_seq_length $MAXL \ 126 | --enable_data_augmentation \ 127 | --augment_ratio 1.0 \ 128 | --augment_method mt \ 129 | --translation_path $TRANSLATION_PATH \ 130 | --max_steps 24000 \ 131 | --r2_lambda $R2_LAMBDA \ 132 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH 133 | fi 134 | 135 | 136 | -------------------------------------------------------------------------------- /src/pequod/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/__init__.py -------------------------------------------------------------------------------- /src/pequod/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from transformers.data.processors.utils import InputFeatures 3 | 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def convert_examples_to_features( 9 | processor, examples, tokenizer, max_length, label_list, 10 | pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True): 11 | 12 | if label_list is None: label_list = processor.get_labels() 13 | 14 | label_map = {label: i for i, label in enumerate(label_list)} 15 | 16 | features = [] 17 | for ex_index, example in enumerate(examples): 18 | if ex_index % 10000 == 0: 19 | logger.info("Writing example %d" % ex_index) 20 | inputs = tokenizer.encode_plus( 21 | example.text_a, 22 | example.text_b, 23 | add_special_tokens=True, 24 | max_length=max_length) 25 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] 26 | 27 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 28 | 29 | padding_length = max_length - len(input_ids) 30 | input_ids = input_ids + ([pad_token] * padding_length) 31 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 32 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) 33 | 34 | assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) 35 | assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) 36 | assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) 37 | 38 | label = label_map[example.label] 39 | if ex_index < 3: 40 | logger.info("*** Example ***") 41 | logger.info("guid: %s" % (example.guid)) 42 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 43 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) 44 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) 45 | logger.info("label: %s (id = %d)" % 
(example.label, label)) 46 | 47 | features.append(InputFeatures( 48 | input_ids=input_ids, 49 | attention_mask=attention_mask, 50 | token_type_ids=token_type_ids, 51 | label=label)) 52 | 53 | return features -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/sampler.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/sampler.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/utils_squad.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/utils_squad.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/utils_squad_evaluate.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/utils_squad_evaluate.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/xdoc.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/xdoc.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/xqa.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/xqa.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/__pycache__/xretrieval.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/xretrieval.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/data/dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/dataloader.py -------------------------------------------------------------------------------- /src/pequod/data/sampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.utils.data.sampler import Sampler 4 | 5 | 6 | class SubSampler(Sampler): 7 | 8 | def __init__(self, data_source, num_samples): 9 | self.data_source = data_source 10 | self.num_samples = num_samples 11 | 12 | def __len__(self): 13 | return self.num_samples 14 | 
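    # __iter__ below draws num_samples indices from data_source: a prefix of a
    # random permutation (no replacement) when num_samples <= len(data_source),
    # otherwise uniform indices drawn with replacement via torch.randint.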
15 | def __iter__(self): 16 | n = len(self.data_source) 17 | if self.num_samples <= n: 18 | return iter(torch.randperm(n).tolist()[:self.num_samples]) 19 | return iter(torch.randint(high=n, size=(self.num_samples,), dtype=torch.int64).tolist()) -------------------------------------------------------------------------------- /src/pequod/data/wili.py: -------------------------------------------------------------------------------- 1 | """Loading examples and features for WiLI-2018 dataset""" 2 | 3 | import logging 4 | import os 5 | import torch 6 | 7 | from transformers.data.processors.utils import (DataProcessor, 8 | InputExample, InputFeatures) 9 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 10 | TensorDataset) 11 | from src.data import convert_examples_to_features 12 | from src.io import lines_gen 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | _alias2lang = {} 19 | _lang2id = {} 20 | _langs = [] 21 | 22 | def get_alias2lang(data_dir): 23 | if len(_alias2lang) > 0: return _alias2lang, _lang2id, _langs 24 | for line, in lines_gen(os.path.join(data_dir, "labels-new")): 25 | value = None 26 | for alias in line.split(";"): 27 | alias = alias.strip() 28 | if alias == "": continue 29 | if value is None: value = alias 30 | _alias2lang[alias] = value 31 | _langs.append(value) 32 | for i, lang in enumerate(_langs): _lang2id[lang] = i 33 | return _alias2lang, _lang2id, _langs 34 | 35 | 36 | def load_and_cache_examples(args, data_dir, split, run_lang2id, tokenizer, key=""): 37 | cache_filename = os.path.join( 38 | data_dir, "cached_%s_%s" % (split, key)) 39 | 40 | if os.path.exists(cache_filename) and not args.overwrite_cache: 41 | logger.info("Loading features from cached file %s" % cache_filename) 42 | features = torch.load(cache_filename) 43 | else: 44 | processor = WiliProcessor() 45 | logger.info("Creating features from dataset file at %s" % data_dir) 46 | label_list = processor.get_labels(data_dir) 47 | examples = processor.get_examples(data_dir, split) 48 | logger.info("%d Examples loaded" % len(examples)) 49 | features = convert_examples_to_features( 50 | processor, examples, tokenizer, max_length=args.max_seq_length, 51 | label_list=label_list, pad_token_segment_id=0, 52 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]) 53 | logger.info("Saving features to cache file %s" % cache_filename) 54 | torch.save(features, cache_filename) 55 | 56 | # Cut dataset to test langs 57 | alias2lang, lang2id, _ = get_alias2lang(data_dir) 58 | test_lang_ids = {lang2id[alias2lang[lang]] for lang in run_lang2id.keys()} 59 | wili_id2run_langid = { 60 | lang2id[alias2lang[lang]]:val for lang, val in run_lang2id.items()} 61 | 62 | all_input_ids, all_attention_mask = [], [] 63 | all_token_type_ids, all_labels = [], [] 64 | for f in features: 65 | if f.label not in test_lang_ids: continue 66 | all_input_ids.append(f.input_ids) 67 | all_attention_mask.append(f.attention_mask) 68 | all_token_type_ids.append(f.token_type_ids) 69 | all_labels.append(wili_id2run_langid[f.label]) 70 | 71 | all_input_ids = torch.tensor(all_input_ids, dtype=torch.long) 72 | all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long) 73 | all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long) 74 | all_labels = torch.tensor(all_labels, dtype=torch.long) 75 | 76 | dataset = TensorDataset( 77 | all_input_ids, all_attention_mask, all_token_type_ids, all_labels) 78 | 79 | return dataset 80 | 81 | 82 | class WiliProcessor(DataProcessor): 83 | 84 | 
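    # Reads the parallel WiLI split files x_<split>.txt and y_<split>.txt:
    # each line of x_* is a text sample and the matching line of y_* is its
    # language label; get_examples wraps each pair in an InputExample.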
def get_examples(self, data_dir, split): 85 | examples = [] 86 | filename_x = os.path.join(data_dir, "x_%s.txt" % split) 87 | filename_y = os.path.join(data_dir, "y_%s.txt" % split) 88 | for i, (line_x, line_y) in enumerate(lines_gen(filename_x, filename_y)): 89 | guid = "%s-%s" % (split, i) 90 | examples.append( 91 | InputExample(guid=guid, text_a=line_x, text_b=None, label=line_y)) 92 | return examples 93 | 94 | def get_labels(self, data_dir): 95 | _, _, langs = get_alias2lang(data_dir) 96 | return langs 97 | -------------------------------------------------------------------------------- /src/pequod/data/xqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import torch 4 | 5 | from torch.utils.data import TensorDataset 6 | from src.pequod.data.utils_squad import (read_squad_examples, 7 | convert_examples_to_features) 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def load_and_cache_examples(args, split, lang, tokenizer, key="", evaluate=False): 14 | cache_filename = os.path.join( 15 | args.data_dir, "cached_%s_%s_%s" % (split, lang, key)) 16 | 17 | input_file = os.path.join(args.data_dir, "%s-%s.json" % (split, lang)) 18 | if os.path.exists(cache_filename): 19 | logger.info("Loading features from cached file %s", cache_filename) 20 | features = torch.load(cache_filename) 21 | if evaluate: 22 | examples = read_squad_examples(input_file=input_file, 23 | is_training=not evaluate, 24 | version_2_with_negative=args.version_2_with_negative) 25 | else: examples = None 26 | else: 27 | logger.info("Creating features from dataset file at %s", input_file) 28 | examples = read_squad_examples(input_file=input_file, 29 | is_training=not evaluate, 30 | version_2_with_negative=args.version_2_with_negative) 31 | features = convert_examples_to_features(examples=examples, 32 | tokenizer=tokenizer, max_seq_length=args.max_seq_length, 33 | doc_stride=args.doc_stride, max_query_length=args.max_query_length, 34 | is_training=not evaluate, cls_token=tokenizer.cls_token, 35 | sep_token=tokenizer.sep_token) 36 | logger.info("Saving features into cached file %s", cache_filename) 37 | torch.save(features, cache_filename) 38 | 39 | # Convert to Tensors and build dataset 40 | all_input_ids = torch.tensor( 41 | [f.input_ids for f in features], dtype=torch.long) 42 | all_input_mask = torch.tensor( 43 | [f.input_mask for f in features], dtype=torch.long) 44 | all_segment_ids = torch.tensor( 45 | [f.segment_ids for f in features], dtype=torch.long) 46 | all_cls_index = torch.tensor( 47 | [f.cls_index for f in features], dtype=torch.long) 48 | all_p_mask = torch.tensor( 49 | [f.p_mask for f in features], dtype=torch.float) 50 | if evaluate: 51 | all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) 52 | dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, 53 | all_example_index, all_cls_index, all_p_mask) 54 | else: 55 | all_start_positions = torch.tensor( 56 | [f.start_position for f in features], dtype=torch.long) 57 | all_end_positions = torch.tensor( 58 | [f.end_position for f in features], dtype=torch.long) 59 | dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, 60 | all_start_positions, all_end_positions, all_cls_index, all_p_mask) 61 | 62 | return dataset, examples, features 63 | -------------------------------------------------------------------------------- /src/pequod/data/xretrieval.py: 
-------------------------------------------------------------------------------- 1 | """Load examples from BUCC""" 2 | 3 | 4 | import logging 5 | import os 6 | import torch 7 | 8 | 9 | from transformers.data.processors.utils import ( 10 | DataProcessor, InputExample, InputFeatures) 11 | from torch.utils.data import ( 12 | DataLoader, RandomSampler, SequentialSampler, TensorDataset) 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def load_and_cache_examples(args, langpair, lang, tokenizer, key="", prefix="tatoeba"): 19 | 20 | cache_dir = os.path.join(args.data_dir, "pequod_cache") 21 | os.makedirs(cache_dir, exist_ok=True) 22 | cache_filename = os.path.join( 23 | cache_dir, "cached_%s_%s_%s" % (langpair, lang, key)) 24 | 25 | if os.path.exists(cache_filename) and not args.overwrite_cache: 26 | logger.info("Loading features from cached file %s" % cache_filename) 27 | features = torch.load(cache_filename) 28 | else: 29 | processer = TatoebaProcesser() 30 | logger.info("Creating features from dataset file at %s" % args.data_dir) 31 | examples = processer.get_examples(args.data_dir, langpair, lang, prefix) 32 | features = TatoebaProcesser.convert_examples_to_features( 33 | examples, tokenizer, args.max_seq_length, 0, 34 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],) 35 | #logger.info("Saving features to cache file %s" % cache_filename) 36 | #torch.save(features, cache_filename) 37 | 38 | all_input_ids = torch.tensor( 39 | [f.input_ids for f in features], dtype=torch.long) 40 | all_attention_mask = torch.tensor( 41 | [f.attention_mask for f in features], dtype=torch.long) 42 | all_token_type_ids = torch.tensor( 43 | [f.token_type_ids for f in features], dtype=torch.long) 44 | 45 | dataset = TensorDataset( 46 | all_input_ids, all_attention_mask, all_token_type_ids) 47 | 48 | return dataset 49 | 50 | class TatoebaProcesser(DataProcessor): 51 | 52 | @classmethod 53 | def convert_examples_to_features(cls, examples, tokenizer, max_length, pad_token_segment_id, pad_token, mask_padding_with_zero=True): 54 | 55 | features = [] 56 | for ex_index, example in enumerate(examples): 57 | inputs = tokenizer.encode_plus( 58 | example.text_a, 59 | None, 60 | add_special_tokens=True, 61 | max_length=max_length, 62 | ) 63 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] 64 | 65 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 66 | 67 | padding_length = max_length - len(input_ids) 68 | input_ids = input_ids + ([pad_token] * padding_length) 69 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 70 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) 71 | 72 | assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) 73 | assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) 74 | assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) 75 | 76 | if ex_index < 3: 77 | logger.info("*** Example ***") 78 | logger.info("guid: %s" % (example.guid)) 79 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 80 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) 81 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) 82 | 83 | features.append(InputFeatures( 84 | input_ids=input_ids, 85 | attention_mask=attention_mask, 86 | 
token_type_ids=token_type_ids, 87 | label=None, 88 | )) 89 | 90 | return features 91 | 92 | def get_examples(self, data_dir, langpair, lang, prefix="tatoeba"): 93 | examples = [] 94 | if prefix == "bucc": 95 | fn = os.path.join(data_dir, "%s.%s.txt" % (langpair, lang)) 96 | else: 97 | fn = os.path.join(data_dir, "%s.%s" % (langpair, lang)) 98 | #fn = os.path.join(data_dir, "%s.%s.%s" % (prefix, langpair, lang)) 99 | with open(fn, encoding='utf-8') as fp: 100 | for i, line in enumerate(fp): 101 | line = line.strip() 102 | examples.append(InputExample( 103 | guid="%s-%s-%d" % (langpair, lang, i), 104 | text_a=line, 105 | )) 106 | return examples 107 | -------------------------------------------------------------------------------- /src/pequod/eval/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import inspect 5 | 6 | 7 | from src.pequod.data.utils_squad import RawResult, write_predictions 8 | from src.pequod.data.utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad 9 | 10 | 11 | def to_list(tensor): 12 | return tensor.detach().cpu().tolist() 13 | 14 | 15 | def score_dict_to_string(score_dict): 16 | return " ".join([("%s:%.2f" % (k, v)) for k, v in score_dict.items()]) 17 | 18 | 19 | def score_dicts_to_latex(score_dicts): 20 | keys = [k for k in score_dicts[0]] 21 | return "\n".join([""] + [( 22 | " & ".join([key] + [("%.2f" % (sd[key])) for sd in score_dicts]) 23 | ) for key in keys]) 24 | 25 | 26 | def eval_classification(model, batch_dict_iter): 27 | model.eval() 28 | preds, labels = None, None 29 | for batch_dict in batch_dict_iter: 30 | label_id = batch_dict["labels"].detach().cpu().numpy() 31 | batch_dict.pop("labels") 32 | with torch.no_grad(): logits = model(**batch_dict)[0] 33 | pred = logits.detach().cpu().numpy() 34 | if preds is None: preds, labels = pred, label_id 35 | else: 36 | preds = np.append(preds, pred, axis=0) 37 | labels = np.append(labels, label_id) 38 | preds = np.argmax(preds, axis=1) 39 | result = (preds == labels).mean() 40 | return {"acc": result*100.0} 41 | 42 | 43 | def eval_qa(model, batch_dict_iter, prefix="", **kwargs): 44 | 45 | features = kwargs["all_features"] 46 | output_dir = kwargs["output_dir"] 47 | 48 | model.eval() 49 | all_results = [] 50 | for batch_dict, example_indices in batch_dict_iter: 51 | with torch.no_grad(): outputs = model(**batch_dict) 52 | 53 | for i, example_index in enumerate(example_indices): 54 | eval_feature = features[example_index.item()] 55 | unique_id = int(eval_feature.unique_id) 56 | result = RawResult(unique_id = unique_id, 57 | start_logits = to_list(outputs[0][i]), 58 | end_logits = to_list(outputs[1][i])) 59 | all_results.append(result) 60 | 61 | output_prediction_file = os.path.join( 62 | output_dir, "predictions_{}.json".format(prefix)) 63 | output_nbest_file = os.path.join( 64 | output_dir, "nbest_predictions_{}.json".format(prefix)) 65 | if kwargs["version_2_with_negative"]: 66 | output_null_log_odds_file = os.path.join( 67 | output_dir, "null_odds_{}.json".format(prefix)) 68 | else: output_null_log_odds_file = None 69 | 70 | wrt_pred_kwargs = { 71 | "all_results": all_results, 72 | "output_prediction_file": output_prediction_file, 73 | "output_nbest_file": output_nbest_file, 74 | "output_null_log_odds_file": output_null_log_odds_file} 75 | 76 | for key in inspect.getfullargspec(write_predictions).args: 77 | if key not in wrt_pred_kwargs: 78 | wrt_pred_kwargs[key] = kwargs[key] 79 | 80 | 
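    # Remaining write_predictions arguments are filled by name from kwargs via
    # the getfullargspec introspection above; the resulting prediction file is
    # then scored with the official SQuAD evaluation script (EVAL_OPTS below).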
write_predictions(**wrt_pred_kwargs) 81 | 82 | # Evaluate with the official SQuAD script 83 | evaluate_options = EVAL_OPTS( 84 | data_file=kwargs["predict_file"], 85 | pred_file=output_prediction_file, 86 | na_prob_file=output_null_log_odds_file, 87 | out_file="/dev/null") 88 | results = evaluate_on_squad(evaluate_options) 89 | return results 90 | -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/bretrieval.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/bretrieval.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/bucc_eval.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/bucc_eval.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/evaluator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/evaluator.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/utils_retrieve.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/utils_retrieve.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/__pycache__/xretrieval.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/xretrieval.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | 4 | from torch.utils.data import DataLoader 5 | from src.pequod.training.trainer import to_cuda 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class Evaluator(object): 12 | 13 | def __init__(self, args, model, tokenizer, **kwargs): 14 | self.args = args 15 | self.datasets = {} 16 | self.model = model 17 | self.tokenizer = tokenizer 18 | 19 | def _parse_batch(self, batch, has_label=True, **kwargs): 20 | _batch = to_cuda(batch) 21 | # _batch = batch 22 | ret = {"input_ids": _batch[0], 23 | "attention_mask": _batch[1], 24 | "token_type_ids": _batch[2] if self.args.model_type == "bert" else None,} 25 | if has_label: ret["labels"] = _batch[3] 26 | ret.update(**kwargs) 27 | return ret 28 | 29 | def run(self): 30 | raise NotImplementedError 31 | 32 | def 
get_dataset(self, *args, **kwargs): 33 | if args in self.datasets: return self.datasets[args] 34 | dataset = self.load_and_cache_examples(*args, **kwargs) 35 | self.datasets[args] = dataset 36 | return dataset 37 | 38 | def load_and_cache_examples(self, *args, **kwargs): 39 | raise NotImplementedError 40 | 41 | def get_dataloader(self, *args, **kwargs): 42 | logger.info("Getting dataloader - args: %s" % str(args)) 43 | dataset = kwargs.pop("dataset", self.get_dataset(*args, **kwargs)) 44 | dataloader = DataLoader(dataset, batch_size=self.args.eval_batch_size) 45 | return dataloader 46 | -------------------------------------------------------------------------------- /src/pequod/io.py: -------------------------------------------------------------------------------- 1 | """I/O""" 2 | 3 | def _lines_gen_from_single_file(filename): 4 | with open(filename) as fp: 5 | for line in fp: yield line.strip() 6 | 7 | 8 | def lines_gen(*filenames): 9 | for ret in zip(*map(_lines_gen_from_single_file, filenames)): yield ret -------------------------------------------------------------------------------- /src/pequod/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/model/__init__.py -------------------------------------------------------------------------------- /src/pequod/model/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/model/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/model/__pycache__/roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/model/__pycache__/roberta.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/model/roberta.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch import nn 4 | from torch.nn import CrossEntropyLoss 5 | from transformers.modeling_bert import BertPreTrainedModel, BertForQuestionAnswering 6 | from transformers.modeling_roberta import RobertaModel 7 | 8 | 9 | class RobertaForQuestionAnswering(BertPreTrainedModel): 10 | 11 | base_model_prefix = "roberta" 12 | def __init__(self, config): 13 | BertPreTrainedModel.__init__(self, config) 14 | self.num_labels = config.num_labels 15 | self.roberta = RobertaModel(config) 16 | self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) 17 | BertPreTrainedModel.init_weights(self) 18 | 19 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, start_positions=None, end_positions=None, **kwargs): 20 | 21 | outputs = self.roberta(input_ids, 22 | attention_mask=attention_mask, 23 | token_type_ids=token_type_ids, 24 | position_ids=position_ids, 25 | head_mask=head_mask, 26 | **kwargs) 27 | 28 | sequence_output = outputs[0] 29 | 30 | logits = self.qa_outputs(sequence_output) 31 | start_logits, end_logits = logits.split(1, dim=-1) 32 | start_logits = start_logits.squeeze(-1) 33 | end_logits = end_logits.squeeze(-1) 34 | 35 | outputs = (start_logits, end_logits,) + outputs[2:] 36 | if 
start_positions is not None and end_positions is not None: 37 | # If we are on multi-GPU, split add a dimension 38 | if len(start_positions.size()) > 1: 39 | start_positions = start_positions.squeeze(-1) 40 | if len(end_positions.size()) > 1: 41 | end_positions = end_positions.squeeze(-1) 42 | # sometimes the start/end positions are outside our model inputs, we ignore these terms 43 | ignored_index = start_logits.size(1) 44 | start_positions.clamp_(0, ignored_index) 45 | end_positions.clamp_(0, ignored_index) 46 | 47 | loss_fct = CrossEntropyLoss(ignore_index=ignored_index) 48 | start_loss = loss_fct(start_logits, start_positions) 49 | end_loss = loss_fct(end_logits, end_positions) 50 | total_loss = (start_loss + end_loss) / 2 51 | outputs = (total_loss,) + outputs 52 | 53 | return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) -------------------------------------------------------------------------------- /src/pequod/optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/optim/__init__.py -------------------------------------------------------------------------------- /src/pequod/optim/la.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import torch 4 | from torch.optim.optimizer import Optimizer 5 | 6 | 7 | class LookaheadWrapper(Optimizer): 8 | r"""Implements a Lookahead wrapper around a given optimizer 9 | """ 10 | 11 | def __init__(self, optimizer, la_steps, la_alpha=0.5): 12 | self.optimizer = optimizer 13 | self._la_step = 0 # counter for inner optimizer 14 | self.la_alpha = la_alpha 15 | self._total_la_steps = la_steps 16 | 17 | self.state = defaultdict(dict) 18 | 19 | # Cache the current optimizer parameters 20 | for group in optimizer.param_groups: 21 | for p in group['params']: 22 | param_state = self.state[p] 23 | param_state['cached_params'] = torch.zeros_like(p.data) 24 | param_state['cached_params'].copy_(p.data) 25 | 26 | def __getstate__(self): 27 | return self.optimizer.__getstate__() 28 | 29 | def __setstate__(self, state): 30 | self.optimizer.__setstate__(state) 31 | 32 | def zero_grad(self): 33 | self.optimizer.zero_grad() 34 | 35 | def state_dict(self): 36 | return self.optimizer.state_dict() 37 | 38 | def load_state_dict(self, state_dict): 39 | self.optimizer.load_state_dict(state_dict) 40 | 41 | @property 42 | def param_groups(self): 43 | return self.optimizer.param_groups 44 | 45 | def step(self, closure=None): 46 | """Performs a single Lookahead optimization step. 47 | Arguments: 48 | closure (callable, optional): A closure that reevaluates the model 49 | and returns the loss. 
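        Every la_steps inner steps, the parameters are reset to the
        interpolation la_alpha * fast + (1 - la_alpha) * cached between the
        current (fast) weights and the cached (slow) weights, and the cache is
        refreshed with the interpolated values.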
50 | """ 51 | loss = self.optimizer.step(closure) 52 | self._la_step += 1 53 | 54 | if self._la_step >= self._total_la_steps: 55 | self._la_step = 0 56 | # Lookahead and cache the current optimizer parameters 57 | for group in self.optimizer.param_groups: 58 | for p in group['params']: 59 | param_state = self.state[p] 60 | p.data.mul_(self.la_alpha).add_(1 - self.la_alpha, param_state['cached_params']) 61 | param_state['cached_params'].copy_(p.data) 62 | return loss 63 | -------------------------------------------------------------------------------- /src/pequod/optim/la0.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import torch 4 | from torch.optim.optimizer import Optimizer 5 | 6 | 7 | class Lookahead0Wrapper(Optimizer): 8 | r"""Implements a Lookahead wrapper around a given optimizer 9 | """ 10 | 11 | def __init__(self, optimizer, la_steps, la_alpha=0.5): 12 | self.optimizer = optimizer 13 | self._la_step = 0 # counter for inner optimizer 14 | self.la_alpha = la_alpha 15 | self._total_la_steps = la_steps 16 | 17 | self.state = defaultdict(dict) 18 | 19 | # Cache the current optimizer parameters 20 | for group in optimizer.param_groups: 21 | for p in group['params']: 22 | param_state = self.state[p] 23 | param_state['cached_params'] = torch.zeros_like(p.data) 24 | param_state['cached_params'].copy_(p.data) 25 | 26 | def __getstate__(self): 27 | return self.optimizer.__getstate__() 28 | 29 | def __setstate__(self, state): 30 | self.optimizer.__setstate__(state) 31 | 32 | def zero_grad(self): 33 | self.optimizer.zero_grad() 34 | 35 | def state_dict(self): 36 | return self.optimizer.state_dict() 37 | 38 | def load_state_dict(self, state_dict): 39 | self.optimizer.load_state_dict(state_dict) 40 | 41 | @property 42 | def param_groups(self): 43 | return self.optimizer.param_groups 44 | 45 | def step(self, closure=None): 46 | """Performs a single Lookahead optimization step. 47 | Arguments: 48 | closure (callable, optional): A closure that reevaluates the model 49 | and returns the loss. 
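        Note: unlike ``LookaheadWrapper`` in ``la.py``, this variant does not refresh the cached "slow" weights after synchronization (the ``copy_`` call below is commented out), so each interpolation pulls the parameters back toward the weights that were cached when the wrapper was constructed.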
50 | """ 51 | loss = self.optimizer.step(closure) 52 | self._la_step += 1 53 | 54 | if self._la_step >= self._total_la_steps: 55 | self._la_step = 0 56 | # Lookahead and cache the current optimizer parameters 57 | for group in self.optimizer.param_groups: 58 | for p in group['params']: 59 | param_state = self.state[p] 60 | p.data.mul_(self.la_alpha).add_(1 - self.la_alpha, param_state['cached_params']) 61 | # param_state['cached_params'].copy_(p.data) 62 | return loss 63 | -------------------------------------------------------------------------------- /src/pequod/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/text/__init__.py -------------------------------------------------------------------------------- /src/pequod/text/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/text/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/text/__pycache__/tokenization_sentencepiece.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/text/__pycache__/tokenization_sentencepiece.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/text/tokenization_sentencepiece.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import sentencepiece as spm 4 | from transformers.tokenization_utils import PreTrainedTokenizer 5 | 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class XLMRTokenizer(PreTrainedTokenizer): 11 | 12 | def __init__(self, bpe_file, dict_file, **kwargs): 13 | super(XLMRTokenizer, self).__init__( 14 | bos_token="<s>", 15 | eos_token="</s>", 16 | unk_token="<unk>", 17 | pad_token="<pad>", 18 | mask_token="<mask>", 19 | sep_token="</s>", 20 | cls_token="<s>", 21 | **kwargs) 22 | 23 | self.max_len_single_sentence = self.max_len - 2 24 | self.max_len_sentences_pair = self.max_len - 4 25 | 26 | self.sp = spm.SentencePieceProcessor() 27 | self.sp.Load(bpe_file) 28 | 29 | self.encoder = {} 30 | self.decoder = [] 31 | 32 | for token in [self.bos_token, self.pad_token, self.eos_token, self.unk_token]: 33 | self._add_token(token) 34 | 35 | with open(dict_file, encoding="utf-8") as fp: 36 | for line in fp: 37 | # NOTE DO NOT USE .split() 38 | tokens_cnt = line.rstrip().split(" ") 39 | try: 40 | assert len(tokens_cnt) >= 2, line 41 | except AssertionError: 42 | logger.error( 43 | "tokenizer line %s assert error, replaced as <unk> (index %d)" % ( 44 | line, len(self.decoder))) 45 | exit(0) 46 | self._add_token(" ".join(tokens_cnt[:-1])) 47 | 48 | def _add_token(self, token): 49 | idx = len(self.encoder) 50 | self.encoder[token] = idx 51 | self.decoder.append(token) 52 | 53 | def _tokenize(self, text): 54 | return self.sp.EncodeAsPieces(text) 55 | 56 | def _convert_id_to_token(self, index): 57 | return self.decoder[index] 58 | 59 | def _convert_token_to_id(self, token): 60 | return self.encoder.get(token, self.encoder.get(self.unk_token)) 61 | 62 | def convert_tokens_to_string(self, tokens): 63 | return "".join(tokens).replace('\u2581', ' ').strip() 64 | 65 | @classmethod 66 | def
from_pretrained(cls, model_path, **kwargs): 67 | bpe_file = os.path.join(model_path, "sentencepiece.bpe.model") 68 | dict_file = os.path.join(model_path, "dict.txt") 69 | tokenizer = cls(bpe_file, dict_file) 70 | return tokenizer 71 | 72 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): 73 | if token_ids_1 is None: 74 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 75 | cls = [self.cls_token_id] 76 | sep = [self.sep_token_id] 77 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep 78 | 79 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): 80 | if already_has_special_tokens: 81 | if token_ids_1 is not None: 82 | raise ValueError("You should not supply a second sequence if the provided sequence of ids is already formated with special tokens for the model.") 83 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 84 | 85 | if token_ids_1 is None: 86 | return [1] + ([0] * len(token_ids_0)) + [1] 87 | return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] 88 | 89 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): 90 | sep = [self.sep_token_id] 91 | cls = [self.cls_token_id] 92 | 93 | if token_ids_1 is None: 94 | return len(cls + token_ids_0 + sep) * [0] 95 | return len(cls + token_ids_0 + sep) * [0] + len(sep + token_ids_1 + sep) * [1] 96 | 97 | 98 | if __name__ == "__main__": 99 | tokenizer = XLMRTokenizer.from_pretrained("/home/v-zechi/data/unilm/zechi/exp/bert_data/xlmr-large") 100 | 101 | for text in ["Hello world!", "你好,世界", "नमस्ते दुनिया", "مرحبا بالعالم", "Bonjour le monde"]: 102 | print(tokenizer.tokenize(text)) 103 | print(tokenizer.encode_plus(text, text, add_special_tokens=True)) 104 | -------------------------------------------------------------------------------- /src/pequod/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/tools/__init__.py -------------------------------------------------------------------------------- /src/pequod/tools/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/tools/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/tools/__pycache__/convert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/tools/__pycache__/convert.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/training/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import os 4 | import random 5 | import torch 6 | import pickle 7 | import logging 8 | import numpy as np 9 | 10 | # from transformers import (WEIGHTS_NAME, 11 | # BertConfig, BertForSequenceClassification, BertTokenizer, 12 | # RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer, 13 | # RobertaModel, BertModel, XLMModel, 14 | # XLMConfig, XLMForSequenceClassification, XLMTokenizer, 15 | # XLNetConfig, XLNetForSequenceClassification, 
XLNetTokenizer, 16 | # DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer, 17 | # BertForQuestionAnswering) 18 | # 19 | # from src.pequod.model.roberta import RobertaForQuestionAnswering 20 | from transformers import XLMRobertaConfig, XLMRobertaForRetrieval, XLMRobertaTokenizer 21 | 22 | # ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ 23 | # for conf in (BertConfig, XLNetConfig, XLMConfig, 24 | # RobertaConfig, DistilBertConfig)), ()) 25 | 26 | ALL_MODELS = [] 27 | 28 | # # Model classes for classification 29 | # MODEL_CLASSES = { 30 | # 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), 31 | # 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 32 | # 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), 33 | # 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), 34 | # 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), 35 | # "xlmr": (RobertaConfig, RobertaForSequenceClassification, XLMRTokenizer) 36 | # } 37 | # 38 | # QA_MODELS = { 39 | # "bert": BertForQuestionAnswering, 40 | # "roberta": RobertaForQuestionAnswering, 41 | # "xlmr": RobertaForQuestionAnswering, 42 | # } 43 | 44 | BERT_CLASSES = { 45 | "xlmr": (XLMRobertaConfig, XLMRobertaForRetrieval, XLMRobertaTokenizer), 46 | } 47 | 48 | 49 | def to_cuda(tup): 50 | return tuple(t.cuda() for t in tup) 51 | 52 | 53 | def set_seed(args): 54 | random.seed(args.seed) 55 | np.random.seed(args.seed) 56 | torch.manual_seed(args.seed) 57 | #TODO multi gpu support 58 | # if args.n_gpu > 0: 59 | # torch.cuda.manual_seed_all(args.seed) 60 | 61 | 62 | def init_exp(args): 63 | # dump parameters 64 | set_dump_path(args) 65 | pickle.dump(args, open(os.path.join(args.dump_path, 'params.pkl'), 'wb')) 66 | 67 | # get running command 68 | command = ["python", sys.argv[0]] 69 | for x in sys.argv[1:]: 70 | if x.startswith('--'): 71 | assert '"' not in x and "'" not in x 72 | command.append(x) 73 | else: 74 | assert "'" not in x 75 | if re.match('^[a-zA-Z0-9_]+$', x): 76 | command.append("%s" % x) 77 | else: 78 | command.append("'%s'" % x) 79 | command = ' '.join(command) 80 | args.command = command + ' --exp_id "%s"' % args.exp_id 81 | 82 | # check experiment name 83 | assert len(args.exp_name.strip()) > 0 84 | 85 | logging.basicConfig( 86 | format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 87 | datefmt = '%m/%d/%Y %H:%M:%S', 88 | level = logging.INFO) 89 | logger = logging.getLogger(__name__) 90 | logger.info("\n".join( 91 | "%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) 92 | logger.info("The experiment will be stored in %s\n" % args.dump_path) 93 | logger.info("Running command: %s" % command) 94 | logger.info("") 95 | 96 | 97 | def set_dump_path(args, output_dir=None, exp_name=None): 98 | if output_dir is None: output_dir = args.output_dir 99 | if exp_name is None: exp_name = args.exp_name 100 | chars = 'abcdefghijklmnopqrstuvwxyz0123456789' 101 | while True: 102 | exp_id = ''.join(random.choice(chars) for _ in range(10)) 103 | if not os.path.isdir(os.path.join(output_dir, exp_name, exp_id)): 104 | break 105 | args.exp_id = exp_id 106 | dump_path = os.path.join(output_dir, exp_name, exp_id) 107 | os.makedirs(dump_path) 108 | args.dump_path = dump_path 109 | -------------------------------------------------------------------------------- /src/pequod/training/__pycache__/__init__.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/training/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/pequod/training/__pycache__/trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/training/__pycache__/trainer.cpython-37.pyc -------------------------------------------------------------------------------- /src/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/tools/__init__.py -------------------------------------------------------------------------------- /src/tools/check_many2many_alignment.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | if __name__ == "__main__": 4 | parser = argparse.ArgumentParser() 5 | 6 | # Required parameters 7 | parser.add_argument( 8 | "--translation_path", 9 | default=None, 10 | type=str, 11 | required=True, 12 | help="path to the translation file with tab-separated fields: src_sent, tgt_lang, tgt_sent[, alignment]", 13 | ) 14 | 15 | drop_languages = ["en", "zh-CN", "zh", "ja", "ko", "th", "my", "ml", "ta"] 16 | translate_languages = None 17 | args = parser.parse_args() 18 | src2tgt = {} 19 | print("Reading translation from {}".format(args.translation_path)) 20 | with open(args.translation_path, encoding="utf-8") as f: 21 | cnt = 0 22 | for line in f: 23 | cnt += 1 24 | if cnt % 10000 == 0: 25 | print("Reading lines {}".format(cnt)) 26 | items = line.split("\t") 27 | 28 | if len(items) == 3: 29 | src_sent, tgt_lang, tgt_sent = line.split("\t") 30 | alignment = []  # no alignment info on this line; the check below is skipped 31 | else: 32 | src_sent, tgt_lang, tgt_sent, alignment_str = line.split("\t") 33 | alignment = [] 34 | for x in alignment_str.split(" "): 35 | alignment.append((int(x.split("/")[0]), int(x.split("/")[1]))) 36 | 37 | if tgt_lang in drop_languages: 38 | continue 39 | if translate_languages is not None and tgt_lang not in translate_languages: 40 | continue 41 | 42 | cnt_src = {} 43 | cnt_tgt = {} 44 | for x in alignment: 45 | 46 | if x[0] not in cnt_src: 47 | cnt_src[x[0]] = 0 48 | cnt_src[x[0]] += 1 49 | 50 | if x[1] not in cnt_tgt: 51 | cnt_tgt[x[1]] = 0 52 | cnt_tgt[x[1]] += 1 53 | 54 | if not (cnt_src[x[0]] <= 1 or cnt_tgt[x[1]] <= 1): 55 | print(cnt_src, cnt_tgt) 56 | print(alignment) 57 | print(src_sent, tgt_sent) 58 | 59 | assert cnt_src[x[0]] <= 1 or cnt_tgt[x[1]] <= 1 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/tools/sample_xnli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | 5 | if __name__ == "__main__": 6 | parser = argparse.ArgumentParser() 7 | 8 | # Required parameters 9 | parser.add_argument( 10 | "--input_path", 11 | default=None, 12 | type=str, 13 | required=True, 14 | help="input xnli file", 15 | ) 16 | parser.add_argument( 17 | "--output_path", 18 | default=None, 19 | type=str, 20 | required=True, 21 | help="output xnli file", 22 | ) 23 | parser.add_argument( 24 | "--sample_ratio", 25 | default=None, 26 | type=float, 27 | required=True, 28 | help="sample ratio", 29 | ) 30 | 31 | args = parser.parse_args() 32 | lines = open(args.input_path, "r").readlines() 33 |
head = lines[0] 34 | lines = lines[1:] 35 | random.seed(0) 36 | random.shuffle(lines) 37 | 38 | n_lines = int(len(lines) * args.sample_ratio) 39 | 40 | fout = open(args.output_path, "w") 41 | fout.write(head) 42 | for i, line in enumerate(lines[:n_lines]): 43 | fout.write(line) -------------------------------------------------------------------------------- /src/transformers/activations.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def swish(x): 8 | return x * torch.sigmoid(x) 9 | 10 | 11 | def _gelu_python(x): 12 | """ Original Implementation of the gelu activation function in Google Bert repo when initially created. 13 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 14 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 15 | This is now written in C in torch.nn.functional 16 | Also see https://arxiv.org/abs/1606.08415 17 | """ 18 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 19 | 20 | 21 | if torch.__version__ < "1.4.0": 22 | gelu = _gelu_python 23 | else: 24 | gelu = F.gelu 25 | 26 | 27 | def gelu_new(x): 28 | """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 29 | Also see https://arxiv.org/abs/1606.08415 30 | """ 31 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 32 | 33 | 34 | ACT2FN = { 35 | "relu": F.relu, 36 | "swish": swish, 37 | "gelu": gelu, 38 | "tanh": F.tanh, 39 | "gelu_new": gelu_new, 40 | } 41 | 42 | 43 | def get_activation(activation_string): 44 | if activation_string in ACT2FN: 45 | return ACT2FN[activation_string] 46 | else: 47 | raise KeyError( 48 | "function {} not found in ACT2FN mapping {} or torch.nn.functional".format( 49 | activation_string, list(ACT2FN.keys()) 50 | ) 51 | ) 52 | -------------------------------------------------------------------------------- /src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /src/transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 
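    # Example invocation (sketch; assumes the package's standard ``transformers-cli``
    # console entry point is installed for this vendored copy):
    #
    #     transformers-cli download xlm-roberta-base --cache-dir ./models --force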
23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 | -------------------------------------------------------------------------------- /src/transformers/commands/env.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from argparse import ArgumentParser 3 | 4 | from transformers import __version__ as version 5 | from transformers import is_tf_available, is_torch_available 6 | from transformers.commands import BaseTransformersCLICommand 7 | 8 | 9 | def info_command_factory(_): 10 | return EnvironmentCommand() 11 | 12 | 13 | class EnvironmentCommand(BaseTransformersCLICommand): 14 | @staticmethod 15 | def register_subcommand(parser: ArgumentParser): 16 | download_parser = parser.add_parser("env") 17 | download_parser.set_defaults(func=info_command_factory) 18 | 19 | def run(self): 20 | pt_version = "not installed" 21 | pt_cuda_available = "NA" 22 | if is_torch_available(): 23 | import torch 24 | 25 | pt_version = torch.__version__ 26 | pt_cuda_available = torch.cuda.is_available() 27 | 28 | tf_version = "not installed" 29 | tf_cuda_available = "NA" 30 | if is_tf_available(): 31 | import tensorflow as tf 32 | 33 | tf_version = tf.__version__ 34 | try: 35 | # deprecated in v2.1 36 | tf_cuda_available = tf.test.is_gpu_available() 37 | except AttributeError: 38 | # returns list of devices, convert to bool 39 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) 40 | 41 | info = { 42 | "`transformers` version": version, 43 | "Platform": platform.platform(), 44 | "Python version": platform.python_version(), 45 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), 46 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), 47 | "Using GPU in script?": "", 48 | "Using distributed or parallel set-up in script?": "", 49 | } 50 | 51 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") 52 | print(self.format_dict(info)) 53 | 54 | return info 55 | 56 | @staticmethod 57 | def format_dict(d): 58 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" 59 | -------------------------------------------------------------------------------- /src/transformers/commands/run.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands import BaseTransformersCLICommand 5 | from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline 6 | 7 | 8 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 9 | 10 | 11 | def try_infer_format_from_ext(path: str): 12 | if not path: 13 | return "pipe" 14 | 15 | for ext in PipelineDataFormat.SUPPORTED_FORMATS: 16 | if path.endswith(ext): 17 | return ext 18 | 19 | raise Exception( 20 | "Unable to determine file format from file extension {}. 
" 21 | "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS) 22 | ) 23 | 24 | 25 | def run_command_factory(args): 26 | nlp = pipeline( 27 | task=args.task, 28 | model=args.model if args.model else None, 29 | config=args.config, 30 | tokenizer=args.tokenizer, 31 | device=args.device, 32 | ) 33 | format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format 34 | reader = PipelineDataFormat.from_str( 35 | format=format, 36 | output_path=args.output, 37 | input_path=args.input, 38 | column=args.column if args.column else nlp.default_input_names, 39 | overwrite=args.overwrite, 40 | ) 41 | return RunCommand(nlp, reader) 42 | 43 | 44 | class RunCommand(BaseTransformersCLICommand): 45 | def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): 46 | self._nlp = nlp 47 | self._reader = reader 48 | 49 | @staticmethod 50 | def register_subcommand(parser: ArgumentParser): 51 | run_parser = parser.add_parser("run", help="Run a pipeline through the CLI") 52 | run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run") 53 | run_parser.add_argument("--input", type=str, help="Path to the file to use for inference") 54 | run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.") 55 | run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.") 56 | run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.") 57 | run_parser.add_argument( 58 | "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)" 59 | ) 60 | run_parser.add_argument( 61 | "--column", 62 | type=str, 63 | help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)", 64 | ) 65 | run_parser.add_argument( 66 | "--format", 67 | type=str, 68 | default="infer", 69 | choices=PipelineDataFormat.SUPPORTED_FORMATS, 70 | help="Input format to read from", 71 | ) 72 | run_parser.add_argument( 73 | "--device", 74 | type=int, 75 | default=-1, 76 | help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", 77 | ) 78 | run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.") 79 | run_parser.set_defaults(func=run_command_factory) 80 | 81 | def run(self): 82 | nlp, outputs = self._nlp, [] 83 | 84 | for entry in self._reader: 85 | output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) 86 | if isinstance(output, dict): 87 | outputs.append(output) 88 | else: 89 | outputs += output 90 | 91 | # Saving data 92 | if self._nlp.binary_output: 93 | binary_path = self._reader.save_binary(outputs) 94 | logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path)) 95 | else: 96 | self._reader.save(outputs) 97 | -------------------------------------------------------------------------------- /src/transformers/configuration_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ BART configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "bart-large": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/config.json", 27 | "bart-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/config.json", 28 | "bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json", 29 | } 30 | 31 | 32 | class BartConfig(PretrainedConfig): 33 | r""" 34 | Configuration class for Bart. Parameters are renamed from the fairseq implementation 35 | """ 36 | model_type = "bart" 37 | pretrained_config_archive_map = BART_PRETRAINED_CONFIG_ARCHIVE_MAP 38 | 39 | def __init__( 40 | self, 41 | activation_dropout=0.0, 42 | vocab_size=50265, 43 | pad_token_id=1, 44 | eos_token_id=2, 45 | d_model=1024, 46 | encoder_ffn_dim=4096, 47 | encoder_layers=12, 48 | encoder_attention_heads=16, 49 | decoder_ffn_dim=4096, 50 | decoder_layers=12, 51 | decoder_attention_heads=16, 52 | encoder_layerdrop=0.0, 53 | decoder_layerdrop=0.0, 54 | attention_dropout=0.0, 55 | dropout=0.1, 56 | max_position_embeddings=1024, 57 | init_std=0.02, 58 | classifier_dropout=0.0, 59 | output_past=False, 60 | num_labels=3, 61 | bos_token_id=0, 62 | **common_kwargs 63 | ): 64 | r""" 65 | :class:`~transformers.BartConfig` is the configuration class for `BartModel`. 
66 | Examples: 67 | config = BartConfig.from_pretrained('bart-large') 68 | model = BartModel(config) 69 | """ 70 | super().__init__( 71 | num_labels=num_labels, 72 | output_past=output_past, 73 | pad_token_id=pad_token_id, 74 | bos_token_id=bos_token_id, 75 | **common_kwargs, 76 | ) 77 | self.vocab_size = vocab_size 78 | self.d_model = d_model # encoder_embed_dim and decoder_embed_dim 79 | self.eos_token_id = eos_token_id 80 | self.encoder_ffn_dim = encoder_ffn_dim 81 | self.encoder_layers = self.num_hidden_layers = encoder_layers 82 | self.encoder_attention_heads = encoder_attention_heads 83 | self.encoder_layerdrop = encoder_layerdrop 84 | self.decoder_layerdrop = decoder_layerdrop 85 | self.decoder_ffn_dim = decoder_ffn_dim 86 | self.decoder_layers = decoder_layers 87 | self.decoder_attention_heads = decoder_attention_heads 88 | self.max_position_embeddings = max_position_embeddings 89 | self.init_std = init_std # Normal(0, this parameter) 90 | 91 | # 3 Types of Dropout 92 | self.attention_dropout = attention_dropout 93 | self.activation_dropout = activation_dropout 94 | self.dropout = dropout 95 | 96 | # Classifier stuff 97 | self.classif_dropout = classifier_dropout 98 | 99 | @property 100 | def num_attention_heads(self): 101 | return self.encoder_attention_heads 102 | 103 | @property 104 | def hidden_size(self): 105 | return self.d_model 106 | -------------------------------------------------------------------------------- /src/transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", 29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", 30 | } 31 | 32 | 33 | class CamembertConfig(RobertaConfig): 34 | """ 35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 36 | superclass for the appropriate documentation alongside usage examples. 
37 | """ 38 | 39 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 40 | model_type = "camembert" 41 | -------------------------------------------------------------------------------- /src/transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | 19 | import logging 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class MMBTConfig(object): 26 | """Configuration class to store the configuration of a `MMBT Model`. 27 | 28 | Args: 29 | config (:obj:`~transformers.PreTrainedConfig`): 30 | Config of the underlying Transformer models. Its values are 31 | copied over to use a single config. 32 | num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, optional, defautls to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /src/transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_bert import BertConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 28 | "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 29 | "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 30 | "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", 31 | "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", 32 | "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", 33 | } 34 | 35 | 36 | class RobertaConfig(BertConfig): 37 | r""" 38 | This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`. 39 | It is used to instantiate an RoBERTa model according to the specified arguments, defining the model 40 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 41 | the BERT `bert-base-uncased `__ architecture. 42 | 43 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 44 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 45 | for more information. 46 | 47 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. 48 | It reuses the same defaults. Please check the parent class for more information. 49 | 50 | Example:: 51 | 52 | from transformers import RobertaConfig, RobertaModel 53 | 54 | # Initializing a RoBERTa configuration 55 | configuration = RobertaConfig() 56 | 57 | # Initializing a model from the configuration 58 | model = RobertaModel(configuration) 59 | 60 | # Accessing the model configuration 61 | configuration = model.config 62 | 63 | Attributes: 64 | pretrained_config_archive_map (Dict[str, str]): 65 | A dictionary containing all the available pre-trained checkpoints. 66 | """ 67 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 68 | model_type = "roberta" 69 | -------------------------------------------------------------------------------- /src/transformers/configuration_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2010, The T5 Authors and HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ T5 model configuration """ 16 | 17 | 18 | import logging 19 | 20 | from .configuration_utils import PretrainedConfig 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", 27 | "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", 28 | "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", 29 | "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", 30 | "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", 31 | } 32 | 33 | 34 | class T5Config(PretrainedConfig): 35 | r""" 36 | :class:`~transformers.T5Config` is the configuration class to store the configuration of a 37 | `T5Model`. 38 | 39 | 40 | Arguments: 41 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`. 42 | hidden_size: Size of the encoder layers and the pooler layer. 43 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 44 | num_attention_heads: Number of attention heads for each attention layer in 45 | the Transformer encoder. 46 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 47 | layer in the Transformer encoder. 48 | hidden_act: The non-linear activation function (function or string) in the 49 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 50 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 51 | layers in the embeddings, encoder, and pooler. 52 | attention_probs_dropout_prob: The dropout ratio for the attention 53 | probabilities. 54 | max_position_embeddings: The maximum sequence length that this model might 55 | ever be used with. Typically set this to something large just in case 56 | (e.g., 512 or 1024 or 2048). 57 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 58 | `T5Model`. 59 | initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing). 60 | layer_norm_eps: The epsilon used by LayerNorm. 
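        Example (illustrative sketch; the keyword values simply restate the defaults defined below):

            from transformers.configuration_t5 import T5Config

            config = T5Config(d_model=512, d_ff=2048, num_layers=6, num_heads=8)
            assert config.hidden_size == config.d_model  # exposed through the property below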
61 | """ 62 | pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP 63 | model_type = "t5" 64 | 65 | def __init__( 66 | self, 67 | vocab_size=32128, 68 | n_positions=512, 69 | d_model=512, 70 | d_kv=64, 71 | d_ff=2048, 72 | num_layers=6, 73 | num_heads=8, 74 | relative_attention_num_buckets=32, 75 | dropout_rate=0.1, 76 | layer_norm_epsilon=1e-6, 77 | initializer_factor=1.0, 78 | **kwargs 79 | ): 80 | super().__init__(**kwargs) 81 | self.vocab_size = vocab_size 82 | self.n_positions = n_positions 83 | self.d_model = d_model 84 | self.d_kv = d_kv 85 | self.d_ff = d_ff 86 | self.num_layers = num_layers 87 | self.num_heads = num_heads 88 | self.relative_attention_num_buckets = relative_attention_num_buckets 89 | self.dropout_rate = dropout_rate 90 | self.layer_norm_epsilon = layer_norm_epsilon 91 | self.initializer_factor = initializer_factor 92 | 93 | @property 94 | def max_position_embeddings(self): 95 | return self.n_positions 96 | 97 | @property 98 | def hidden_size(self): 99 | return self.d_model 100 | 101 | @property 102 | def num_attention_heads(self): 103 | return self.num_heads 104 | 105 | @property 106 | def num_hidden_layers(self): 107 | return self.num_layers 108 | -------------------------------------------------------------------------------- /src/transformers/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XLM-RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", 28 | "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", 29 | "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", 30 | "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", 31 | "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", 32 | "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", 33 | } 34 | 35 | 36 | class XLMRobertaConfig(RobertaConfig): 37 | """ 38 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 39 | superclass for the appropriate documentation alongside usage examples. 
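        Example (illustrative sketch; ``xlm-roberta-base`` is one of the checkpoints listed in the archive map above):

            from transformers.configuration_xlm_roberta import XLMRobertaConfig

            config = XLMRobertaConfig.from_pretrained("xlm-roberta-base")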
40 | """ 41 | 42 | pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 43 | model_type = "xlm-roberta" 44 | -------------------------------------------------------------------------------- /src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForMaskedLM(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BART checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | from pathlib import Path 21 | 22 | import fairseq 23 | import torch 24 | from packaging import version 25 | 26 | from transformers import BartConfig, BartForMaskedLM, BartForSequenceClassification, BartModel, BartTokenizer 27 | 28 | 29 | FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn"] 30 | 31 | if version.parse(fairseq.__version__) < version.parse("0.9.0"): 32 | raise Exception("requires fairseq >= 0.9.0") 33 | 34 | 35 | logging.basicConfig(level=logging.INFO) 36 | logger = logging.getLogger(__name__) 37 | 38 | SAMPLE_TEXT = " Hello world! cécé herlolip" 39 | 40 | rename_keys = [ 41 | ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), 42 | ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), 43 | ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), 44 | ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), 45 | ] 46 | IGNORE_KEYS = ["encoder.version", "decoder.version", "model.encoder.version", "model.decoder.version", "_float_tensor"] 47 | 48 | 49 | def rename_key(dct, old, new): 50 | val = dct.pop(old) 51 | dct[new] = val 52 | 53 | 54 | def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path): 55 | """ 56 | Copy/paste/tweak model's weights to our BERT structure. 57 | """ 58 | bart = torch.hub.load("pytorch/fairseq", checkpoint_path) 59 | bart.eval() # disable dropout 60 | bart.model.upgrade_state_dict(bart.model.state_dict()) 61 | hf_model_name = checkpoint_path.replace(".", "-") 62 | config = BartConfig.from_pretrained(hf_model_name) 63 | tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) 64 | tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) 65 | assert torch.eq(tokens, tokens2).all() 66 | 67 | if checkpoint_path in ["bart.large", "bart.large.cnn"]: 68 | state_dict = bart.model.state_dict() 69 | for k in IGNORE_KEYS: 70 | state_dict.pop(k, None) 71 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] 72 | model = BartModel(config) 73 | their_output = bart.extract_features(tokens) 74 | else: # MNLI Case 75 | state_dict = bart.state_dict() 76 | for k in IGNORE_KEYS: 77 | state_dict.pop(k, None) 78 | state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] 79 | for src, dest in rename_keys: 80 | rename_key(state_dict, src, dest) 81 | model = BartForSequenceClassification(config) 82 | their_output = bart.predict("mnli", tokens, return_logits=True) 83 | 84 | # Load state dict 85 | model.load_state_dict(state_dict) 86 | model.eval() 87 | # Check results 88 | 89 | if checkpoint_path == "bart.large.cnn": # generate doesnt work yet 90 | model = BartForMaskedLM(config, base_model=model) 91 | assert "lm_head.weight" in model.state_dict() 92 | assert model.lm_head.out_features == config.max_position_embeddings 93 | model.eval() 94 | our_outputs = model.model.forward(tokens)[0] 95 | else: 96 | our_outputs = model.forward(tokens)[0] 97 | assert their_output.shape == our_outputs.shape 98 | assert (their_output == our_outputs).all().item() 99 | Path(pytorch_dump_folder_path).mkdir(exist_ok=True) 100 | model.save_pretrained(pytorch_dump_folder_path) 101 | 102 | 103 | if __name__ == "__main__": 104 | parser = 
argparse.ArgumentParser() 105 | # Required parameters 106 | parser.add_argument("fairseq_path", choices=FAIRSEQ_MODELS, type=str, help="") 107 | 108 | parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") 109 | args = parser.parse_args() 110 | convert_bart_checkpoint( 111 | args.fairseq_path, args.pytorch_dump_folder_path, 112 | ) 113 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import argparse 19 | import os 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | import torch 24 | 25 | from transformers import BertModel 26 | 27 | 28 | def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): 29 | 30 | """ 31 | :param model:BertModel Pytorch model instance to be converted 32 | :param ckpt_dir: Tensorflow model directory 33 | :param model_name: model name 34 | :return: 35 | 36 | Currently supported HF models: 37 | Y BertModel 38 | N BertForMaskedLM 39 | N BertForPreTraining 40 | N BertForMultipleChoice 41 | N BertForNextSentencePrediction 42 | N BertForSequenceClassification 43 | N BertForQuestionAnswering 44 | """ 45 | 46 | tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") 47 | 48 | var_map = ( 49 | ("layer.", "layer_"), 50 | ("word_embeddings.weight", "word_embeddings"), 51 | ("position_embeddings.weight", "position_embeddings"), 52 | ("token_type_embeddings.weight", "token_type_embeddings"), 53 | (".", "/"), 54 | ("LayerNorm/weight", "LayerNorm/gamma"), 55 | ("LayerNorm/bias", "LayerNorm/beta"), 56 | ("weight", "kernel"), 57 | ) 58 | 59 | if not os.path.isdir(ckpt_dir): 60 | os.makedirs(ckpt_dir) 61 | 62 | state_dict = model.state_dict() 63 | 64 | def to_tf_var_name(name: str): 65 | for patt, repl in iter(var_map): 66 | name = name.replace(patt, repl) 67 | return "bert/{}".format(name) 68 | 69 | def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): 70 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 71 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 72 | session.run(tf.variables_initializer([tf_var])) 73 | session.run(tf_var) 74 | return tf_var 75 | 76 | tf.reset_default_graph() 77 | with tf.Session() as session: 78 | for var_name in state_dict: 79 | tf_name = to_tf_var_name(var_name) 80 | torch_tensor = state_dict[var_name].numpy() 81 | if any([x in var_name for x in tensors_to_transpose]): 82 | torch_tensor = torch_tensor.T 83 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 84 | tf.keras.backend.set_value(tf_var, torch_tensor) 85 | tf_weight = session.run(tf_var) 86 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 87 | 88 | saver = tf.train.Saver(tf.trainable_variables()) 89 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 90 | 91 | 92 | def main(raw_args=None): 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument("--model_name", type=str, required=True, help="model name e.g. 
bert-base-uncased") 95 | parser.add_argument( 96 | "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" 97 | ) 98 | parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") 99 | parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") 100 | args = parser.parse_args(raw_args) 101 | 102 | model = BertModel.from_pretrained( 103 | pretrained_model_name_or_path=args.model_name, 104 | state_dict=torch.load(args.pytorch_model_path), 105 | cache_dir=args.cache_dir, 106 | ) 107 | 108 | convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if gpt2_config_file == "": 32 | config = GPT2Config() 33 | else: 34 | config = GPT2Config.from_json_file(gpt2_config_file) 35 | model = GPT2Model(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 55 | ) 56 | parser.add_argument( 57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 58 | ) 59 | parser.add_argument( 60 | "--gpt2_config_file", 61 | default="", 62 | type=str, 63 | help="An optional config json file corresponding to the pre-trained OpenAI model. 
\n" 64 | "This specifies the model architecture.", 65 | ) 66 | args = parser.parse_args() 67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 68 | -------------------------------------------------------------------------------- /src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if openai_config_file == "": 32 | config = OpenAIGPTConfig() 33 | else: 34 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 35 | model = OpenAIGPTModel(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--openai_checkpoint_folder_path", 55 | default=None, 56 | type=str, 57 | required=True, 58 | help="Path to the TensorFlow checkpoint path.", 59 | ) 60 | parser.add_argument( 61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 62 | ) 63 | parser.add_argument( 64 | "--openai_config_file", 65 | default="", 66 | type=str, 67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.", 69 | ) 70 | args = parser.parse_args() 71 | convert_openai_checkpoint_to_pytorch( 72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 73 | ) 74 | -------------------------------------------------------------------------------- /src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import T5Config, T5Model, load_tf_weights_in_t5 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = T5Config.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = T5Model(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained T5 model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
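# Illustrative programmatic use of the XLM converter defined in this file. This is a
# sketch only: both paths are placeholders, and the import assumes the bundled
# src/transformers package is importable as `transformers`.
#
#     from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import (
#         convert_xlm_checkpoint_to_pytorch,
#     )
#
#     # Reads the original XLM dump and writes the weights, config and vocab files
#     # (WEIGHTS_NAME, CONFIG_NAME, VOCAB_FILES_NAMES["vocab_file"]) into the folder.
#     convert_xlm_checkpoint_to_pytorch(
#         "/path/to/xlm_checkpoint.pth",  # placeholder: original XLM PyTorch checkpoint
#         "/path/to/output_dir",          # placeholder: output folder
#     )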
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import json 20 | import logging 21 | 22 | import numpy 23 | import torch 24 | 25 | from transformers import CONFIG_NAME, WEIGHTS_NAME 26 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES 27 | 28 | 29 | logging.basicConfig(level=logging.INFO) 30 | 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") 35 | 36 | state_dict = chkpt["model"] 37 | 38 | # We have the base model one level deeper than the original XLM repository 39 | two_levels_state_dict = {} 40 | for k, v in state_dict.items(): 41 | if "pred_layer" in k: 42 | two_levels_state_dict[k] = v 43 | else: 44 | two_levels_state_dict["transformer." + k] = v 45 | 46 | config = chkpt["params"] 47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 48 | 49 | vocab = chkpt["dico_word2id"] 50 | vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) 51 | 52 | # Save pytorch-model 53 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 54 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] 56 | 57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 59 | 60 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 62 | f.write(json.dumps(config, indent=2) + "\n") 63 | 64 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 66 | f.write(json.dumps(vocab, indent=2) + "\n") 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | # Required parameters 72 | parser.add_argument( 73 | "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." 74 | ) 75 | parser.add_argument( 76 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 77 | ) 78 | args = parser.parse_args() 79 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 80 | -------------------------------------------------------------------------------- /src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
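# A minimal sketch of calling the XLNet converter defined in this file from Python
# rather than the command line. Paths are placeholders, and the import assumes the
# bundled src/transformers package is importable as `transformers`.
#
#     from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import (
#         convert_xlnet_checkpoint_to_pytorch,
#     )
#
#     convert_xlnet_checkpoint_to_pytorch(
#         "/path/to/xlnet_model.ckpt",   # placeholder: TF checkpoint prefix
#         "/path/to/xlnet_config.json",  # placeholder: XLNet config file
#         "/path/to/output_dir",         # placeholder: PyTorch dump folder
#         finetuning_task="sts-b",       # optional: selects the sequence-classification head below
#     )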
15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | import os 21 | 22 | import torch 23 | 24 | from transformers import ( 25 | CONFIG_NAME, 26 | WEIGHTS_NAME, 27 | XLNetConfig, 28 | XLNetForQuestionAnswering, 29 | XLNetForSequenceClassification, 30 | XLNetLMHeadModel, 31 | load_tf_weights_in_xlnet, 32 | ) 33 | 34 | 35 | GLUE_TASKS_NUM_LABELS = { 36 | "cola": 2, 37 | "mnli": 3, 38 | "mrpc": 2, 39 | "sst-2": 2, 40 | "sts-b": 1, 41 | "qqp": 2, 42 | "qnli": 2, 43 | "rte": 2, 44 | "wnli": 2, 45 | } 46 | 47 | 48 | logging.basicConfig(level=logging.INFO) 49 | 50 | 51 | def convert_xlnet_checkpoint_to_pytorch( 52 | tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None 53 | ): 54 | # Initialise PyTorch model 55 | config = XLNetConfig.from_json_file(bert_config_file) 56 | 57 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 58 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 59 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 60 | config.finetuning_task = finetuning_task 61 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 62 | model = XLNetForSequenceClassification(config) 63 | elif "squad" in finetuning_task: 64 | config.finetuning_task = finetuning_task 65 | model = XLNetForQuestionAnswering(config) 66 | else: 67 | model = XLNetLMHeadModel(config) 68 | 69 | # Load weights from tf checkpoint 70 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 71 | 72 | # Save pytorch-model 73 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 74 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 75 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 76 | torch.save(model.state_dict(), pytorch_weights_dump_path) 77 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 78 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 79 | f.write(config.to_json_string()) 80 | 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser() 84 | # Required parameters 85 | parser.add_argument( 86 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 87 | ) 88 | parser.add_argument( 89 | "--xlnet_config_file", 90 | default=None, 91 | type=str, 92 | required=True, 93 | help="The config json file corresponding to the pre-trained XLNet model. \n" 94 | "This specifies the model architecture.", 95 | ) 96 | parser.add_argument( 97 | "--pytorch_dump_folder_path", 98 | default=None, 99 | type=str, 100 | required=True, 101 | help="Path to the folder to store the PyTorch model or dataset/vocab.", 102 | ) 103 | parser.add_argument( 104 | "--finetuning_task", 105 | default=None, 106 | type=str, 107 | help="Name of a task on which the XLNet TensorFloaw model was fine-tuned", 108 | ) 109 | args = parser.parse_args() 110 | print(args) 111 | 112 | convert_xlnet_checkpoint_to_pytorch( 113 | args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task 114 | ) 115 | -------------------------------------------------------------------------------- /src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. 
So, don't check this module at all. 4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | 20 | xglue_convert_examples_to_features, 21 | xglue_convert_examples_to_vat_features, 22 | xglue_output_modes, 23 | xglue_processors, 24 | xglue_tasks_num_labels, 25 | 26 | xtreme_convert_examples_to_features, 27 | xtreme_output_modes, 28 | xtreme_processors, 29 | xtreme_tasks_num_labels, 30 | 31 | squad_convert_examples_to_features, 32 | xnli_output_modes, 33 | xnli_processors, 34 | xnli_tasks_num_labels, 35 | ) 36 | 37 | 38 | if is_sklearn_available(): 39 | from .metrics import glue_compute_metrics, xnli_compute_metrics, xglue_compute_metrics, xtreme_compute_metrics 40 | -------------------------------------------------------------------------------- /src/transformers/data/metrics/evaluate_squad.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Based on the SQuAD evaluation script from: 3 | # https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py 16 | """ Official evaluation script for v1.1 of the SQuAD dataset. 
""" 17 | from __future__ import print_function 18 | from collections import Counter 19 | import string 20 | import re 21 | import argparse 22 | import json 23 | import sys 24 | 25 | 26 | def normalize_answer(s): 27 | """Lower text and remove punctuation, articles and extra whitespace.""" 28 | def remove_articles(text): 29 | return re.sub(r'\b(a|an|the)\b', ' ', text) 30 | 31 | def white_space_fix(text): 32 | return ' '.join(text.split()) 33 | 34 | def remove_punc(text): 35 | exclude = set(string.punctuation) 36 | return ''.join(ch for ch in text if ch not in exclude) 37 | 38 | def lower(text): 39 | return text.lower() 40 | 41 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 42 | 43 | 44 | def f1_score(prediction, ground_truth): 45 | prediction_tokens = normalize_answer(prediction).split() 46 | ground_truth_tokens = normalize_answer(ground_truth).split() 47 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 48 | num_same = sum(common.values()) 49 | if num_same == 0: 50 | return 0 51 | precision = 1.0 * num_same / len(prediction_tokens) 52 | recall = 1.0 * num_same / len(ground_truth_tokens) 53 | f1 = (2 * precision * recall) / (precision + recall) 54 | return f1 55 | 56 | 57 | def exact_match_score(prediction, ground_truth): 58 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 59 | 60 | 61 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 62 | scores_for_ground_truths = [] 63 | for ground_truth in ground_truths: 64 | score = metric_fn(prediction, ground_truth) 65 | scores_for_ground_truths.append(score) 66 | return max(scores_for_ground_truths) 67 | 68 | 69 | def evaluate(dataset, predictions): 70 | f1 = exact_match = total = 0 71 | for article in dataset: 72 | for paragraph in article['paragraphs']: 73 | for qa in paragraph['qas']: 74 | total += 1 75 | if qa['id'] not in predictions: 76 | message = 'Unanswered question ' + qa['id'] + \ 77 | ' will receive score 0.' 
78 | print(message, file=sys.stderr) 79 | continue 80 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 81 | prediction = predictions[qa['id']] 82 | exact_match += metric_max_over_ground_truths( 83 | exact_match_score, prediction, ground_truths) 84 | f1 += metric_max_over_ground_truths( 85 | f1_score, prediction, ground_truths) 86 | 87 | exact_match = 100.0 * exact_match / total 88 | f1 = 100.0 * f1 / total 89 | 90 | return {'exact_match': exact_match, 'f1': f1} 91 | 92 | 93 | def evaluate_with_path(dataset_file, prediction_file): 94 | with open(dataset_file) as dataset_file_reader: 95 | dataset_json = json.load(dataset_file_reader) 96 | dataset = dataset_json['data'] 97 | with open(prediction_file) as prediction_file_reader: 98 | predictions = json.load(prediction_file_reader) 99 | return evaluate(dataset, predictions) 100 | 101 | if __name__ == '__main__': 102 | expected_version = '1.1' 103 | parser = argparse.ArgumentParser( 104 | description='Evaluation for SQuAD ' + expected_version) 105 | parser.add_argument('dataset_file', help='Dataset file') 106 | parser.add_argument('prediction_file', help='Prediction File') 107 | args = parser.parse_args() 108 | with open(args.dataset_file) as dataset_file: 109 | dataset_json = json.load(dataset_file) 110 | if (dataset_json['version'] != expected_version): 111 | print('Evaluation expects v-' + expected_version + 112 | ', but got dataset with v-' + dataset_json['version'], 113 | file=sys.stderr) 114 | dataset = dataset_json['data'] 115 | with open(args.prediction_file) as prediction_file: 116 | predictions = json.load(prediction_file) 117 | print(json.dumps(evaluate(dataset, predictions))) -------------------------------------------------------------------------------- /src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .xglue import xglue_convert_examples_to_features, xglue_output_modes, xglue_processors, xglue_tasks_num_labels 6 | from .xtreme import xtreme_convert_examples_to_features, xtreme_output_modes, xtreme_processors, xtreme_tasks_num_labels 7 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 8 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 9 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 10 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 11 | from .xglue import xglue_convert_examples_to_vat_features 12 | -------------------------------------------------------------------------------- /src/transformers/data/processors/xnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XNLI utils (dataset loading and evaluation) """ 17 | 18 | 19 | import logging 20 | import os 21 | 22 | from .utils import DataProcessor, InputExample 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class XnliProcessor(DataProcessor): 29 | """Processor for the XNLI dataset. 30 | Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" 31 | 32 | def __init__(self, language, train_language=None): 33 | self.language = language 34 | self.train_language = train_language 35 | 36 | def get_train_examples(self, data_dir): 37 | """See base class.""" 38 | lg = self.language if self.train_language is None else self.train_language 39 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg))) 40 | examples = [] 41 | for (i, line) in enumerate(lines): 42 | if i == 0: 43 | continue 44 | guid = "%s-%s" % ("train", i) 45 | text_a = line[0] 46 | text_b = line[1] 47 | label = "contradiction" if line[2] == "contradictory" else line[2] 48 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 49 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 50 | return examples 51 | 52 | def get_test_examples(self, data_dir): 53 | """See base class.""" 54 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) 55 | examples = [] 56 | for (i, line) in enumerate(lines): 57 | if i == 0: 58 | continue 59 | language = line[0] 60 | if language != self.language: 61 | continue 62 | guid = "%s-%s" % ("test", i) 63 | text_a = line[6] 64 | text_b = line[7] 65 | label = line[1] 66 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) 67 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 68 | return examples 69 | 70 | def get_labels(self): 71 | """See base class.""" 72 | return ["contradiction", "entailment", "neutral"] 73 | 74 | 75 | xnli_processors = { 76 | "xnli": XnliProcessor, 77 | } 78 | 79 | xnli_output_modes = { 80 | "xnli": "classification", 81 | } 82 | 83 | xnli_tasks_num_labels = { 84 | "xnli": 3, 85 | } 86 | -------------------------------------------------------------------------------- /src/transformers/modeling_tf_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ TF 2.0 RoBERTa model. """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_camembert import CamembertConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_tf_roberta import ( 24 | TFRobertaForMaskedLM, 25 | TFRobertaForSequenceClassification, 26 | TFRobertaForTokenClassification, 27 | TFRobertaModel, 28 | ) 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {} 34 | 35 | 36 | CAMEMBERT_START_DOCSTRING = r""" 37 | 38 | .. note:: 39 | 40 | TF 2.0 models accepts two formats as inputs: 41 | 42 | - having all inputs as keyword arguments (like PyTorch models), or 43 | - having all inputs as a list, tuple or dict in the first positional arguments. 44 | 45 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having 46 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 47 | 48 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors 49 | in the first positional argument : 50 | 51 | - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` 52 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: 53 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` 54 | - a dictionary with one or several input Tensors associated to the input names given in the docstring: 55 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` 56 | 57 | Parameters: 58 | config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the 59 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 60 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 61 | """ 62 | 63 | 64 | @add_start_docstrings( 65 | "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", 66 | CAMEMBERT_START_DOCSTRING, 67 | ) 68 | class TFCamembertModel(TFRobertaModel): 69 | """ 70 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the 71 | superclass for the appropriate documentation alongside usage examples. 72 | """ 73 | 74 | config_class = CamembertConfig 75 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 76 | 77 | 78 | @add_start_docstrings( 79 | """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, 80 | ) 81 | class TFCamembertForMaskedLM(TFRobertaForMaskedLM): 82 | """ 83 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the 84 | superclass for the appropriate documentation alongside usage examples. 85 | """ 86 | 87 | config_class = CamembertConfig 88 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 89 | 90 | 91 | @add_start_docstrings( 92 | """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer 93 | on top of the pooled output) e.g. for GLUE tasks. """, 94 | CAMEMBERT_START_DOCSTRING, 95 | ) 96 | class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification): 97 | """ 98 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. 
Please check the 99 | superclass for the appropriate documentation alongside usage examples. 100 | """ 101 | 102 | config_class = CamembertConfig 103 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 104 | 105 | 106 | @add_start_docstrings( 107 | """CamemBERT Model with a token classification head on top (a linear layer on top of 108 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 109 | CAMEMBERT_START_DOCSTRING, 110 | ) 111 | class TFCamembertForTokenClassification(TFRobertaForTokenClassification): 112 | """ 113 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the 114 | superclass for the appropriate documentation alongside usage examples. 115 | """ 116 | 117 | config_class = CamembertConfig 118 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP 119 | -------------------------------------------------------------------------------- /src/transformers/modeling_tf_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ TF 2.0 XLM-RoBERTa model. """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_xlm_roberta import XLMRobertaConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_tf_roberta import ( 24 | TFRobertaForMaskedLM, 25 | TFRobertaForSequenceClassification, 26 | TFRobertaForTokenClassification, 27 | TFRobertaModel, 28 | ) 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {} 34 | 35 | 36 | XLM_ROBERTA_START_DOCSTRING = r""" 37 | 38 | .. note:: 39 | 40 | TF 2.0 models accepts two formats as inputs: 41 | 42 | - having all inputs as keyword arguments (like PyTorch models), or 43 | - having all inputs as a list, tuple or dict in the first positional arguments. 44 | 45 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having 46 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 
47 | 48 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors 49 | in the first positional argument : 50 | 51 | - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` 52 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: 53 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` 54 | - a dictionary with one or several input Tensors associated to the input names given in the docstring: 55 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` 56 | 57 | Parameters: 58 | config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the 59 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 60 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 61 | """ 62 | 63 | 64 | @add_start_docstrings( 65 | "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", 66 | XLM_ROBERTA_START_DOCSTRING, 67 | ) 68 | class TFXLMRobertaModel(TFRobertaModel): 69 | """ 70 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the 71 | superclass for the appropriate documentation alongside usage examples. 72 | """ 73 | 74 | config_class = XLMRobertaConfig 75 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 76 | 77 | 78 | @add_start_docstrings( 79 | """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, 80 | ) 81 | class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM): 82 | """ 83 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the 84 | superclass for the appropriate documentation alongside usage examples. 85 | """ 86 | 87 | config_class = XLMRobertaConfig 88 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 89 | 90 | 91 | @add_start_docstrings( 92 | """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 93 | on top of the pooled output) e.g. for GLUE tasks. """, 94 | XLM_ROBERTA_START_DOCSTRING, 95 | ) 96 | class TFXLMRobertaForSequenceClassification(TFRobertaForSequenceClassification): 97 | """ 98 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the 99 | superclass for the appropriate documentation alongside usage examples. 100 | """ 101 | 102 | config_class = XLMRobertaConfig 103 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 104 | 105 | 106 | @add_start_docstrings( 107 | """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of 108 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, 109 | XLM_ROBERTA_START_DOCSTRING, 110 | ) 111 | class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification): 112 | """ 113 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the 114 | superclass for the appropriate documentation alongside usage examples. 
115 | """ 116 | 117 | config_class = XLMRobertaConfig 118 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 119 | -------------------------------------------------------------------------------- /src/transformers/tokenization_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_roberta import RobertaTokenizer 17 | 18 | 19 | # vocab and merges same as roberta 20 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" 21 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" 22 | _all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"] 23 | 24 | 25 | class BartTokenizer(RobertaTokenizer): 26 | # merges and vocab same as Roberta 27 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 28 | pretrained_vocab_files_map = { 29 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 30 | "merges_file": {m: merges_url for m in _all_bart_models}, 31 | } 32 | -------------------------------------------------------------------------------- /src/transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
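# A short, illustrative usage sketch for the DistilBERT tokenizer declared below.
# The checkpoint name is one of the entries in PRETRAINED_VOCAB_FILES_MAP; the
# sentence is an arbitrary example.
#
#     from transformers import DistilBertTokenizer
#
#     tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
#     tokens = tokenizer.tokenize("Hello, cross-lingual world!")                    # wordpiece tokens
#     ids = tokenizer.encode("Hello, cross-lingual world!", add_special_tokens=True)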
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | 18 | import logging 19 | 20 | from .tokenization_bert import BertTokenizer, BertTokenizerFast 21 | 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 30 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 31 | "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 32 | "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 33 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", 34 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 35 | } 36 | } 37 | 38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 39 | "distilbert-base-uncased": 512, 40 | "distilbert-base-uncased-distilled-squad": 512, 41 | "distilbert-base-cased": 512, 42 | "distilbert-base-cased-distilled-squad": 512, 43 | "distilbert-base-german-cased": 512, 44 | "distilbert-base-multilingual-cased": 512, 45 | } 46 | 47 | 48 | PRETRAINED_INIT_CONFIGURATION = { 49 | "distilbert-base-uncased": {"do_lower_case": True}, 50 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, 51 | "distilbert-base-cased": {"do_lower_case": False}, 52 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, 53 | "distilbert-base-german-cased": {"do_lower_case": False}, 54 | "distilbert-base-multilingual-cased": {"do_lower_case": False}, 55 | } 56 | 57 | 58 | class DistilBertTokenizer(BertTokenizer): 59 | r""" 60 | Constructs a DistilBertTokenizer. 61 | :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 62 | tokenization: punctuation splitting + wordpiece. 63 | 64 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 65 | parameters. 66 | """ 67 | 68 | vocab_files_names = VOCAB_FILES_NAMES 69 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 70 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 71 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 72 | 73 | 74 | class DistilBertTokenizerFast(BertTokenizerFast): 75 | vocab_files_names = VOCAB_FILES_NAMES 76 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 77 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 78 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 79 | -------------------------------------------------------------------------------- /src/transformers/utils_encoder_decoder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Classes to support Encoder-Decoder architectures """ 16 | 17 | 18 | def prepare_encoder_decoder_model_kwargs(**kwargs): 19 | """ Prepare the encoder and decoder's keyword arguments. 20 | 21 | Keyword arguments come in 3 flavors: 22 | - encoder-specific (prefixed by `encoder_`) 23 | - decoder-specific (prefixed by `decoder_`) 24 | - those that apply to the model as whole. 25 | 26 | We let the specific kwargs override the common ones in case of 27 | conflict. 28 | """ 29 | 30 | kwargs_common = { 31 | argument: value 32 | for argument, value in kwargs.items() 33 | if not argument.startswith("encoder_") and not argument.startswith("decoder_") 34 | } 35 | if "input_ids" in kwargs_common: 36 | kwargs["encoder_input_ids"] = kwargs_common.pop("input_ids") 37 | 38 | decoder_kwargs = kwargs_common.copy() 39 | encoder_kwargs = kwargs_common.copy() 40 | encoder_kwargs.update( 41 | {argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")} 42 | ) 43 | decoder_kwargs.update( 44 | {argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")} 45 | ) 46 | decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None) 47 | return encoder_kwargs, decoder_kwargs 48 | -------------------------------------------------------------------------------- /src/ud-conversion-tools/conllu_to_conll.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import islice 3 | from pathlib import Path 4 | import argparse 5 | import sys, copy 6 | 7 | from lib.conll import CoNLLReader 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser(description="""Convert conllu to conll format""") 11 | parser.add_argument('input', help="conllu file") 12 | parser.add_argument('output', help="target file", type=Path) 13 | parser.add_argument('--replace_subtokens_with_fused_forms', help="By default removes fused tokens", default=False, action="store_true") 14 | parser.add_argument('--remove_deprel_suffixes', help="Restrict deprels to the common universal subset, e.g. 
nmod:tmod becomes nmod", default=False, action="store_true") 15 | parser.add_argument('--remove_node_properties', help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats", choices=['form', 'lemma', 'cpostag','postag','feats'], metavar='prop', type=str, nargs='+') 16 | parser.add_argument('--lang', help="specify a language 2-letter code", default="default") 17 | parser.add_argument('--output_format', choices=['conll2006', 'conll2009', 'conllu'], default="conll2006") 18 | parser.add_argument('--remove_arabic_diacritics', help="remove Arabic short vowels", default=False, action="store_true") 19 | parser.add_argument('--print_comments',default=False,action="store_true") 20 | parser.add_argument('--print_fused_forms',default=False,action="store_true") 21 | 22 | args = parser.parse_args() 23 | 24 | if sys.version_info < (3,0): 25 | print("Sorry, requires Python 3.x.") #suggestion: install anaconda python 26 | sys.exit(1) 27 | 28 | POSRANKPRECEDENCEDICT = defaultdict(list) 29 | POSRANKPRECEDENCEDICT["default"] = "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CCONJ SCONJ X PUNCT ".split(" ") 30 | # POSRANKPRECEDENCEDICT["de"] = "PROPN ADP DET ".split(" ") 31 | POSRANKPRECEDENCEDICT["es"] = "VERB AUX PRON ADP DET".split(" ") 32 | POSRANKPRECEDENCEDICT["fr"] = "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(" ") 33 | POSRANKPRECEDENCEDICT["it"] = "VERB AUX ADV PRON ADP DET INTJ".split(" ") 34 | 35 | if args.lang in POSRANKPRECEDENCEDICT: 36 | current_pos_precedence_list = POSRANKPRECEDENCEDICT[args.lang] 37 | else: 38 | current_pos_precedence_list = POSRANKPRECEDENCEDICT["default"] 39 | 40 | cio = CoNLLReader() 41 | orig_treebank = cio.read_conll_u(args.input)#, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT) 42 | modif_treebank = copy.copy(orig_treebank) 43 | 44 | # As per Dec 2015 the args.lang variable is redundant once you have current_pos_precedence_list 45 | # We keep it for future modifications, i.e. 
any language-specific modules 46 | for s in modif_treebank: 47 | # print('sentence', s.get_sentence_as_string(printid=True)) 48 | s.filter_sentence_content(args.replace_subtokens_with_fused_forms, args.lang, current_pos_precedence_list,args.remove_node_properties,args.remove_deprel_suffixes,args.remove_arabic_diacritics) 49 | 50 | cio.write_conll(modif_treebank,args.output, args.output_format,print_fused_forms=args.print_fused_forms, print_comments=args.print_comments) 51 | 52 | if __name__ == "__main__": 53 | main() -------------------------------------------------------------------------------- /src/ud-conversion-tools/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/ud-conversion-tools/lib/__init__.py -------------------------------------------------------------------------------- /transformers-cli: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.convert import ConvertCommand 5 | from transformers.commands.download import DownloadCommand 6 | from transformers.commands.env import EnvironmentCommand 7 | from transformers.commands.run import RunCommand 8 | from transformers.commands.serving import ServeCommand 9 | from transformers.commands.user import UserCommands 10 | 11 | if __name__ == '__main__': 12 | parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]') 13 | commands_parser = parser.add_subparsers(help='transformers-cli command helpers') 14 | 15 | # Register commands 16 | ConvertCommand.register_subcommand(commands_parser) 17 | DownloadCommand.register_subcommand(commands_parser) 18 | EnvironmentCommand.register_subcommand(commands_parser) 19 | RunCommand.register_subcommand(commands_parser) 20 | ServeCommand.register_subcommand(commands_parser) 21 | UserCommands.register_subcommand(commands_parser) 22 | 23 | # Let's go 24 | args = parser.parse_args() 25 | 26 | if not hasattr(args, 'func'): 27 | parser.print_help() 28 | exit(1) 29 | 30 | # Run 31 | service = args.func(args) 32 | service.run() 33 | --------------------------------------------------------------------------------
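The conversion scripts above all end by writing a plain PyTorch state dict (and, for several of them, a JSON config). As a minimal illustration, a converted BERT dump can be loaded back with the classes used in convert_bert_original_tf_checkpoint_to_pytorch.py; the file paths below are placeholders, not artifacts shipped with this repository.

import torch

from transformers import BertConfig, BertForPreTraining

# Rebuild the model skeleton from the config that was used during conversion (placeholder path).
config = BertConfig.from_json_file("/path/to/bert_config.json")
model = BertForPreTraining(config)

# Load the weights written by convert_tf_checkpoint_to_pytorch (placeholder path).
state_dict = torch.load("/path/to/pytorch_model.bin", map_location="cpu")
model.load_state_dict(state_dict)
model.eval()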