├── .idea
│   ├── .gitignore
│   ├── deployment.xml
│   ├── encodings.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── remote-mappings.xml
│   ├── sshConfigs.xml
│   ├── vcs.xml
│   ├── webServers.xml
│   └── xTune.iml
├── README.md
├── scripts
│   ├── cross-lingual-transfer
│   │   ├── train_mlqa.sh
│   │   ├── train_panx.sh
│   │   ├── train_pawsx.sh
│   │   ├── train_tydiqa.sh
│   │   ├── train_udpos.sh
│   │   ├── train_xnli.sh
│   │   └── train_xquad.sh
│   ├── download_data.sh
│   ├── download_model.sh
│   ├── preprocess_panx.sh
│   ├── preprocess_udpos.sh
│   ├── train.sh
│   └── translate-train-all
│       ├── train_mlqa.sh
│       ├── train_panx.sh
│       ├── train_pawsx.sh
│       ├── train_tydiqa.sh
│       ├── train_udpos.sh
│       ├── train_xnli.sh
│       └── train_xquad.sh
├── setup.py
├── src
│   ├── pequod
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   └── __init__.cpython-37.pyc
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   ├── sampler.cpython-37.pyc
│   │   │   │   ├── utils_squad.cpython-37.pyc
│   │   │   │   ├── utils_squad_evaluate.cpython-37.pyc
│   │   │   │   ├── xdoc.cpython-37.pyc
│   │   │   │   ├── xqa.cpython-37.pyc
│   │   │   │   └── xretrieval.cpython-37.pyc
│   │   │   ├── dataloader.py
│   │   │   ├── sampler.py
│   │   │   ├── utils_squad.py
│   │   │   ├── utils_squad_evaluate.py
│   │   │   ├── wili.py
│   │   │   ├── xdoc.py
│   │   │   ├── xqa.py
│   │   │   └── xretrieval.py
│   │   ├── eval
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   ├── bretrieval.cpython-37.pyc
│   │   │   │   ├── bucc_eval.cpython-37.pyc
│   │   │   │   ├── evaluator.cpython-37.pyc
│   │   │   │   ├── utils_retrieve.cpython-37.pyc
│   │   │   │   └── xretrieval.cpython-37.pyc
│   │   │   ├── bretrieval.py
│   │   │   ├── evaluator.py
│   │   │   ├── utils_retrieve.py
│   │   │   └── xretrieval.py
│   │   ├── io.py
│   │   ├── model
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   └── roberta.cpython-37.pyc
│   │   │   └── roberta.py
│   │   ├── optim
│   │   │   ├── __init__.py
│   │   │   ├── la.py
│   │   │   └── la0.py
│   │   ├── text
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   └── tokenization_sentencepiece.cpython-37.pyc
│   │   │   └── tokenization_sentencepiece.py
│   │   ├── tools
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   └── convert.cpython-37.pyc
│   │   │   └── convert.py
│   │   └── training
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-37.pyc
│   │       │   └── trainer.cpython-37.pyc
│   │       ├── trainer.py
│   │       └── xtrainer.py
│   ├── run_cls.py
│   ├── run_qa.py
│   ├── run_tag.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── check_many2many_alignment.py
│   │   ├── dump_hf_state_dict.py
│   │   ├── get_eval_results.py
│   │   ├── sample_xnli.py
│   │   └── xnli_sampling_statistics.py
│   ├── transformers
│   │   ├── __init__.py
│   │   ├── activations.py
│   │   ├── commands
│   │   │   ├── __init__.py
│   │   │   ├── convert.py
│   │   │   ├── download.py
│   │   │   ├── env.py
│   │   │   ├── run.py
│   │   │   ├── serving.py
│   │   │   ├── train.py
│   │   │   └── user.py
│   │   ├── configuration_albert.py
│   │   ├── configuration_auto.py
│   │   ├── configuration_bart.py
│   │   ├── configuration_bert.py
│   │   ├── configuration_camembert.py
│   │   ├── configuration_ctrl.py
│   │   ├── configuration_distilbert.py
│   │   ├── configuration_flaubert.py
│   │   ├── configuration_gpt2.py
│   │   ├── configuration_mmbt.py
│   │   ├── configuration_openai.py
│   │   ├── configuration_roberta.py
│   │   ├── configuration_t5.py
│   │   ├── configuration_transfo_xl.py
│   │   ├── configuration_utils.py
│   │   ├── configuration_xlm.py
│   │   ├── configuration_xlm_roberta.py
│   │   ├── configuration_xlnet.py
│   │   ├── convert_albert_original_tf_checkpoint_to_pytorch.py
│   │   ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py
│   │   ├── convert_bert_original_tf_checkpoint_to_pytorch.py
│   │   ├── convert_bert_pytorch_checkpoint_to_original_tf.py
│   │   ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py
│   │   ├── convert_openai_original_tf_checkpoint_to_pytorch.py
│   │   ├── convert_pytorch_checkpoint_to_tf2.py
│   │   ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py
│   │   ├── convert_t5_original_tf_checkpoint_to_pytorch.py
│   │   ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
│   │   ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
│   │   ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── metrics
│   │   │   │   ├── __init__.py
│   │   │   │   ├── evaluate_mlqa.py
│   │   │   │   ├── evaluate_squad.py
│   │   │   │   ├── mlqa_evaluation_v1.py
│   │   │   │   └── squad_metrics.py
│   │   │   └── processors
│   │   │       ├── __init__.py
│   │   │       ├── glue.py
│   │   │       ├── squad.py
│   │   │       ├── utils.py
│   │   │       ├── xglue.py
│   │   │       ├── xnli.py
│   │   │       └── xtreme.py
│   │   ├── file_utils.py
│   │   ├── hf_api.py
│   │   ├── modelcard.py
│   │   ├── modeling_albert.py
│   │   ├── modeling_auto.py
│   │   ├── modeling_bart.py
│   │   ├── modeling_bert.py
│   │   ├── modeling_camembert.py
│   │   ├── modeling_ctrl.py
│   │   ├── modeling_distilbert.py
│   │   ├── modeling_encoder_decoder.py
│   │   ├── modeling_flaubert.py
│   │   ├── modeling_gpt2.py
│   │   ├── modeling_mmbt.py
│   │   ├── modeling_openai.py
│   │   ├── modeling_roberta.py
│   │   ├── modeling_t5.py
│   │   ├── modeling_tf_albert.py
│   │   ├── modeling_tf_auto.py
│   │   ├── modeling_tf_bert.py
│   │   ├── modeling_tf_camembert.py
│   │   ├── modeling_tf_ctrl.py
│   │   ├── modeling_tf_distilbert.py
│   │   ├── modeling_tf_gpt2.py
│   │   ├── modeling_tf_openai.py
│   │   ├── modeling_tf_pytorch_utils.py
│   │   ├── modeling_tf_roberta.py
│   │   ├── modeling_tf_t5.py
│   │   ├── modeling_tf_transfo_xl.py
│   │   ├── modeling_tf_transfo_xl_utilities.py
│   │   ├── modeling_tf_utils.py
│   │   ├── modeling_tf_xlm.py
│   │   ├── modeling_tf_xlm_roberta.py
│   │   ├── modeling_tf_xlnet.py
│   │   ├── modeling_transfo_xl.py
│   │   ├── modeling_transfo_xl_utilities.py
│   │   ├── modeling_utils.py
│   │   ├── modeling_xlm.py
│   │   ├── modeling_xlm_roberta.py
│   │   ├── modeling_xlnet.py
│   │   ├── optimization.py
│   │   ├── optimization_tf.py
│   │   ├── pipelines.py
│   │   ├── tokenization_albert.py
│   │   ├── tokenization_auto.py
│   │   ├── tokenization_bart.py
│   │   ├── tokenization_bert.py
│   │   ├── tokenization_bert_japanese.py
│   │   ├── tokenization_camembert.py
│   │   ├── tokenization_ctrl.py
│   │   ├── tokenization_distilbert.py
│   │   ├── tokenization_flaubert.py
│   │   ├── tokenization_gpt2.py
│   │   ├── tokenization_openai.py
│   │   ├── tokenization_roberta.py
│   │   ├── tokenization_t5.py
│   │   ├── tokenization_transfo_xl.py
│   │   ├── tokenization_utils.py
│   │   ├── tokenization_xlm.py
│   │   ├── tokenization_xlm_roberta.py
│   │   ├── tokenization_xlnet.py
│   │   └── utils_encoder_decoder.py
│   ├── ud-conversion-tools
│   │   ├── conllu_to_conll.py
│   │   └── lib
│   │       ├── __init__.py
│   │       └── conll.py
│   └── utils_tag.py
├── transformers-cli
└── utils_preprocess.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /../../../../../../:\Users\v-zhebo\OneDrive - Microsoft\stabletune\.idea/dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/deployment.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/remote-mappings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/sshConfigs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/webServers.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/xTune.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # xTune
2 |
3 | Code for the ACL 2021 paper [Consistency Regularization for Cross-Lingual Fine-Tuning](https://arxiv.org/pdf/2106.08226.pdf).
4 | ## Environment
5 |
6 | DockerFile: `dancingsoul/pytorch:xTune`
7 |
8 | Install the fine-tuning code: `pip install --user .`
9 |
10 | ## Data & Model Preparation
11 |
12 | ### XTREME Datasets
13 |
14 | 1) Create a download folder with `mkdir -p download` in the root of this project.
15 | 2) Manually download `panx_dataset` (for NER) from [here][2] (note that it will download as `AmazonPhotos.zip`) to the download directory.
16 | 3) Run the following command to download the remaining datasets: `bash scripts/download_data.sh`.
17 | The code for downloading the XTREME datasets is adapted from the [xtreme official repo][1]; a combined command sequence is sketched below.
18 |
19 | Note that we keep the labels in the test sets for easier evaluation. To prevent accidental evaluation on the test sets while running experiments, the [xtreme official repo][1] removes the test labels during pre-processing and shuffles the order of the test sentences for cross-lingual sentence retrieval.
20 | If you use the official XTREME repo, replace `csv.writer(fout, delimiter='\t')` with `csv.writer(fout, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='')` in utils_process.py.
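
Putting steps 1-3 together, a minimal download sequence (run from the project root, with the manually fetched `AmazonPhotos.zip` already placed in `download/`) might look like:

```bash
mkdir -p download
# panx_dataset (AmazonPhotos.zip) must already be in ./download before this step
bash scripts/download_data.sh
```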
21 |
22 | ### Translations
23 |
24 | XTREME provides translations for SQuAD v1.1 (only train and dev), MLQA, PAWS-X, TyDiQA-GoldP, XNLI, and XQuAD, which can be downloaded from [here][3]. The `xtreme_translations` folder should be moved to the download directory.
25 |
26 | The target-language translations for panx and udpos are not provided by XTREME, so we obtained them with Google Translate. Our processed version can be downloaded from [here][4] and should be merged into the `xtreme_translations` folder above.
27 |
28 | ### Bi-lingual dictionaries
29 |
30 | We obtain the bi-lingual dictionaries from the [MUSE][6] repo. For convenience, you can download them from [here][7] and move them to the download directory, i.e., `./download/dicts`.
31 |
32 | ### Models
33 |
34 | XLM-RoBERTa is supported. We use the [huggingface][5] format, which can be downloaded with `bash scripts/download_model.sh`.
35 |
36 | ## Fine-tuning Usage
37 |
38 | Our default settings assume Nvidia V100-32GB GPU cards. If you run into out-of-memory errors, you can reduce `per_gpu_train_batch_size` while increasing `gradient_accumulation_steps`, or use multi-GPU training.
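
For instance, with `xlm-roberta-base` the MLQA script defaults to `BATCH_SIZE=32` and `GRAD_ACC=1`; one way to roughly halve per-GPU memory while keeping the effective batch size is to adjust these two variables inside the corresponding `scripts/*/train_*.sh` (illustrative values):

```bash
# effective batch size = BATCH_SIZE x GRAD_ACC stays at 32
BATCH_SIZE=16
GRAD_ACC=2
```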
39 |
40 | xTune consists of a two-stage training process.
41 | - Stage 1: fine-tuning with example consistency on the English training set.
42 | - Stage 2: fine-tuning with example consistency on the augmented training set while regularizing model consistency against the Stage 1 model.
43 |
44 | We recommend using both Stage 1 and Stage 2 for token-level tasks such as sequence labeling and question answering. For text classification, you can use only Stage 1 if the computation budget is limited.
45 |
46 | ```bash
47 | bash ./scripts/train.sh [setting] [dataset] [model] [stage] [gpu] [data_dir] [output_dir]
48 | ```
49 | where the options are described as follows (a fully spelled-out invocation is shown after the list):
50 | - `[setting]`: `translate-train-all` (using input translation for the languages other than English) or `cross-lingual-transfer` (only using English for zero-shot cross-lingual transfer)
51 | - `[dataset]`: dataset names in XTREME, i.e., `xnli`, `panx`, `pawsx`, `udpos`, `mlqa`, `tydiqa`, `xquad`
52 | - `[model]`: `xlm-roberta-base`, `xlm-roberta-large`
53 | - `[stage]`: `1` (first stage), `2` (second stage)
54 | - `[gpu]`: used to set environment variable `CUDA_VISIBLE_DEVICES`
55 | - `[data_dir]`: folder of training data
56 | - `[output_dir]`: folder of fine-tuning output
57 |
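For example, a fully spelled-out Stage 1 invocation for `panx` under the zero-shot setting, using GPU 0 and the default data/output locations assumed by the scripts (`./download/`, `./outputs/`), might look like:

```bash
bash ./scripts/train.sh cross-lingual-transfer panx xlm-roberta-base 1 0 ./download/ ./outputs/
```
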
58 | ## Examples: XTREME Tasks
59 |
60 | ### XNLI fine-tuning on English training set and translated training sets (`translate-train-all`)
61 |
62 | ```bash
63 | # run stage 1 of xTune
64 | bash ./scripts/train.sh translate-train-all xnli xlm-roberta-base 1
65 | # run stage 2 of xTune (optional)
66 | bash ./scripts/train.sh translate-train-all xnli xlm-roberta-base 2
67 | ```
68 |
69 | ### XNLI fine-tuning on English training set (`cross-lingual-transfer`)
70 |
71 | ```bash
72 | # run stage 1 of xTune
73 | bash ./scripts/train.sh cross-lingual-transfer xnli xlm-roberta-base 1
74 | # run stage 2 of xTune (optional)
75 | bash ./scripts/train.sh cross-lingual-transfer xnli xlm-roberta-base 2
76 | ```
77 |
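### UDPOS fine-tuning on English training set (`cross-lingual-transfer`)

As a further illustration, the same pattern applies to the token-level `udpos` task, where both stages are recommended:

```bash
# run stage 1 of xTune
bash ./scripts/train.sh cross-lingual-transfer udpos xlm-roberta-base 1
# run stage 2 of xTune
bash ./scripts/train.sh cross-lingual-transfer udpos xlm-roberta-base 2
```
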
78 | ## Paper
79 | Please cite our paper `\cite{bo2021xtune}` if you find the resources in this repository useful.
80 |
81 | ```
82 | @inproceedings{bo2021xtune,
83 | author = {Bo Zheng and Li Dong and Shaohan Huang and Wenhui Wang and Zewen Chi and Saksham Singhal and Wanxiang Che and Ting Liu and Xia Song and Furu Wei},
84 | booktitle = {Proceedings of ACL 2021},
85 | title = {{Consistency Regularization for Cross-Lingual Fine-Tuning}},
86 | year = {2021}
87 | }
88 | ```
89 |
90 | ## Reference
91 |
92 | 1. https://github.com/google-research/xtreme
93 | 2. https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN?_encoding=UTF8&%2AVersion%2A=1&%2Aentries%2A=0&mgh=1
94 | 3. https://console.cloud.google.com/storage/browser/xtreme_translations
95 | 4. https://drive.google.com/drive/folders/1Rdbc0Us_4I5MpRCwLASxBwqSW8_dlF87?usp=sharing
96 | 5. https://github.com/huggingface/transformers/
97 | 6. https://github.com/facebookresearch/MUSE
98 | 7. https://drive.google.com/drive/folders/1k9rQinwUXicglA5oyzo9xtgqiuUVDkjT?usp=sharing
99 |
--------------------------------------------------------------------------------
/scripts/cross-lingual-transfer/train_mlqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 |
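# MLQA fine-tunes on the English SQuAD v1.1 training data; copy it into the MLQA data folder.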
27 | cp -r $DATA_DIR/squad/ $DATA_DIR/mlqa/squad1.1/
28 |
29 | TASK='mlqa'
30 |
31 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/
32 | MODEL_PATH=$DATA_DIR/$MODEL
33 |
34 | EPOCH=4
35 | MAXL=384
36 | LANGS="en,es,de,ar,hi,vi,zh"
37 | BSR=0.3
38 | SA=0.3
39 | SNBS=-1
40 | CSR=0.3
41 | R1_LAMBDA=5.0
42 | R2_LAMBDA=5.0
43 | if [ $MODEL == "xlm-roberta-large" ]; then
44 | BATCH_SIZE=4
45 | GRAD_ACC=8
46 | LR=1.5e-5
47 | else
48 | BATCH_SIZE=32
49 | GRAD_ACC=1
50 | LR=3e-5
51 | fi
52 |
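# Stage 1 (see README): fine-tune with the example-consistency (R1) loss, using
# code-switched English training data as the noised input.
# Stage 2: additionally enable data augmentation (--augment_method ss, subword sampling)
# and the model-consistency (R2) loss against the Stage 1 model.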
53 | if [ $STAGE == 1 ]; then
54 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/"
55 | python ./src/run_qa.py --model_type xlmr \
56 | --task_name $TASK \
57 | --model_name_or_path $MODEL_PATH \
58 | --do_train \
59 | --do_eval \
60 | --language $LANGS \
61 | --train_language en \
62 | --data_dir $DATA_DIR/$TASK/ \
63 | --per_gpu_train_batch_size $BATCH_SIZE \
64 | --gradient_accumulation_steps $GRAD_ACC \
65 | --per_gpu_eval_batch_size 128 \
66 | --learning_rate $LR \
67 | --num_train_epochs $EPOCH \
68 | --save_steps 0 \
69 | --logging_each_epoch \
70 | --max_seq_length $MAXL \
71 | --doc_stride 128 \
72 | --output_dir $OUTPUT_DIR \
73 | --overwrite_output_dir \
74 | --evaluate_during_training \
75 | --logging_steps 50 \
76 | --evaluate_steps 0 \
77 | --seed $SEED \
78 | --fp16 --fp16_opt_level O2 \
79 | --warmup_steps -1 \
80 | --enable_r1_loss \
81 | --r1_lambda $R1_LAMBDA \
82 | --original_loss \
83 | --overall_ratio 1.0 \
84 | --keep_boundary_unchanged \
85 | --enable_code_switch \
86 | --code_switch_ratio $CSR \
87 | --dict_dir $DATA_DIR/dicts \
88 | --dict_languages es,de,ar,hi,vi,zh \
89 | --noised_max_seq_length $MAXL
90 | elif [ $STAGE == 2 ]; then
91 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/"
92 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/"
93 | python ./src/run_qa.py --model_type xlmr \
94 | --task_name $TASK \
95 | --model_name_or_path $MODEL_PATH \
96 | --do_train \
97 | --do_eval \
98 | --language $LANGS \
99 | --train_language en \
100 | --data_dir $DATA_DIR/$TASK/ \
101 | --per_gpu_train_batch_size $BATCH_SIZE \
102 | --gradient_accumulation_steps $GRAD_ACC \
103 | --per_gpu_eval_batch_size 128 \
104 | --learning_rate $LR \
105 | --num_train_epochs $EPOCH \
106 | --save_steps 0 \
107 | --logging_each_epoch \
108 | --max_seq_length $MAXL \
109 | --doc_stride 128 \
110 | --output_dir $OUTPUT_DIR \
111 | --overwrite_output_dir \
112 | --evaluate_during_training \
113 | --logging_steps 50 \
114 | --evaluate_steps 0 \
115 | --seed $SEED \
116 | --fp16 --fp16_opt_level O2 \
117 | --warmup_steps -1 \
118 | --enable_r1_loss \
119 | --r1_lambda $R1_LAMBDA \
120 | --original_loss \
121 | --overall_ratio 1.0 \
122 | --keep_boundary_unchanged \
123 | --enable_bpe_sampling \
124 | --bpe_sampling_ratio $BSR \
125 | --sampling_alpha $SA \
126 | --sampling_nbest_size $SNBS \
127 | --noised_max_seq_length $MAXL \
128 | --enable_data_augmentation \
129 | --augment_ratio 1.0 \
130 | --augment_method ss \
131 | --max_steps 24000 \
132 | --r2_lambda $R2_LAMBDA \
133 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH
134 | fi
135 |
136 |
137 |
--------------------------------------------------------------------------------
/scripts/cross-lingual-transfer/train_panx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | TASK='panx'
27 | MODEL_PATH=$DATA_DIR/$MODEL
28 | EPOCH=10
29 | MAX_LENGTH=128
30 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu"
31 | EVALUATE_STEPS=1000
32 | BSR=0.3
33 | SA=0.3
34 | SNBS=-1
35 | R1_LAMBDA=5.0
36 | R2_LAMBDA=5.0
37 | if [ $MODEL == "xlm-roberta-large" ]; then
38 | BATCH_SIZE=32
39 | GRAD_ACC=1
40 | LR=7e-6
41 | else
42 | BATCH_SIZE=32
43 | GRAD_ACC=1
44 | LR=1e-5
45 | fi
46 |
47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.panx.txt
48 |
49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/
50 |
51 |
52 | if [ $STAGE == 1 ]; then
53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/"
54 | python src/run_tag.py --model_type xlmr \
55 | --model_name_or_path $MODEL_PATH \
56 | --do_train \
57 | --do_eval \
58 | --do_predict \
59 | --do_predict_dev \
60 | --predict_langs $LANGS \
61 | --train_langs en \
62 | --data_dir $DATA_DIR \
63 | --labels $DATA_DIR/labels.txt \
64 | --per_gpu_train_batch_size $BATCH_SIZE \
65 | --gradient_accumulation_steps $GRAD_ACC \
66 | --per_gpu_eval_batch_size 128 \
67 | --learning_rate $LR \
68 | --num_train_epochs $EPOCH \
69 | --max_seq_length $MAX_LENGTH \
70 | --noised_max_seq_length $MAX_LENGTH \
71 | --output_dir $OUTPUT_DIR \
72 | --overwrite_output_dir \
73 | --evaluate_during_training \
74 | --logging_steps 50 \
75 | --evaluate_steps $EVALUATE_STEPS \
76 | --seed $SEED \
77 | --warmup_steps -1 \
78 | --save_only_best_checkpoint \
79 | --eval_all_checkpoints \
80 | --eval_patience -1 \
81 | --fp16 --fp16_opt_level O2 \
82 | --hidden_dropout_prob 0.1 \
83 | --original_loss \
84 | --enable_r1_loss \
85 | --r1_lambda $R1_LAMBDA \
86 | --use_token_label_probs \
87 | --enable_bpe_sampling \
88 | --bpe_sampling_ratio $BSR \
89 | --sampling_alpha $SA \
90 | --sampling_nbest_size $SNBS
91 | elif [ $STAGE == 2 ]; then
92 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best"
93 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/"
94 | python src/run_tag.py --model_type xlmr \
95 | --model_name_or_path $MODEL_PATH \
96 | --do_train \
97 | --do_eval \
98 | --do_predict \
99 | --do_predict_dev \
100 | --predict_langs $LANGS \
101 | --train_langs en \
102 | --data_dir $DATA_DIR \
103 | --labels $DATA_DIR/labels.txt \
104 | --per_gpu_train_batch_size $BATCH_SIZE \
105 | --gradient_accumulation_steps $GRAD_ACC \
106 | --per_gpu_eval_batch_size 128 \
107 | --learning_rate $LR \
108 | --num_train_epochs $EPOCH \
109 | --max_seq_length $MAX_LENGTH \
110 | --noised_max_seq_length $MAX_LENGTH \
111 | --output_dir $OUTPUT_DIR \
112 | --overwrite_output_dir \
113 | --evaluate_during_training \
114 | --logging_steps 50 \
115 | --evaluate_steps $EVALUATE_STEPS \
116 | --seed $SEED \
117 | --warmup_steps -1 \
118 | --save_only_best_checkpoint \
119 | --eval_all_checkpoints \
120 | --eval_patience -1 \
121 | --fp16 --fp16_opt_level O2 \
122 | --hidden_dropout_prob 0.1 \
123 | --original_loss \
124 | --enable_r1_loss \
125 | --r1_lambda $R1_LAMBDA \
126 | --use_token_label_probs \
127 | --enable_bpe_sampling \
128 | --bpe_sampling_ratio $BSR \
129 | --sampling_alpha $SA \
130 | --sampling_nbest_size $SNBS \
131 | --enable_data_augmentation \
132 | --augment_ratio 1.0 \
133 | --augment_method ss \
134 | --r2_lambda $R2_LAMBDA \
135 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \
136 | --use_hard_labels
137 | fi
--------------------------------------------------------------------------------
/scripts/cross-lingual-transfer/train_pawsx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | TASK='pawsx'
27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/PAWSX/
28 | MODEL_PATH=$DATA_DIR/$MODEL
29 | EPOCH=10
30 | MAXL=256
31 | LANGS="de,en,es,fr,ja,ko,zh"
32 | EVALUATE_STEPS=1000
33 | CSR=0.5
34 | R1_LAMBDA=5.0
35 | R2_LAMBDA=2.0
36 | if [ $MODEL == "xlm-roberta-large" ]; then
37 | BATCH_SIZE=16
38 | GRAD_ACC=2
39 | LR=1e-5
40 | else
41 | BATCH_SIZE=32
42 | GRAD_ACC=1
43 | LR=1e-5
44 | fi
45 |
46 | if [ $STAGE == 1 ]; then
47 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/"
48 | mkdir -p $OUTPUT_DIR
49 | python ./src/run_cls.py --model_type xlmr \
50 | --model_name_or_path $MODEL_PATH \
51 | --language $LANGS \
52 | --train_language en \
53 | --do_train \
54 | --data_dir $DATA_DIR/$TASK/ \
55 | --per_gpu_train_batch_size $BATCH_SIZE \
56 | --gradient_accumulation_steps $GRAD_ACC \
57 | --per_gpu_eval_batch_size 64 \
58 | --learning_rate $LR \
59 | --num_train_epochs $EPOCH \
60 | --max_seq_length $MAXL \
61 | --output_dir $OUTPUT_DIR \
62 | --task_name $TASK \
63 | --save_steps -1 \
64 | --overwrite_output_dir \
65 | --evaluate_during_training \
66 | --evaluate_steps $EVALUATE_STEPS \
67 | --logging_steps 50 \
68 | --logging_steps_in_sample -1 \
69 | --logging_each_epoch \
70 | --gpu_id 0 \
71 | --seed $SEED \
72 | --fp16 --fp16_opt_level O2 \
73 | --warmup_steps -1 \
74 | --enable_r1_loss \
75 | --r1_lambda $R1_LAMBDA \
76 | --original_loss \
77 | --overall_ratio 1.0 \
78 | --enable_code_switch \
79 | --code_switch_ratio $CSR \
80 | --dict_dir $DATA_DIR/dicts \
81 | --dict_languages de,es,fr,ja,ko,zh
82 | elif [ $STAGE == 2 ]; then
83 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best"
84 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_Lambda${R1_LAMBDA}-Aug1.0-CS-R2_Lambda${R2_LAMBDA}/"
85 | mkdir -p $OUTPUT_DIR
86 | python ./src/run_cls.py --model_type xlmr \
87 | --model_name_or_path $MODEL_PATH \
88 | --language $LANGS \
89 | --train_language en \
90 | --do_train \
91 | --data_dir $DATA_DIR/$TASK/ \
92 | --per_gpu_train_batch_size $BATCH_SIZE \
93 | --gradient_accumulation_steps $GRAD_ACC \
94 | --per_gpu_eval_batch_size 64 \
95 | --learning_rate $LR \
96 | --num_train_epochs $EPOCH \
97 | --max_seq_length $MAXL \
98 | --output_dir $OUTPUT_DIR \
99 | --task_name $TASK \
100 | --save_steps -1 \
101 | --overwrite_output_dir \
102 | --evaluate_during_training \
103 | --evaluate_steps $EVALUATE_STEPS \
104 | --logging_steps 50 \
105 | --logging_steps_in_sample -1 \
106 | --logging_each_epoch \
107 | --gpu_id 0 \
108 | --seed $SEED \
109 | --fp16 --fp16_opt_level O2 \
110 | --warmup_steps -1 \
111 | --enable_r1_loss \
112 | --r1_lambda $R1_LAMBDA \
113 | --original_loss \
114 | --overall_ratio 1.0 \
115 | --enable_code_switch \
116 | --code_switch_ratio $CSR \
117 | --dict_dir $DATA_DIR/dicts \
118 | --dict_languages de,es,fr,ja,ko,zh \
119 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \
120 | --enable_data_augmentation \
121 | --augment_ratio 1.0 \
122 | --augment_method cs \
123 | --r2_lambda $R2_LAMBDA
124 | fi
--------------------------------------------------------------------------------
/scripts/cross-lingual-transfer/train_tydiqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | TASK='tydiqa'
27 | MODEL_PATH=$DATA_DIR/$MODEL
28 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/TyDiQA-GoldP/translate-train/
29 |
30 |
31 | MAXL=384
32 | LANGS="en,ar,bn,fi,id,ko,ru,sw,te"
33 | BSR=0.3
34 | SA=0.3
35 | SNBS=-1
36 | R1_LAMBDA=5.0
37 | R2_LAMBDA=5.0
38 | if [ $MODEL == "xlm-roberta-large" ]; then
39 | BATCH_SIZE=4
40 | GRAD_ACC=8
41 | LR=1.5e-5
42 | EPOCH=10
43 | MAX_STEPS=2500
44 | else
45 | BATCH_SIZE=32
46 | GRAD_ACC=1
47 | LR=3e-5
48 | EPOCH=20
49 | MAX_STEPS=5000
50 | fi
51 |
52 | if [ $STAGE == 1 ]; then
53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/"
54 | python ./src/run_qa.py --model_type xlmr \
55 | --task_name $TASK \
56 | --model_name_or_path $MODEL_PATH \
57 | --do_train \
58 | --do_eval \
59 | --language $LANGS \
60 | --train_language en \
61 | --data_dir $DATA_DIR/$TASK/ \
62 | --per_gpu_train_batch_size $BATCH_SIZE \
63 | --gradient_accumulation_steps $GRAD_ACC \
64 | --per_gpu_eval_batch_size 128 \
65 | --learning_rate $LR \
66 | --num_train_epochs $EPOCH \
67 | --save_steps 0 \
68 | --logging_each_epoch \
69 | --max_seq_length $MAXL \
70 | --doc_stride 128 \
71 | --output_dir $OUTPUT_DIR \
72 | --overwrite_output_dir \
73 | --evaluate_during_training \
74 | --logging_steps 50 \
75 | --evaluate_steps 0 \
76 | --seed $SEED \
77 | --fp16 --fp16_opt_level O2 \
78 | --warmup_steps -1 \
79 | --enable_r1_loss \
80 | --r1_lambda $R1_LAMBDA \
81 | --original_loss \
82 | --overall_ratio 1.0 \
83 | --keep_boundary_unchanged \
84 | --enable_bpe_sampling \
85 | --bpe_sampling_ratio $BSR \
86 | --sampling_alpha $SA \
87 | --sampling_nbest_size $SNBS \
88 | --noised_max_seq_length $MAXL
89 | elif [ $STAGE == 2 ]; then
90 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/"
91 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/"
92 | python ./src/run_qa.py --model_type xlmr \
93 | --task_name $TASK \
94 | --model_name_or_path $MODEL_PATH \
95 | --do_train \
96 | --do_eval \
97 | --language $LANGS \
98 | --train_language en \
99 | --data_dir $DATA_DIR/$TASK/ \
100 | --per_gpu_train_batch_size $BATCH_SIZE \
101 | --gradient_accumulation_steps $GRAD_ACC \
102 | --per_gpu_eval_batch_size 128 \
103 | --learning_rate $LR \
104 | --num_train_epochs $EPOCH \
105 | --save_steps 0 \
106 | --logging_each_epoch \
107 | --max_seq_length $MAXL \
108 | --doc_stride 128 \
109 | --output_dir $OUTPUT_DIR \
110 | --overwrite_output_dir \
111 | --evaluate_during_training \
112 | --logging_steps 50 \
113 | --evaluate_steps 0 \
114 | --seed $SEED \
115 | --fp16 --fp16_opt_level O2 \
116 | --warmup_steps -1 \
117 | --enable_r1_loss \
118 | --r1_lambda $R1_LAMBDA \
119 | --original_loss \
120 | --overall_ratio 1.0 \
121 | --keep_boundary_unchanged \
122 | --enable_bpe_sampling \
123 | --bpe_sampling_ratio $BSR \
124 | --sampling_alpha $SA \
125 | --sampling_nbest_size $SNBS \
126 | --noised_max_seq_length $MAXL \
127 | --enable_data_augmentation \
128 | --augment_ratio 1.0 \
129 | --augment_method ss \
130 | --max_steps $MAX_STEPS \
131 | --r2_lambda $R2_LAMBDA \
132 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH
133 | fi
134 |
135 |
136 |
--------------------------------------------------------------------------------
/scripts/cross-lingual-transfer/train_udpos.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | TASK='udpos'
27 | MODEL_PATH=$DATA_DIR/$MODEL
28 | EPOCH=10
29 | MAX_LENGTH=128
30 | LANGS="af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh"
31 | EVALUATE_STEPS=500
32 | BSR=0.5
33 | SA=0.3
34 | SNBS=-1
35 | R1_LAMBDA=5.0
36 | R2_LAMBDA=0.3
37 | if [ $MODEL == "xlm-roberta-large" ]; then
38 | BATCH_SIZE=32
39 | GRAD_ACC=1
40 | LR=5e-6
41 | else
42 | BATCH_SIZE=32
43 | GRAD_ACC=1
44 | LR=2e-5
45 | fi
46 |
47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.udpos.txt
48 |
49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/
50 |
51 |
52 | if [ $STAGE == 1 ]; then
53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/"
54 | python src/run_tag.py --model_type xlmr \
55 | --model_name_or_path $MODEL_PATH \
56 | --do_train \
57 | --do_eval \
58 | --do_predict \
59 | --do_predict_dev \
60 | --predict_langs $LANGS \
61 | --train_langs en \
62 | --data_dir $DATA_DIR \
63 | --labels $DATA_DIR/labels.txt \
64 | --per_gpu_train_batch_size $BATCH_SIZE \
65 | --gradient_accumulation_steps $GRAD_ACC \
66 | --per_gpu_eval_batch_size 128 \
67 | --learning_rate $LR \
68 | --num_train_epochs $EPOCH \
69 | --max_seq_length $MAX_LENGTH \
70 | --noised_max_seq_length $MAX_LENGTH \
71 | --output_dir $OUTPUT_DIR \
72 | --overwrite_output_dir \
73 | --evaluate_during_training \
74 | --logging_steps 50 \
75 | --evaluate_steps $EVALUATE_STEPS \
76 | --seed $SEED \
77 | --warmup_steps -1 \
78 | --save_only_best_checkpoint \
79 | --eval_all_checkpoints \
80 | --eval_patience -1 \
81 | --fp16 --fp16_opt_level O2 \
82 | --hidden_dropout_prob 0.1 \
83 | --original_loss \
84 | --use_pooling_strategy \
85 | --enable_r1_loss \
86 | --r1_lambda $R1_LAMBDA \
87 | --use_token_label_probs \
88 | --enable_bpe_sampling \
89 | --bpe_sampling_ratio $BSR \
90 | --sampling_alpha $SA \
91 | --sampling_nbest_size $SNBS
92 | elif [ $STAGE == 2 ]; then
93 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best"
94 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/"
95 | python src/run_tag.py --model_type xlmr \
96 | --model_name_or_path $MODEL_PATH \
97 | --do_train \
98 | --do_eval \
99 | --do_predict \
100 | --do_predict_dev \
101 | --predict_langs $LANGS \
102 | --train_langs en \
103 | --data_dir $DATA_DIR \
104 | --labels $DATA_DIR/labels.txt \
105 | --per_gpu_train_batch_size $BATCH_SIZE \
106 | --gradient_accumulation_steps $GRAD_ACC \
107 | --per_gpu_eval_batch_size 128 \
108 | --learning_rate $LR \
109 | --num_train_epochs $EPOCH \
110 | --max_seq_length $MAX_LENGTH \
111 | --noised_max_seq_length $MAX_LENGTH \
112 | --output_dir $OUTPUT_DIR \
113 | --overwrite_output_dir \
114 | --evaluate_during_training \
115 | --logging_steps 50 \
116 | --evaluate_steps $EVALUATE_STEPS \
117 | --seed $SEED \
118 | --warmup_steps -1 \
119 | --save_only_best_checkpoint \
120 | --eval_all_checkpoints \
121 | --eval_patience -1 \
122 | --fp16 --fp16_opt_level O2 \
123 | --hidden_dropout_prob 0.1 \
124 | --original_loss \
125 | --use_pooling_strategy \
126 | --enable_r1_loss \
127 | --r1_lambda $R1_LAMBDA \
128 | --use_token_label_probs \
129 | --enable_bpe_sampling \
130 | --bpe_sampling_ratio $BSR \
131 | --sampling_alpha $SA \
132 | --sampling_nbest_size $SNBS \
133 | --enable_data_augmentation \
134 | --augment_ratio 1.0 \
135 | --augment_method ss \
136 | --r2_lambda $R2_LAMBDA \
137 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH
138 | fi
--------------------------------------------------------------------------------
/scripts/cross-lingual-transfer/train_xnli.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | TASK='xnli'
27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/XNLI/
28 | MODEL_PATH=$DATA_DIR/$MODEL
29 | EPOCH=10
30 | MAXL=256
31 | LANGS="ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh"
32 | EVALUATE_STEPS=5000
33 | CSR=0.3
34 | R1_LAMBDA=5.0
35 | R2_LAMBDA=5.0
36 | if [ $MODEL == "xlm-roberta-large" ]; then
37 | BATCH_SIZE=16
38 | GRAD_ACC=2
39 | LR=5e-6
40 | else
41 | BATCH_SIZE=32
42 | GRAD_ACC=1
43 | LR=7e-6
44 | fi
45 |
46 | if [ $STAGE == 1 ]; then
47 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/"
48 | mkdir -p $OUTPUT_DIR
49 | python ./src/run_cls.py --model_type xlmr \
50 | --model_name_or_path $MODEL_PATH \
51 | --language $LANGS \
52 | --train_language en \
53 | --do_train \
54 | --data_dir $DATA_DIR/$TASK/ \
55 | --per_gpu_train_batch_size $BATCH_SIZE \
56 | --gradient_accumulation_steps $GRAD_ACC \
57 | --per_gpu_eval_batch_size 64 \
58 | --learning_rate $LR \
59 | --num_train_epochs $EPOCH \
60 | --max_seq_length $MAXL \
61 | --output_dir $OUTPUT_DIR \
62 | --task_name $TASK \
63 | --save_steps -1 \
64 | --overwrite_output_dir \
65 | --evaluate_during_training \
66 | --evaluate_steps $EVALUATE_STEPS \
67 | --logging_steps 50 \
68 | --logging_steps_in_sample -1 \
69 | --logging_each_epoch \
70 | --gpu_id 0 \
71 | --seed $SEED \
72 | --fp16 --fp16_opt_level O2 \
73 | --warmup_steps -1 \
74 | --enable_r1_loss \
75 | --r1_lambda $R1_LAMBDA \
76 | --original_loss \
77 | --overall_ratio 1.0 \
78 | --enable_code_switch \
79 | --code_switch_ratio $CSR \
80 | --dict_dir $DATA_DIR/dicts \
81 | --dict_languages ar,bg,de,el,es,fr,hi,ru,sw,th,tr,ur,vi,zh
82 | elif [ $STAGE == 2 ]; then
83 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best"
84 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_Lambda${R1_LAMBDA}-Aug1.0-CS-R2_Lambda${R2_LAMBDA}/"
85 | mkdir -p $OUTPUT_DIR
86 | python ./src/run_cls.py --model_type xlmr \
87 | --model_name_or_path $MODEL_PATH \
88 | --language $LANGS \
89 | --train_language en \
90 | --do_train \
91 | --data_dir $DATA_DIR/$TASK/ \
92 | --per_gpu_train_batch_size $BATCH_SIZE \
93 | --gradient_accumulation_steps $GRAD_ACC \
94 | --per_gpu_eval_batch_size 64 \
95 | --learning_rate $LR \
96 | --num_train_epochs $EPOCH \
97 | --max_seq_length $MAXL \
98 | --output_dir $OUTPUT_DIR \
99 | --task_name $TASK \
100 | --save_steps -1 \
101 | --overwrite_output_dir \
102 | --evaluate_during_training \
103 | --evaluate_steps $EVALUATE_STEPS \
104 | --logging_steps 50 \
105 | --logging_steps_in_sample -1 \
106 | --logging_each_epoch \
107 | --gpu_id 0 \
108 | --seed $SEED \
109 | --fp16 --fp16_opt_level O2 \
110 | --warmup_steps -1 \
111 | --enable_r1_loss \
112 | --r1_lambda $R1_LAMBDA \
113 | --original_loss \
114 | --overall_ratio 1.0 \
115 | --enable_code_switch \
116 | --code_switch_ratio $CSR \
117 | --dict_dir $DATA_DIR/dicts \
118 | --dict_languages ar,bg,de,el,es,fr,hi,ru,sw,th,tr,ur,vi,zh \
119 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \
120 | --enable_data_augmentation \
121 | --augment_ratio 1.0 \
122 | --augment_method cs \
123 | --r2_lambda $R2_LAMBDA
124 | fi
--------------------------------------------------------------------------------
/scripts/cross-lingual-transfer/train_xquad.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | cp -r $DATA_DIR/squad/ $DATA_DIR/xquad/squad1.1/
27 |
28 | TASK='xquad'
29 | MODEL_PATH=$DATA_DIR/$MODEL
30 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/
31 |
32 | EPOCH=4
33 | MAXL=384
34 | LANGS="ar,de,el,en,es,hi,ru,th,tr,vi,zh"
35 | BSR=0.3
36 | SA=0.3
37 | SNBS=-1
38 | CSR=0.3
39 | R1_LAMBDA=5.0
40 | R2_LAMBDA=5.0
41 | if [ $MODEL == "xlm-roberta-large" ]; then
42 | BATCH_SIZE=4
43 | GRAD_ACC=8
44 | LR=1.5e-5
45 | else
46 | BATCH_SIZE=32
47 | GRAD_ACC=1
48 | LR=3e-5
49 | fi
50 |
51 | if [ $STAGE == 1 ]; then
52 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/"
53 | python ./src/run_qa.py --model_type xlmr \
54 | --task_name $TASK \
55 | --model_name_or_path $MODEL_PATH \
56 | --do_train \
57 | --do_eval \
58 | --language $LANGS \
59 | --train_language en \
60 | --data_dir $DATA_DIR/$TASK/ \
61 | --per_gpu_train_batch_size $BATCH_SIZE \
62 | --gradient_accumulation_steps $GRAD_ACC \
63 | --per_gpu_eval_batch_size 128 \
64 | --learning_rate $LR \
65 | --num_train_epochs $EPOCH \
66 | --save_steps 0 \
67 | --logging_each_epoch \
68 | --max_seq_length $MAXL \
69 | --doc_stride 128 \
70 | --output_dir $OUTPUT_DIR \
71 | --overwrite_output_dir \
72 | --evaluate_during_training \
73 | --logging_steps 50 \
74 | --evaluate_steps 0 \
75 | --seed $SEED \
76 | --fp16 --fp16_opt_level O2 \
77 | --warmup_steps -1 \
78 | --enable_r1_loss \
79 | --r1_lambda $R1_LAMBDA \
80 | --original_loss \
81 | --overall_ratio 1.0 \
82 | --keep_boundary_unchanged \
83 | --enable_code_switch \
84 | --code_switch_ratio $CSR \
85 | --dict_dir $DATA_DIR/dicts \
86 | --dict_languages ar,de,el,es,hi,ru,th,tr,vi,zh \
87 | --noised_max_seq_length $MAXL
88 | elif [ $STAGE == 2 ]; then
89 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/"
90 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-SS-R2_Lambda${R2_LAMBDA}/"
91 | python ./src/run_qa.py --model_type xlmr \
92 | --task_name $TASK \
93 | --model_name_or_path $MODEL_PATH \
94 | --do_train \
95 | --do_eval \
96 | --language $LANGS \
97 | --train_language en \
98 | --data_dir $DATA_DIR/$TASK/ \
99 | --per_gpu_train_batch_size $BATCH_SIZE \
100 | --gradient_accumulation_steps $GRAD_ACC \
101 | --per_gpu_eval_batch_size 128 \
102 | --learning_rate $LR \
103 | --num_train_epochs $EPOCH \
104 | --save_steps 0 \
105 | --logging_each_epoch \
106 | --max_seq_length $MAXL \
107 | --doc_stride 128 \
108 | --output_dir $OUTPUT_DIR \
109 | --overwrite_output_dir \
110 | --evaluate_during_training \
111 | --logging_steps 50 \
112 | --evaluate_steps 0 \
113 | --seed $SEED \
114 | --fp16 --fp16_opt_level O2 \
115 | --warmup_steps -1 \
116 | --enable_r1_loss \
117 | --r1_lambda $R1_LAMBDA \
118 | --original_loss \
119 | --overall_ratio 1.0 \
120 | --keep_boundary_unchanged \
121 | --enable_bpe_sampling \
122 | --bpe_sampling_ratio $BSR \
123 | --sampling_alpha $SA \
124 | --sampling_nbest_size $SNBS \
125 | --noised_max_seq_length $MAXL \
126 | --enable_data_augmentation \
127 | --augment_ratio 1.0 \
128 | --augment_method ss \
129 | --max_steps 24000 \
130 | --r2_lambda $R2_LAMBDA \
131 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH
132 | fi
133 |
134 |
135 |
--------------------------------------------------------------------------------
/scripts/download_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | DIR=$REPO/download/
18 | mkdir -p $DIR
19 |
20 | # download xlm-roberta-base
21 | function download_xlm-roberta-base {
22 | mkdir -p $DIR/xlm-roberta-base/
23 | cd $DIR/xlm-roberta-base/
24 | wget https://huggingface.co/xlm-roberta-base/resolve/main/pytorch_model.bin -q --show-progress
25 | wget https://huggingface.co/xlm-roberta-base/resolve/main/config.json -q --show-progress
26 | wget https://huggingface.co/xlm-roberta-base/resolve/main/sentencepiece.bpe.model -q --show-progress
27 | wget https://huggingface.co/xlm-roberta-base/resolve/main/tokenizer.json -q --show-progress
28 | echo "Successfully downloaded xlm-roberta-base at $DIR/xlm-roberta-base" >> $DIR/download_model.log
29 | }
30 |
31 | # download xlm-roberta-large
32 | function download_xlm-roberta-large {
33 | mkdir -p $DIR/xlm-roberta-large/
34 | cd $DIR/xlm-roberta-large/
35 | wget https://huggingface.co/xlm-roberta-large/resolve/main/pytorch_model.bin -q --show-progress
36 | wget https://huggingface.co/xlm-roberta-large/resolve/main/config.json -q --show-progress
37 | wget https://huggingface.co/xlm-roberta-large/resolve/main/sentencepiece.bpe.model -q --show-progress
38 | wget https://huggingface.co/xlm-roberta-large/resolve/main/tokenizer.json -q --show-progress
39 | echo "Successfully downloaded xlm-roberta-large at $DIR/xlm-roberta-large" >> $DIR/download_model.log
40 | }
41 |
42 | download_xlm-roberta-base
43 | download_xlm-roberta-large
44 |
--------------------------------------------------------------------------------
/scripts/preprocess_panx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-bert-base-multilingual-cased}
18 | DATA_DIR=${2:-"$REPO/download/"}
19 |
20 | TASK='panx'
21 | MAXL=128
22 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu"
23 | LC=""
24 | if [ $MODEL == "bert-base-multilingual-cased" ]; then
25 | MODEL_TYPE="bert"
26 | elif [ $MODEL == "xlm-mlm-100-1280" ] || [ $MODEL == "xlm-mlm-tlm-xnli15-1024" ]; then
27 | MODEL_TYPE="xlm"
28 | LC=" --do_lower_case"
29 | elif [ $MODEL == "xlm-roberta-large" ] || [ $MODEL == "xlm-roberta-base" ]; then
30 | MODEL_TYPE="xlmr"
31 | fi
32 | SAVE_DIR="$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAXL}"
33 | mkdir -p $SAVE_DIR
34 | python3 $REPO/utils_preprocess.py \
35 | --data_dir $DATA_DIR/$TASK/ \
36 | --task panx_tokenize \
37 | --model_name_or_path $MODEL \
38 | --model_type $MODEL_TYPE \
39 | --max_len $MAXL \
40 | --output_dir $SAVE_DIR \
41 | --languages $LANGS $LC >> $SAVE_DIR/preprocess.log
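# Build labels.txt from the label column of the tokenized files if it does not exist yet.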
42 | if [ ! -f $SAVE_DIR/labels.txt ]; then
43 | cat $SAVE_DIR/*/*.${MODEL} | cut -f 2 | grep -v "^$" | sort | uniq > $SAVE_DIR/labels.txt
44 | fi
45 |
--------------------------------------------------------------------------------
/scripts/preprocess_udpos.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-bert-base-multilingual-cased}
18 | DATA_DIR=${2:-"$REPO/download/"}
19 |
20 | TASK='udpos'
21 | MAXL=128
22 | LANGS='af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh'
23 | LC=""
24 | if [ $MODEL == "bert-base-multilingual-cased" ]; then
25 | MODEL_TYPE="bert"
26 | elif [ $MODEL == "xlm-mlm-100-1280" ] || [ $MODEL == "xlm-mlm-tlm-xnli15-1024" ]; then
27 | MODEL_TYPE="xlm"
28 | LC=" --do_lower_case"
29 | elif [ $MODEL == "xlm-roberta-large" ] || [ $MODEL == "xlm-roberta-base" ]; then
30 | MODEL_TYPE="xlmr"
31 | fi
32 |
33 | SAVE_DIR="$DATA_DIR/${TASK}/udpos_processed_maxlen${MAXL}"
34 | mkdir -p $SAVE_DIR
35 | python3 $REPO/utils_preprocess.py \
36 | --data_dir $DATA_DIR/${TASK}/ \
37 | --task udpos_tokenize \
38 | --model_name_or_path $MODEL \
39 | --model_type $MODEL_TYPE \
40 | --max_len $MAXL \
41 | --output_dir $SAVE_DIR \
42 | --languages $LANGS $LC >> $SAVE_DIR/process.log
43 | if [ ! -f $SAVE_DIR/labels.txt ]; then
44 | echo "create label"
45 | cat $SAVE_DIR/*/*.${MODEL} | cut -f 2 | grep -v "^$" | sort | uniq > $SAVE_DIR/labels.txt
46 | fi
47 |
--------------------------------------------------------------------------------
/scripts/train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | SETTING=${1:-cross-lingual-transfer}
18 | TASK=${2:-xnli}
19 | MODEL=${3:-"xlm-roberta-base"}
20 | STAGE=${4:-1}
21 | GPU=${5:-0}
22 | DATA_DIR=${6:-"$REPO/download/"}
23 | OUT_DIR=${7:-"$REPO/outputs/"}
24 | SEED=${8:-1}
25 |
26 | echo "Fine-tuning $MODEL on $TASK using GPU $GPU in STAGE $STAGE with SETTING $SETTING"
27 | echo "Load data from $DATA_DIR, and save models to $OUT_DIR"
28 |
29 | if [ $TASK == "udpos" ]; then
30 | bash $REPO/scripts/preprocess_udpos.sh $MODEL $DATA_DIR
31 | elif [ $TASK == "panx" ]; then
32 | bash $REPO/scripts/preprocess_panx.sh $MODEL $DATA_DIR
33 | fi
34 |
35 | bash $REPO/scripts/$SETTING/train_${TASK}.sh $MODEL $STAGE $GPU $DATA_DIR $OUT_DIR $SEED
--------------------------------------------------------------------------------
/scripts/translate-train-all/train_mlqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 |
27 | cp -r $DATA_DIR/squad/ $DATA_DIR/mlqa/squad1.1/
28 |
29 | TASK='mlqa'
30 |
31 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/
32 | MODEL_PATH=$DATA_DIR/$MODEL
33 |
34 | EPOCH=4
35 | MAXL=384
36 | LANGS="en,es,de,ar,hi,vi,zh"
37 | BSR=0.3
38 | SA=0.3
39 | SNBS=-1
40 | CSR=0.3
41 | R1_LAMBDA=5.0
42 | R2_LAMBDA=0.5
43 | if [ $MODEL == "xlm-roberta-large" ]; then
44 | BATCH_SIZE=4
45 | GRAD_ACC=8
46 | LR=1.5e-5
47 | else
48 | BATCH_SIZE=32
49 | GRAD_ACC=1
50 | LR=3e-5
51 | fi
52 |
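# Unlike the cross-lingual-transfer variant, Stage 2 here augments the training set with
# machine-translated data (--augment_method mt) read from $TRANSLATION_PATH, on top of the
# R1/R2 consistency losses.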
53 | if [ $STAGE == 1 ]; then
54 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/"
55 | python ./src/run_qa.py --model_type xlmr \
56 | --task_name $TASK \
57 | --model_name_or_path $MODEL_PATH \
58 | --do_train \
59 | --do_eval \
60 | --language $LANGS \
61 | --train_language en \
62 | --data_dir $DATA_DIR/$TASK/ \
63 | --per_gpu_train_batch_size $BATCH_SIZE \
64 | --gradient_accumulation_steps $GRAD_ACC \
65 | --per_gpu_eval_batch_size 128 \
66 | --learning_rate $LR \
67 | --num_train_epochs $EPOCH \
68 | --save_steps 0 \
69 | --logging_each_epoch \
70 | --max_seq_length $MAXL \
71 | --doc_stride 128 \
72 | --output_dir $OUTPUT_DIR \
73 | --overwrite_output_dir \
74 | --evaluate_during_training \
75 | --logging_steps 50 \
76 | --evaluate_steps 0 \
77 | --seed $SEED \
78 | --fp16 --fp16_opt_level O2 \
79 | --warmup_steps -1 \
80 | --enable_r1_loss \
81 | --r1_lambda $R1_LAMBDA \
82 | --original_loss \
83 | --overall_ratio 1.0 \
84 | --keep_boundary_unchanged \
85 | --enable_code_switch \
86 | --code_switch_ratio $CSR \
87 | --dict_dir $DATA_DIR/dicts \
88 | --dict_languages es,de,ar,hi,vi,zh \
89 | --noised_max_seq_length $MAXL
90 | elif [ $STAGE == 2 ]; then
91 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/"
92 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/"
93 | python ./src/run_qa.py --model_type xlmr \
94 | --task_name $TASK \
95 | --model_name_or_path $MODEL_PATH \
96 | --do_train \
97 | --do_eval \
98 | --language $LANGS \
99 | --train_language en \
100 | --data_dir $DATA_DIR/$TASK/ \
101 | --per_gpu_train_batch_size $BATCH_SIZE \
102 | --gradient_accumulation_steps $GRAD_ACC \
103 | --per_gpu_eval_batch_size 128 \
104 | --learning_rate $LR \
105 | --num_train_epochs $EPOCH \
106 | --save_steps 0 \
107 | --logging_each_epoch \
108 | --max_seq_length $MAXL \
109 | --doc_stride 128 \
110 | --output_dir $OUTPUT_DIR \
111 | --overwrite_output_dir \
112 | --evaluate_during_training \
113 | --logging_steps 50 \
114 | --evaluate_steps 0 \
115 | --seed $SEED \
116 | --fp16 --fp16_opt_level O2 \
117 | --warmup_steps -1 \
118 | --enable_r1_loss \
119 | --r1_lambda $R1_LAMBDA \
120 | --original_loss \
121 | --overall_ratio 1.0 \
122 | --keep_boundary_unchanged \
123 | --enable_bpe_sampling \
124 | --bpe_sampling_ratio $BSR \
125 | --sampling_alpha $SA \
126 | --sampling_nbest_size $SNBS \
127 | --noised_max_seq_length $MAXL \
128 | --enable_data_augmentation \
129 | --augment_ratio 1.0 \
130 | --augment_method mt \
131 | --translation_path $TRANSLATION_PATH \
132 | --max_steps 24000 \
133 | --r2_lambda $R2_LAMBDA \
134 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH
135 | fi
136 |
137 |
138 |
--------------------------------------------------------------------------------
/scripts/translate-train-all/train_panx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | TASK='panx'
27 | MODEL_PATH=$DATA_DIR/$MODEL
28 | EPOCH=10
29 | MAX_LENGTH=128
30 | LANGS="ar,he,vi,id,jv,ms,tl,eu,ml,ta,te,af,nl,en,de,el,bn,hi,mr,ur,fa,fr,it,pt,es,bg,ru,ja,ka,ko,th,sw,yo,my,zh,kk,tr,et,fi,hu"
31 | EVALUATE_STEPS=1000
32 | BSR=0.3
33 | SA=0.3
34 | SNBS=-1
35 | R1_LAMBDA=5.0
36 | R2_LAMBDA=1.0
37 | if [ $MODEL == "xlm-roberta-large" ]; then
38 | BATCH_SIZE=32
39 | GRAD_ACC=1
40 | LR=7e-6
41 | else
42 | BATCH_SIZE=32
43 | GRAD_ACC=1
44 | LR=1e-5
45 | fi
46 |
47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.panx.txt
48 |
49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/
50 |
51 |
52 | if [ $STAGE == 1 ]; then
53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/"
54 | python src/run_tag.py --model_type xlmr \
55 | --model_name_or_path $MODEL_PATH \
56 | --do_train \
57 | --do_eval \
58 | --do_predict \
59 | --do_predict_dev \
60 | --predict_langs $LANGS \
61 | --train_langs en \
62 | --data_dir $DATA_DIR \
63 | --labels $DATA_DIR/labels.txt \
64 | --per_gpu_train_batch_size $BATCH_SIZE \
65 | --gradient_accumulation_steps $GRAD_ACC \
66 | --per_gpu_eval_batch_size 128 \
67 | --learning_rate $LR \
68 | --num_train_epochs $EPOCH \
69 | --max_seq_length $MAX_LENGTH \
70 | --noised_max_seq_length $MAX_LENGTH \
71 | --output_dir $OUTPUT_DIR \
72 | --overwrite_output_dir \
73 | --evaluate_during_training \
74 | --logging_steps 50 \
75 | --evaluate_steps $EVALUATE_STEPS \
76 | --seed $SEED \
77 | --warmup_steps -1 \
78 | --save_only_best_checkpoint \
79 | --eval_all_checkpoints \
80 | --eval_patience -1 \
81 | --fp16 --fp16_opt_level O2 \
82 | --hidden_dropout_prob 0.1 \
83 | --original_loss \
84 | --enable_r1_loss \
85 | --r1_lambda $R1_LAMBDA \
86 | --use_token_label_probs \
87 | --enable_bpe_sampling \
88 | --bpe_sampling_ratio $BSR \
89 | --sampling_alpha $SA \
90 | --sampling_nbest_size $SNBS
91 | elif [ $STAGE == 2 ]; then
92 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best"
93 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/"
94 | python src/run_tag.py --model_type xlmr \
95 | --model_name_or_path $MODEL_PATH \
96 | --do_train \
97 | --do_eval \
98 | --do_predict \
99 | --do_predict_dev \
100 | --predict_langs $LANGS \
101 | --train_langs en \
102 | --data_dir $DATA_DIR \
103 | --labels $DATA_DIR/labels.txt \
104 | --per_gpu_train_batch_size $BATCH_SIZE \
105 | --gradient_accumulation_steps $GRAD_ACC \
106 | --per_gpu_eval_batch_size 128 \
107 | --learning_rate $LR \
108 | --num_train_epochs $EPOCH \
109 | --max_seq_length $MAX_LENGTH \
110 | --noised_max_seq_length $MAX_LENGTH \
111 | --output_dir $OUTPUT_DIR \
112 | --overwrite_output_dir \
113 | --evaluate_during_training \
114 | --logging_steps 50 \
115 | --evaluate_steps $EVALUATE_STEPS \
116 | --seed $SEED \
117 | --warmup_steps -1 \
118 | --save_only_best_checkpoint \
119 | --eval_all_checkpoints \
120 | --eval_patience -1 \
121 | --fp16 --fp16_opt_level O2 \
122 | --hidden_dropout_prob 0.1 \
123 | --original_loss \
124 | --enable_r1_loss \
125 | --r1_lambda $R1_LAMBDA \
126 | --use_token_label_probs \
127 | --enable_bpe_sampling \
128 | --bpe_sampling_ratio $BSR \
129 | --sampling_alpha $SA \
130 | --sampling_nbest_size $SNBS \
131 | --enable_data_augmentation \
132 | --augment_ratio 1.0 \
133 | --augment_method mt \
134 | --translation_path $TRANSLATION_PATH \
135 | --r2_lambda $R2_LAMBDA \
136 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \
137 | --use_hard_labels
138 | fi
--------------------------------------------------------------------------------
/scripts/translate-train-all/train_pawsx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | TASK='pawsx'
27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/PAWSX/
28 | MODEL_PATH=$DATA_DIR/$MODEL
29 | EPOCH=10
30 | MAXL=256
31 | LANGS="de,en,es,fr,ja,ko,zh"
32 | EVALUATE_STEPS=1000
33 | R1_LAMBDA=5.0
34 | R2_LAMBDA=1.0
35 | if [ $MODEL == "xlm-roberta-large" ]; then
36 | BATCH_SIZE=16
37 | GRAD_ACC=2
38 | LR=1e-5
39 | else
40 | BATCH_SIZE=32
41 | GRAD_ACC=1
42 | LR=1e-5
43 | fi
44 |
45 | if [ $STAGE == 1 ]; then
46 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/"
47 | mkdir -p $OUTPUT_DIR
48 | python ./src/run_cls.py --model_type xlmr \
49 | --model_name_or_path $MODEL_PATH \
50 | --language $LANGS \
51 | --train_language en \
52 | --do_train \
53 | --data_dir $DATA_DIR/$TASK/ \
54 | --per_gpu_train_batch_size $BATCH_SIZE \
55 | --gradient_accumulation_steps $GRAD_ACC \
56 | --per_gpu_eval_batch_size 64 \
57 | --learning_rate $LR \
58 | --num_train_epochs $EPOCH \
59 | --max_seq_length $MAXL \
60 | --output_dir $OUTPUT_DIR \
61 | --task_name $TASK \
62 | --save_steps -1 \
63 | --overwrite_output_dir \
64 | --evaluate_during_training \
65 | --evaluate_steps $EVALUATE_STEPS \
66 | --logging_steps 50 \
67 | --logging_steps_in_sample -1 \
68 | --logging_each_epoch \
69 | --gpu_id 0 \
70 | --seed $SEED \
71 | --fp16 --fp16_opt_level O2 \
72 | --warmup_steps -1 \
73 | --enable_r1_loss \
74 | --r1_lambda $R1_LAMBDA \
75 | --original_loss \
76 | --enable_translate_data \
77 | --translation_path $TRANSLATION_PATH
78 | elif [ $STAGE == 2 ]; then
79 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/checkpoint-best"
80 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/"
81 | mkdir -p $OUTPUT_DIR
82 | python ./src/run_cls.py --model_type xlmr \
83 | --model_name_or_path $MODEL_PATH \
84 | --language $LANGS \
85 | --train_language en \
86 | --do_train \
87 | --data_dir $DATA_DIR/$TASK/ \
88 | --per_gpu_train_batch_size $BATCH_SIZE \
89 | --gradient_accumulation_steps $GRAD_ACC \
90 | --per_gpu_eval_batch_size 64 \
91 | --learning_rate $LR \
92 | --num_train_epochs $EPOCH \
93 | --max_seq_length $MAXL \
94 | --output_dir $OUTPUT_DIR \
95 | --task_name $TASK \
96 | --save_steps -1 \
97 | --overwrite_output_dir \
98 | --evaluate_during_training \
99 | --evaluate_steps $EVALUATE_STEPS \
100 | --logging_steps 50 \
101 | --logging_steps_in_sample -1 \
102 | --logging_each_epoch \
103 | --gpu_id 0 \
104 | --seed $SEED \
105 | --fp16 --fp16_opt_level O2 \
106 | --warmup_steps -1 \
107 | --enable_r1_loss \
108 | --r1_lambda $R1_LAMBDA \
109 | --original_loss \
110 | --enable_translate_data \
111 | --translation_path $TRANSLATION_PATH \
112 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \
113 | --enable_data_augmentation \
114 | --augment_ratio 1.0 \
115 | --augment_method mt \
116 | --r2_lambda $R2_LAMBDA
117 | fi
118 |
119 |
120 |
--------------------------------------------------------------------------------
/scripts/translate-train-all/train_tydiqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | TASK='tydiqa'
27 | MODEL_PATH=$DATA_DIR/$MODEL
28 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/TyDiQA-GoldP/translate-train/
29 |
30 |
31 | MAXL=384
32 | LANGS="en,ar,bn,fi,id,ko,ru,sw,te"
33 | BSR=0.3
34 | SA=0.3
35 | SNBS=-1
36 | R1_LAMBDA=5.0
37 | R2_LAMBDA=0.3
38 | if [ $MODEL == "xlm-roberta-large" ]; then
39 | BATCH_SIZE=4
40 | GRAD_ACC=8
41 | LR=1.5e-5
42 | EPOCH=10
43 | MAX_STEPS=2500
44 | else
45 | BATCH_SIZE=32
46 | GRAD_ACC=1
47 | LR=3e-5
48 | EPOCH=20
49 | MAX_STEPS=5000
50 | fi
51 |
52 | if [ $STAGE == 1 ]; then
53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/"
54 | python ./src/run_qa.py --model_type xlmr \
55 | --task_name $TASK \
56 | --model_name_or_path $MODEL_PATH \
57 | --do_train \
58 | --do_eval \
59 | --language $LANGS \
60 | --train_language en \
61 | --data_dir $DATA_DIR/$TASK/ \
62 | --per_gpu_train_batch_size $BATCH_SIZE \
63 | --gradient_accumulation_steps $GRAD_ACC \
64 | --per_gpu_eval_batch_size 128 \
65 | --learning_rate $LR \
66 | --num_train_epochs $EPOCH \
67 | --save_steps 0 \
68 | --logging_each_epoch \
69 | --max_seq_length $MAXL \
70 | --doc_stride 128 \
71 | --output_dir $OUTPUT_DIR \
72 | --overwrite_output_dir \
73 | --evaluate_during_training \
74 | --logging_steps 50 \
75 | --evaluate_steps 0 \
76 | --seed $SEED \
77 | --fp16 --fp16_opt_level O2 \
78 | --warmup_steps -1 \
79 | --enable_r1_loss \
80 | --r1_lambda $R1_LAMBDA \
81 | --original_loss \
82 | --overall_ratio 1.0 \
83 | --keep_boundary_unchanged \
84 | --enable_bpe_sampling \
85 | --bpe_sampling_ratio $BSR \
86 | --sampling_alpha $SA \
87 | --sampling_nbest_size $SNBS \
88 | --noised_max_seq_length $MAXL
89 | elif [ $STAGE == 2 ]; then
90 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/"
91 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/"
92 | python ./src/run_qa.py --model_type xlmr \
93 | --task_name $TASK \
94 | --model_name_or_path $MODEL_PATH \
95 | --do_train \
96 | --do_eval \
97 | --language $LANGS \
98 | --train_language en \
99 | --data_dir $DATA_DIR/$TASK/ \
100 | --per_gpu_train_batch_size $BATCH_SIZE \
101 | --gradient_accumulation_steps $GRAD_ACC \
102 | --per_gpu_eval_batch_size 128 \
103 | --learning_rate $LR \
104 | --num_train_epochs $EPOCH \
105 | --save_steps 0 \
106 | --logging_each_epoch \
107 | --max_seq_length $MAXL \
108 | --doc_stride 128 \
109 | --output_dir $OUTPUT_DIR \
110 | --overwrite_output_dir \
111 | --evaluate_during_training \
112 | --logging_steps 50 \
113 | --evaluate_steps 0 \
114 | --seed $SEED \
115 | --fp16 --fp16_opt_level O2 \
116 | --warmup_steps -1 \
117 | --enable_r1_loss \
118 | --r1_lambda $R1_LAMBDA \
119 | --original_loss \
120 | --overall_ratio 1.0 \
121 | --keep_boundary_unchanged \
122 | --enable_bpe_sampling \
123 | --bpe_sampling_ratio $BSR \
124 | --sampling_alpha $SA \
125 | --sampling_nbest_size $SNBS \
126 | --noised_max_seq_length $MAXL \
127 | --enable_data_augmentation \
128 | --augment_ratio 1.0 \
129 | --augment_method mt \
130 | --translation_path $TRANSLATION_PATH \
131 | --max_steps $MAX_STEPS \
132 | --r2_lambda $R2_LAMBDA \
133 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH
134 | fi
135 |
136 |
137 |
--------------------------------------------------------------------------------
/scripts/translate-train-all/train_udpos.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | TASK='udpos'
27 | MODEL_PATH=$DATA_DIR/$MODEL
28 | EPOCH=10
29 | MAX_LENGTH=128
30 | LANGS="af,ar,bg,de,el,en,es,et,eu,fa,fi,fr,he,hi,hu,id,it,ja,kk,ko,mr,nl,pt,ru,ta,te,th,tl,tr,ur,vi,yo,zh"
31 | EVALUATE_STEPS=500
32 | BSR=0.5
33 | SA=0.3
34 | SNBS=-1
35 | R1_LAMBDA=5.0
36 | R2_LAMBDA=0.3
37 | if [ $MODEL == "xlm-roberta-large" ]; then
38 | BATCH_SIZE=32
39 | GRAD_ACC=1
40 | LR=5e-6
41 | else
42 | BATCH_SIZE=32
43 | GRAD_ACC=1
44 | LR=2e-5
45 | fi
46 |
47 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/translate_train.udpos.txt
48 |
49 | DATA_DIR=$DATA_DIR/$TASK/${TASK}_processed_maxlen${MAX_LENGTH}/
50 |
51 |
52 | if [ $STAGE == 1 ]; then
53 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/"
54 | python src/run_tag.py --model_type xlmr \
55 | --model_name_or_path $MODEL_PATH \
56 | --do_train \
57 | --do_eval \
58 | --do_predict \
59 | --do_predict_dev \
60 | --predict_langs $LANGS \
61 | --train_langs en \
62 | --data_dir $DATA_DIR \
63 | --labels $DATA_DIR/labels.txt \
64 | --per_gpu_train_batch_size $BATCH_SIZE \
65 | --gradient_accumulation_steps $GRAD_ACC \
66 | --per_gpu_eval_batch_size 128 \
67 | --learning_rate $LR \
68 | --num_train_epochs $EPOCH \
69 | --max_seq_length $MAX_LENGTH \
70 | --noised_max_seq_length $MAX_LENGTH \
71 | --output_dir $OUTPUT_DIR \
72 | --overwrite_output_dir \
73 | --evaluate_during_training \
74 | --logging_steps 50 \
75 | --evaluate_steps $EVALUATE_STEPS \
76 | --seed $SEED \
77 | --warmup_steps -1 \
78 | --save_only_best_checkpoint \
79 | --eval_all_checkpoints \
80 | --eval_patience -1 \
81 | --fp16 --fp16_opt_level O2 \
82 | --hidden_dropout_prob 0.1 \
83 | --original_loss \
84 | --use_pooling_strategy \
85 | --enable_r1_loss \
86 | --r1_lambda $R1_LAMBDA \
87 | --use_token_label_probs \
88 | --enable_bpe_sampling \
89 | --bpe_sampling_ratio $BSR \
90 | --sampling_alpha $SA \
91 | --sampling_nbest_size $SNBS
92 | elif [ $STAGE == 2 ]; then
93 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_LAMBDA${R1_LAMBDA}/checkpoint-best"
94 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAX_LENGTH}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/"
95 | python src/run_tag.py --model_type xlmr \
96 | --model_name_or_path $MODEL_PATH \
97 | --do_train \
98 | --do_eval \
99 | --do_predict \
100 | --do_predict_dev \
101 | --predict_langs $LANGS \
102 | --train_langs en \
103 | --data_dir $DATA_DIR \
104 | --labels $DATA_DIR/labels.txt \
105 | --per_gpu_train_batch_size $BATCH_SIZE \
106 | --gradient_accumulation_steps $GRAD_ACC \
107 | --per_gpu_eval_batch_size 128 \
108 | --learning_rate $LR \
109 | --num_train_epochs $EPOCH \
110 | --max_seq_length $MAX_LENGTH \
111 | --noised_max_seq_length $MAX_LENGTH \
112 | --output_dir $OUTPUT_DIR \
113 | --overwrite_output_dir \
114 | --evaluate_during_training \
115 | --logging_steps 50 \
116 | --evaluate_steps $EVALUATE_STEPS \
117 | --seed $SEED \
118 | --warmup_steps -1 \
119 | --save_only_best_checkpoint \
120 | --eval_all_checkpoints \
121 | --eval_patience -1 \
122 | --fp16 --fp16_opt_level O2 \
123 | --hidden_dropout_prob 0.1 \
124 | --original_loss \
125 | --use_pooling_strategy \
126 | --enable_r1_loss \
127 | --r1_lambda $R1_LAMBDA \
128 | --use_token_label_probs \
129 | --enable_bpe_sampling \
130 | --bpe_sampling_ratio $BSR \
131 | --sampling_alpha $SA \
132 | --sampling_nbest_size $SNBS \
133 | --enable_data_augmentation \
134 | --augment_ratio 1.0 \
135 | --augment_method mt \
136 | --translation_path $TRANSLATION_PATH \
137 | --r2_lambda $R2_LAMBDA \
138 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH
139 | fi
--------------------------------------------------------------------------------
/scripts/translate-train-all/train_xnli.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | TASK='xnli'
27 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/XNLI/
28 | MODEL_PATH=$DATA_DIR/$MODEL
29 | EPOCH=10
30 | MAXL=256
31 | LANGS="ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh"
32 | EVALUATE_STEPS=5000
33 | R1_LAMBDA=5.0
34 | R2_LAMBDA=1.0
35 | if [ $MODEL == "xlm-roberta-large" ]; then
36 | BATCH_SIZE=16
37 | GRAD_ACC=2
38 | LR=5e-6
39 | else
40 | BATCH_SIZE=32
41 | GRAD_ACC=1
42 | LR=7e-6
43 | fi
44 |
45 | if [ $STAGE == 1 ]; then
46 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/"
47 | mkdir -p $OUTPUT_DIR
48 | python ./src/run_cls.py --model_type xlmr \
49 | --model_name_or_path $MODEL_PATH \
50 | --language $LANGS \
51 | --train_language en \
52 | --do_train \
53 | --data_dir $DATA_DIR/$TASK/ \
54 | --per_gpu_train_batch_size $BATCH_SIZE \
55 | --gradient_accumulation_steps $GRAD_ACC \
56 | --per_gpu_eval_batch_size 64 \
57 | --learning_rate $LR \
58 | --num_train_epochs $EPOCH \
59 | --max_seq_length $MAXL \
60 | --output_dir $OUTPUT_DIR \
61 | --task_name $TASK \
62 | --save_steps -1 \
63 | --overwrite_output_dir \
64 | --evaluate_during_training \
65 | --evaluate_steps $EVALUATE_STEPS \
66 | --logging_steps 50 \
67 | --logging_steps_in_sample -1 \
68 | --logging_each_epoch \
69 | --gpu_id 0 \
70 | --seed $SEED \
71 | --fp16 --fp16_opt_level O2 \
72 | --warmup_steps -1 \
73 | --enable_r1_loss \
74 | --r1_lambda $R1_LAMBDA \
75 | --original_loss \
76 | --enable_translate_data \
77 | --translation_path $TRANSLATION_PATH
78 | elif [ $STAGE == 2 ]; then
79 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_LAMBDA${R1_LAMBDA}/checkpoint-best"
80 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-Translate-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/"
81 | mkdir -p $OUTPUT_DIR
82 | python ./src/run_cls.py --model_type xlmr \
83 | --model_name_or_path $MODEL_PATH \
84 | --language $LANGS \
85 | --train_language en \
86 | --do_train \
87 | --data_dir $DATA_DIR/$TASK/ \
88 | --per_gpu_train_batch_size $BATCH_SIZE \
89 | --gradient_accumulation_steps $GRAD_ACC \
90 | --per_gpu_eval_batch_size 64 \
91 | --learning_rate $LR \
92 | --num_train_epochs $EPOCH \
93 | --max_seq_length $MAXL \
94 | --output_dir $OUTPUT_DIR \
95 | --task_name $TASK \
96 | --save_steps -1 \
97 | --overwrite_output_dir \
98 | --evaluate_during_training \
99 | --evaluate_steps $EVALUATE_STEPS \
100 | --logging_steps 50 \
101 | --logging_steps_in_sample -1 \
102 | --logging_each_epoch \
103 | --gpu_id 0 \
104 | --seed $SEED \
105 | --fp16 --fp16_opt_level O2 \
106 | --warmup_steps -1 \
107 | --enable_r1_loss \
108 | --r1_lambda $R1_LAMBDA \
109 | --original_loss \
110 | --enable_translate_data \
111 | --translation_path $TRANSLATION_PATH \
112 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH \
113 | --enable_data_augmentation \
114 | --augment_ratio 1.0 \
115 | --augment_method mt \
116 | --r2_lambda $R2_LAMBDA
117 | fi
--------------------------------------------------------------------------------
/scripts/translate-train-all/train_xquad.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google and DeepMind.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | REPO=$PWD
17 | MODEL=${1:-"xlm-roberta-base"}
18 | STAGE=${2:-1}
19 | GPU=${3:-0}
20 | DATA_DIR=${4:-"$REPO/download/"}
21 | OUT_DIR=${5:-"$REPO/outputs/"}
22 | SEED=${6:-1}
23 |
24 | export CUDA_VISIBLE_DEVICES=$GPU
25 |
26 | cp -r $DATA_DIR/squad/ $DATA_DIR/xquad/squad1.1/
27 |
28 | TASK='xquad'
29 | MODEL_PATH=$DATA_DIR/$MODEL
30 | TRANSLATION_PATH=$DATA_DIR/xtreme_translations/SQuAD/translate-train/
31 |
32 | EPOCH=4
33 | MAXL=384
34 | LANGS="ar,de,el,en,es,hi,ru,th,tr,vi,zh"
35 | BSR=0.3
36 | SA=0.3
37 | SNBS=-1
38 | CSR=0.3
39 | R1_LAMBDA=5.0
40 | R2_LAMBDA=0.1
41 | if [ $MODEL == "xlm-roberta-large" ]; then
42 | BATCH_SIZE=4
43 | GRAD_ACC=8
44 | LR=1.5e-5
45 | else
46 | BATCH_SIZE=32
47 | GRAD_ACC=1
48 | LR=3e-5
49 | fi
50 |
51 | if [ $STAGE == 1 ]; then
52 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/"
53 | python ./src/run_qa.py --model_type xlmr \
54 | --task_name $TASK \
55 | --model_name_or_path $MODEL_PATH \
56 | --do_train \
57 | --do_eval \
58 | --language $LANGS \
59 | --train_language en \
60 | --data_dir $DATA_DIR/$TASK/ \
61 | --per_gpu_train_batch_size $BATCH_SIZE \
62 | --gradient_accumulation_steps $GRAD_ACC \
63 | --per_gpu_eval_batch_size 128 \
64 | --learning_rate $LR \
65 | --num_train_epochs $EPOCH \
66 | --save_steps 0 \
67 | --logging_each_epoch \
68 | --max_seq_length $MAXL \
69 | --doc_stride 128 \
70 | --output_dir $OUTPUT_DIR \
71 | --overwrite_output_dir \
72 | --evaluate_during_training \
73 | --logging_steps 50 \
74 | --evaluate_steps 0 \
75 | --seed $SEED \
76 | --fp16 --fp16_opt_level O2 \
77 | --warmup_steps -1 \
78 | --enable_r1_loss \
79 | --r1_lambda $R1_LAMBDA \
80 | --original_loss \
81 | --overall_ratio 1.0 \
82 | --keep_boundary_unchanged \
83 | --enable_code_switch \
84 | --code_switch_ratio $CSR \
85 | --dict_dir $DATA_DIR/dicts \
86 | --dict_languages ar,de,el,es,hi,ru,th,tr,vi,zh \
87 | --noised_max_seq_length $MAXL
88 | elif [ $STAGE == 2 ]; then
89 | FIRST_STAGE_MODEL_PATH="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-CS-csr${CSR}-R1_LAMBDA${R1_LAMBDA}/"
90 | OUTPUT_DIR="${OUT_DIR}/${TASK}/${MODEL}-LR${LR}-epoch${EPOCH}-MaxLen${MAXL}-SS-bsr${BSR}-sa${SA}-snbs${SNBS}-R1_Lambda${R1_LAMBDA}-Aug1.0-MT-R2_Lambda${R2_LAMBDA}/"
91 | python ./src/run_qa.py --model_type xlmr \
92 | --task_name $TASK \
93 | --model_name_or_path $MODEL_PATH \
94 | --do_train \
95 | --do_eval \
96 | --language $LANGS \
97 | --train_language en \
98 | --data_dir $DATA_DIR/$TASK/ \
99 | --per_gpu_train_batch_size $BATCH_SIZE \
100 | --gradient_accumulation_steps $GRAD_ACC \
101 | --per_gpu_eval_batch_size 128 \
102 | --learning_rate $LR \
103 | --num_train_epochs $EPOCH \
104 | --save_steps 0 \
105 | --logging_each_epoch \
106 | --max_seq_length $MAXL \
107 | --doc_stride 128 \
108 | --output_dir $OUTPUT_DIR \
109 | --overwrite_output_dir \
110 | --evaluate_during_training \
111 | --logging_steps 50 \
112 | --evaluate_steps 0 \
113 | --seed $SEED \
114 | --fp16 --fp16_opt_level O2 \
115 | --warmup_steps -1 \
116 | --enable_r1_loss \
117 | --r1_lambda $R1_LAMBDA \
118 | --original_loss \
119 | --overall_ratio 1.0 \
120 | --keep_boundary_unchanged \
121 | --enable_bpe_sampling \
122 | --bpe_sampling_ratio $BSR \
123 | --sampling_alpha $SA \
124 | --sampling_nbest_size $SNBS \
125 | --noised_max_seq_length $MAXL \
126 | --enable_data_augmentation \
127 | --augment_ratio 1.0 \
128 | --augment_method mt \
129 | --translation_path $TRANSLATION_PATH \
130 | --max_steps 24000 \
131 | --r2_lambda $R2_LAMBDA \
132 | --first_stage_model_path $FIRST_STAGE_MODEL_PATH
133 | fi
134 |
135 |
136 |
--------------------------------------------------------------------------------
/src/pequod/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/__init__.py
--------------------------------------------------------------------------------
/src/pequod/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/data/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from transformers.data.processors.utils import InputFeatures
3 |
4 |
5 | logger = logging.getLogger(__name__)
6 |
7 |
8 | def convert_examples_to_features(
9 | processor, examples, tokenizer, max_length, label_list,
10 | pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True):
11 |
12 | if label_list is None: label_list = processor.get_labels()
13 |
14 | label_map = {label: i for i, label in enumerate(label_list)}
15 |
16 | features = []
17 | for ex_index, example in enumerate(examples):
18 | if ex_index % 10000 == 0:
19 | logger.info("Writing example %d" % ex_index)
20 | inputs = tokenizer.encode_plus(
21 | example.text_a,
22 | example.text_b,
23 | add_special_tokens=True,
24 | max_length=max_length)
25 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
26 |
27 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
28 |
29 | padding_length = max_length - len(input_ids)
30 | input_ids = input_ids + ([pad_token] * padding_length)
31 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
32 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
33 |
34 | assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
35 | assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
36 | assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
37 |
38 | label = label_map[example.label]
39 | if ex_index < 3:
40 | logger.info("*** Example ***")
41 | logger.info("guid: %s" % (example.guid))
42 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
43 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
44 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
45 | logger.info("label: %s (id = %d)" % (example.label, label))
46 |
47 | features.append(InputFeatures(
48 | input_ids=input_ids,
49 | attention_mask=attention_mask,
50 | token_type_ids=token_type_ids,
51 | label=label))
52 |
53 | return features
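
Usage sketch (not part of the file above): a minimal, hedged example of building padded classification features with this helper. The tokenizer name, texts, and label set are illustrative assumptions, and the bundled transformers package is assumed to be importable.

    from transformers import XLMRobertaTokenizer
    from transformers.data.processors.utils import InputExample
    from src.pequod.data import convert_examples_to_features

    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
    examples = [
        InputExample(guid="dev-0", text_a="A man is eating.",
                     text_b="Someone eats.", label="entailment"),
        InputExample(guid="dev-1", text_a="A man is eating.",
                     text_b="Nobody eats.", label="contradiction"),
    ]
    # processor may be None because an explicit label_list is supplied.
    features = convert_examples_to_features(
        processor=None, examples=examples, tokenizer=tokenizer,
        max_length=64, label_list=["contradiction", "entailment", "neutral"],
        pad_token=tokenizer.pad_token_id)
    print(len(features[0].input_ids))  # 64, padded up to max_length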
--------------------------------------------------------------------------------
/src/pequod/data/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/data/__pycache__/sampler.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/sampler.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/data/__pycache__/utils_squad.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/utils_squad.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/data/__pycache__/utils_squad_evaluate.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/utils_squad_evaluate.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/data/__pycache__/xdoc.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/xdoc.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/data/__pycache__/xqa.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/xqa.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/data/__pycache__/xretrieval.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/__pycache__/xretrieval.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/data/dataloader.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/data/dataloader.py
--------------------------------------------------------------------------------
/src/pequod/data/sampler.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from torch.utils.data.sampler import Sampler
4 |
5 |
6 | class SubSampler(Sampler):
7 |
8 | def __init__(self, data_source, num_samples):
9 | self.data_source = data_source
10 | self.num_samples = num_samples
11 |
12 | def __len__(self):
13 | return self.num_samples
14 |
15 | def __iter__(self):
16 | n = len(self.data_source)
17 | if self.num_samples <= n:
18 | return iter(torch.randperm(n).tolist()[:self.num_samples])
19 | return iter(torch.randint(high=n, size=(self.num_samples,), dtype=torch.int64).tolist())
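
Usage sketch (not in the original file): drawing a fixed-size random subset of a dataset each epoch; the toy TensorDataset is a stand-in for any torch Dataset.

    import torch
    from torch.utils.data import DataLoader, TensorDataset
    from src.pequod.data.sampler import SubSampler

    dataset = TensorDataset(torch.arange(1000))
    # 100 indices per pass; sampling is with replacement if num_samples > len(dataset)
    loader = DataLoader(dataset, batch_size=32,
                        sampler=SubSampler(dataset, num_samples=100))
    assert sum(batch[0].size(0) for batch in loader) == 100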
--------------------------------------------------------------------------------
/src/pequod/data/wili.py:
--------------------------------------------------------------------------------
1 | """Loading examples and features for WiLI-2018 dataset"""
2 |
3 | import logging
4 | import os
5 | import torch
6 |
7 | from transformers.data.processors.utils import (DataProcessor,
8 | InputExample, InputFeatures)
9 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
10 | TensorDataset)
11 | from src.pequod.data import convert_examples_to_features
12 | from src.pequod.io import lines_gen
13 |
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | _alias2lang = {}
19 | _lang2id = {}
20 | _langs = []
21 |
22 | def get_alias2lang(data_dir):
23 | if len(_alias2lang) > 0: return _alias2lang, _lang2id, _langs
24 | for line, in lines_gen(os.path.join(data_dir, "labels-new")):
25 | value = None
26 | for alias in line.split(";"):
27 | alias = alias.strip()
28 | if alias == "": continue
29 | if value is None: value = alias
30 | _alias2lang[alias] = value
31 | _langs.append(value)
32 | for i, lang in enumerate(_langs): _lang2id[lang] = i
33 | return _alias2lang, _lang2id, _langs
34 |
35 |
36 | def load_and_cache_examples(args, data_dir, split, run_lang2id, tokenizer, key=""):
37 | cache_filename = os.path.join(
38 | data_dir, "cached_%s_%s" % (split, key))
39 |
40 | if os.path.exists(cache_filename) and not args.overwrite_cache:
41 | logger.info("Loading features from cached file %s" % cache_filename)
42 | features = torch.load(cache_filename)
43 | else:
44 | processor = WiliProcessor()
45 | logger.info("Creating features from dataset file at %s" % data_dir)
46 | label_list = processor.get_labels(data_dir)
47 | examples = processor.get_examples(data_dir, split)
48 | logger.info("%d Examples loaded" % len(examples))
49 | features = convert_examples_to_features(
50 | processor, examples, tokenizer, max_length=args.max_seq_length,
51 | label_list=label_list, pad_token_segment_id=0,
52 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0])
53 | logger.info("Saving features to cache file %s" % cache_filename)
54 | torch.save(features, cache_filename)
55 |
56 | # Cut dataset to test langs
57 | alias2lang, lang2id, _ = get_alias2lang(data_dir)
58 | test_lang_ids = {lang2id[alias2lang[lang]] for lang in run_lang2id.keys()}
59 | wili_id2run_langid = {
60 | lang2id[alias2lang[lang]]:val for lang, val in run_lang2id.items()}
61 |
62 | all_input_ids, all_attention_mask = [], []
63 | all_token_type_ids, all_labels = [], []
64 | for f in features:
65 | if f.label not in test_lang_ids: continue
66 | all_input_ids.append(f.input_ids)
67 | all_attention_mask.append(f.attention_mask)
68 | all_token_type_ids.append(f.token_type_ids)
69 | all_labels.append(wili_id2run_langid[f.label])
70 |
71 | all_input_ids = torch.tensor(all_input_ids, dtype=torch.long)
72 | all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long)
73 | all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long)
74 | all_labels = torch.tensor(all_labels, dtype=torch.long)
75 |
76 | dataset = TensorDataset(
77 | all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
78 |
79 | return dataset
80 |
81 |
82 | class WiliProcessor(DataProcessor):
83 |
84 | def get_examples(self, data_dir, split):
85 | examples = []
86 | filename_x = os.path.join(data_dir, "x_%s.txt" % split)
87 | filename_y = os.path.join(data_dir, "y_%s.txt" % split)
88 | for i, (line_x, line_y) in enumerate(lines_gen(filename_x, filename_y)):
89 | guid = "%s-%s" % (split, i)
90 | examples.append(
91 | InputExample(guid=guid, text_a=line_x, text_b=None, label=line_y))
92 | return examples
93 |
94 | def get_labels(self, data_dir):
95 | _, _, langs = get_alias2lang(data_dir)
96 | return langs
97 |
--------------------------------------------------------------------------------
/src/pequod/data/xqa.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import torch
4 |
5 | from torch.utils.data import TensorDataset
6 | from src.pequod.data.utils_squad import (read_squad_examples,
7 | convert_examples_to_features)
8 |
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | def load_and_cache_examples(args, split, lang, tokenizer, key="", evaluate=False):
14 | cache_filename = os.path.join(
15 | args.data_dir, "cached_%s_%s_%s" % (split, lang, key))
16 |
17 | input_file = os.path.join(args.data_dir, "%s-%s.json" % (split, lang))
18 | if os.path.exists(cache_filename):
19 | logger.info("Loading features from cached file %s", cache_filename)
20 | features = torch.load(cache_filename)
21 | if evaluate:
22 | examples = read_squad_examples(input_file=input_file,
23 | is_training=not evaluate,
24 | version_2_with_negative=args.version_2_with_negative)
25 | else: examples = None
26 | else:
27 | logger.info("Creating features from dataset file at %s", input_file)
28 | examples = read_squad_examples(input_file=input_file,
29 | is_training=not evaluate,
30 | version_2_with_negative=args.version_2_with_negative)
31 | features = convert_examples_to_features(examples=examples,
32 | tokenizer=tokenizer, max_seq_length=args.max_seq_length,
33 | doc_stride=args.doc_stride, max_query_length=args.max_query_length,
34 | is_training=not evaluate, cls_token=tokenizer.cls_token,
35 | sep_token=tokenizer.sep_token)
36 | logger.info("Saving features into cached file %s", cache_filename)
37 | torch.save(features, cache_filename)
38 |
39 | # Convert to Tensors and build dataset
40 | all_input_ids = torch.tensor(
41 | [f.input_ids for f in features], dtype=torch.long)
42 | all_input_mask = torch.tensor(
43 | [f.input_mask for f in features], dtype=torch.long)
44 | all_segment_ids = torch.tensor(
45 | [f.segment_ids for f in features], dtype=torch.long)
46 | all_cls_index = torch.tensor(
47 | [f.cls_index for f in features], dtype=torch.long)
48 | all_p_mask = torch.tensor(
49 | [f.p_mask for f in features], dtype=torch.float)
50 | if evaluate:
51 | all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
52 | dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
53 | all_example_index, all_cls_index, all_p_mask)
54 | else:
55 | all_start_positions = torch.tensor(
56 | [f.start_position for f in features], dtype=torch.long)
57 | all_end_positions = torch.tensor(
58 | [f.end_position for f in features], dtype=torch.long)
59 | dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
60 | all_start_positions, all_end_positions, all_cls_index, all_p_mask)
61 |
62 | return dataset, examples, features
63 |
--------------------------------------------------------------------------------
/src/pequod/data/xretrieval.py:
--------------------------------------------------------------------------------
1 | """Load examples from BUCC"""
2 |
3 |
4 | import logging
5 | import os
6 | import torch
7 |
8 |
9 | from transformers.data.processors.utils import (
10 | DataProcessor, InputExample, InputFeatures)
11 | from torch.utils.data import (
12 | DataLoader, RandomSampler, SequentialSampler, TensorDataset)
13 |
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | def load_and_cache_examples(args, langpair, lang, tokenizer, key="", prefix="tatoeba"):
19 |
20 | cache_dir = os.path.join(args.data_dir, "pequod_cache")
21 | os.makedirs(cache_dir, exist_ok=True)
22 | cache_filename = os.path.join(
23 | cache_dir, "cached_%s_%s_%s" % (langpair, lang, key))
24 |
25 | if os.path.exists(cache_filename) and not args.overwrite_cache:
26 | logger.info("Loading features from cached file %s" % cache_filename)
27 | features = torch.load(cache_filename)
28 | else:
29 | processer = TatoebaProcesser()
30 | logger.info("Creating features from dataset file at %s" % args.data_dir)
31 | examples = processer.get_examples(args.data_dir, langpair, lang, prefix)
32 | features = TatoebaProcesser.convert_examples_to_features(
33 | examples, tokenizer, args.max_seq_length, 0,
34 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],)
35 | #logger.info("Saving features to cache file %s" % cache_filename)
36 | #torch.save(features, cache_filename)
37 |
38 | all_input_ids = torch.tensor(
39 | [f.input_ids for f in features], dtype=torch.long)
40 | all_attention_mask = torch.tensor(
41 | [f.attention_mask for f in features], dtype=torch.long)
42 | all_token_type_ids = torch.tensor(
43 | [f.token_type_ids for f in features], dtype=torch.long)
44 |
45 | dataset = TensorDataset(
46 | all_input_ids, all_attention_mask, all_token_type_ids)
47 |
48 | return dataset
49 |
50 | class TatoebaProcesser(DataProcessor):
51 |
52 | @classmethod
53 | def convert_examples_to_features(cls, examples, tokenizer, max_length, pad_token_segment_id, pad_token, mask_padding_with_zero=True):
54 |
55 | features = []
56 | for ex_index, example in enumerate(examples):
57 | inputs = tokenizer.encode_plus(
58 | example.text_a,
59 | None,
60 | add_special_tokens=True,
61 | max_length=max_length,
62 | )
63 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
64 |
65 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
66 |
67 | padding_length = max_length - len(input_ids)
68 | input_ids = input_ids + ([pad_token] * padding_length)
69 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
70 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
71 |
72 | assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
73 | assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
74 | assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
75 |
76 | if ex_index < 3:
77 | logger.info("*** Example ***")
78 | logger.info("guid: %s" % (example.guid))
79 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
80 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
81 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
82 |
83 | features.append(InputFeatures(
84 | input_ids=input_ids,
85 | attention_mask=attention_mask,
86 | token_type_ids=token_type_ids,
87 | label=None,
88 | ))
89 |
90 | return features
91 |
92 | def get_examples(self, data_dir, langpair, lang, prefix="tatoeba"):
93 | examples = []
94 | if prefix == "bucc":
95 | fn = os.path.join(data_dir, "%s.%s.txt" % (langpair, lang))
96 | else:
97 | fn = os.path.join(data_dir, "%s.%s" % (langpair, lang))
98 | #fn = os.path.join(data_dir, "%s.%s.%s" % (prefix, langpair, lang))
99 | with open(fn, encoding='utf-8') as fp:
100 | for i, line in enumerate(fp):
101 | line = line.strip()
102 | examples.append(InputExample(
103 | guid="%s-%s-%d" % (langpair, lang, i),
104 | text_a=line,
105 | ))
106 | return examples
107 |
--------------------------------------------------------------------------------
/src/pequod/eval/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import torch
4 | import inspect
5 |
6 |
7 | from src.pequod.data.utils_squad import RawResult, write_predictions
8 | from src.pequod.data.utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
9 |
10 |
11 | def to_list(tensor):
12 | return tensor.detach().cpu().tolist()
13 |
14 |
15 | def score_dict_to_string(score_dict):
16 | return " ".join([("%s:%.2f" % (k, v)) for k, v in score_dict.items()])
17 |
18 |
19 | def score_dicts_to_latex(score_dicts):
20 | keys = [k for k in score_dicts[0]]
21 | return "\n".join([""] + [(
22 | " & ".join([key] + [("%.2f" % (sd[key])) for sd in score_dicts])
23 | ) for key in keys])
24 |
25 |
26 | def eval_classification(model, batch_dict_iter):
27 | model.eval()
28 | preds, labels = None, None
29 | for batch_dict in batch_dict_iter:
30 | label_id = batch_dict["labels"].detach().cpu().numpy()
31 | batch_dict.pop("labels")
32 | with torch.no_grad(): logits = model(**batch_dict)[0]
33 | pred = logits.detach().cpu().numpy()
34 | if preds is None: preds, labels = pred, label_id
35 | else:
36 | preds = np.append(preds, pred, axis=0)
37 | labels = np.append(labels, label_id)
38 | preds = np.argmax(preds, axis=1)
39 | result = (preds == labels).mean()
40 | return {"acc": result*100.0}
41 |
42 |
43 | def eval_qa(model, batch_dict_iter, prefix="", **kwargs):
44 |
45 | features = kwargs["all_features"]
46 | output_dir = kwargs["output_dir"]
47 |
48 | model.eval()
49 | all_results = []
50 | for batch_dict, example_indices in batch_dict_iter:
51 | with torch.no_grad(): outputs = model(**batch_dict)
52 |
53 | for i, example_index in enumerate(example_indices):
54 | eval_feature = features[example_index.item()]
55 | unique_id = int(eval_feature.unique_id)
56 | result = RawResult(unique_id = unique_id,
57 | start_logits = to_list(outputs[0][i]),
58 | end_logits = to_list(outputs[1][i]))
59 | all_results.append(result)
60 |
61 | output_prediction_file = os.path.join(
62 | output_dir, "predictions_{}.json".format(prefix))
63 | output_nbest_file = os.path.join(
64 | output_dir, "nbest_predictions_{}.json".format(prefix))
65 | if kwargs["version_2_with_negative"]:
66 | output_null_log_odds_file = os.path.join(
67 | output_dir, "null_odds_{}.json".format(prefix))
68 | else: output_null_log_odds_file = None
69 |
70 | wrt_pred_kwargs = {
71 | "all_results": all_results,
72 | "output_prediction_file": output_prediction_file,
73 | "output_nbest_file": output_nbest_file,
74 | "output_null_log_odds_file": output_null_log_odds_file}
75 |
76 | for key in inspect.getfullargspec(write_predictions).args:
77 | if key not in wrt_pred_kwargs:
78 | wrt_pred_kwargs[key] = kwargs[key]
79 |
80 | write_predictions(**wrt_pred_kwargs)
81 |
82 | # Evaluate with the official SQuAD script
83 | evaluate_options = EVAL_OPTS(
84 | data_file=kwargs["predict_file"],
85 | pred_file=output_prediction_file,
86 | na_prob_file=output_null_log_odds_file,
87 | out_file="/dev/null")
88 | results = evaluate_on_squad(evaluate_options)
89 | return results
90 |
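Usage sketch (not in the original file): the batch-dict interface that eval_classification expects, exercised with a hypothetical toy classifier. The model must return a tuple whose first element is the logits, and every batch dict must carry a "labels" tensor.

    import torch
    from torch import nn
    from src.pequod.eval import eval_classification

    class ToyClassifier(nn.Module):  # hypothetical stand-in for a real model
        def __init__(self, vocab_size=100, num_labels=3):
            super().__init__()
            self.emb = nn.Embedding(vocab_size, 16)
            self.out = nn.Linear(16, num_labels)
        def forward(self, input_ids=None, attention_mask=None, **kwargs):
            return (self.out(self.emb(input_ids).mean(dim=1)),)

    batches = [{"input_ids": torch.randint(0, 100, (8, 12)),
                "attention_mask": torch.ones(8, 12, dtype=torch.long),
                "labels": torch.randint(0, 3, (8,))} for _ in range(4)]
    print(eval_classification(ToyClassifier(), batches))  # e.g. {"acc": ...}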
--------------------------------------------------------------------------------
/src/pequod/eval/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/eval/__pycache__/bretrieval.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/bretrieval.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/eval/__pycache__/bucc_eval.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/bucc_eval.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/eval/__pycache__/evaluator.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/evaluator.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/eval/__pycache__/utils_retrieve.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/utils_retrieve.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/eval/__pycache__/xretrieval.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/eval/__pycache__/xretrieval.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/eval/evaluator.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import torch
3 |
4 | from torch.utils.data import DataLoader
5 | from src.pequod.training.trainer import to_cuda
6 |
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 |
11 | class Evaluator(object):
12 |
13 | def __init__(self, args, model, tokenizer, **kwargs):
14 | self.args = args
15 | self.datasets = {}
16 | self.model = model
17 | self.tokenizer = tokenizer
18 |
19 | def _parse_batch(self, batch, has_label=True, **kwargs):
20 | _batch = to_cuda(batch)
21 | # _batch = batch
22 | ret = {"input_ids": _batch[0],
23 | "attention_mask": _batch[1],
24 | "token_type_ids": _batch[2] if self.args.model_type == "bert" else None,}
25 | if has_label: ret["labels"] = _batch[3]
26 | ret.update(**kwargs)
27 | return ret
28 |
29 | def run(self):
30 | raise NotImplementedError
31 |
32 | def get_dataset(self, *args, **kwargs):
33 | if args in self.datasets: return self.datasets[args]
34 | dataset = self.load_and_cache_examples(*args, **kwargs)
35 | self.datasets[args] = dataset
36 | return dataset
37 |
38 | def load_and_cache_examples(self, *args, **kwargs):
39 | raise NotImplementedError
40 |
41 | def get_dataloader(self, *args, **kwargs):
42 | logger.info("Getting dataloader - args: %s" % str(args))
43 | dataset = kwargs.pop("dataset", self.get_dataset(*args, **kwargs))
44 | dataloader = DataLoader(dataset, batch_size=self.args.eval_batch_size)
45 | return dataloader
46 |
--------------------------------------------------------------------------------
/src/pequod/io.py:
--------------------------------------------------------------------------------
1 | """I/O"""
2 |
3 | def _lines_gen_from_single_file(filename):
4 | with open(filename) as fp:
5 | for line in fp: yield line.strip()
6 |
7 |
8 | def lines_gen(*filenames):
9 | for ret in zip(*map(_lines_gen_from_single_file, filenames)): yield ret
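
Usage sketch (file names are made up): lines_gen walks several parallel files in lockstep, yielding one stripped line from each per iteration.

    from src.pequod.io import lines_gen

    # e.g. WiLI-style parallel text/label files
    for text, label in lines_gen("x_train.txt", "y_train.txt"):
        print(label, text[:40])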
--------------------------------------------------------------------------------
/src/pequod/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/model/__init__.py
--------------------------------------------------------------------------------
/src/pequod/model/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/model/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/model/__pycache__/roberta.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/model/__pycache__/roberta.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/model/roberta.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from torch import nn
4 | from torch.nn import CrossEntropyLoss
5 | from transformers.modeling_bert import BertPreTrainedModel, BertForQuestionAnswering
6 | from transformers.modeling_roberta import RobertaModel
7 |
8 |
9 | class RobertaForQuestionAnswering(BertPreTrainedModel):
10 |
11 | base_model_prefix = "roberta"
12 | def __init__(self, config):
13 | BertPreTrainedModel.__init__(self, config)
14 | self.num_labels = config.num_labels
15 | self.roberta = RobertaModel(config)
16 | self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
17 | BertPreTrainedModel.init_weights(self)
18 |
19 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, start_positions=None, end_positions=None, **kwargs):
20 |
21 | outputs = self.roberta(input_ids,
22 | attention_mask=attention_mask,
23 | token_type_ids=token_type_ids,
24 | position_ids=position_ids,
25 | head_mask=head_mask,
26 | **kwargs)
27 |
28 | sequence_output = outputs[0]
29 |
30 | logits = self.qa_outputs(sequence_output)
31 | start_logits, end_logits = logits.split(1, dim=-1)
32 | start_logits = start_logits.squeeze(-1)
33 | end_logits = end_logits.squeeze(-1)
34 |
35 | outputs = (start_logits, end_logits,) + outputs[2:]
36 | if start_positions is not None and end_positions is not None:
37 | # If we are on multi-GPU, the split adds an extra dimension
38 | if len(start_positions.size()) > 1:
39 | start_positions = start_positions.squeeze(-1)
40 | if len(end_positions.size()) > 1:
41 | end_positions = end_positions.squeeze(-1)
42 | # sometimes the start/end positions are outside our model inputs, we ignore these terms
43 | ignored_index = start_logits.size(1)
44 | start_positions.clamp_(0, ignored_index)
45 | end_positions.clamp_(0, ignored_index)
46 |
47 | loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
48 | start_loss = loss_fct(start_logits, start_positions)
49 | end_loss = loss_fct(end_logits, end_positions)
50 | total_loss = (start_loss + end_loss) / 2
51 | outputs = (total_loss,) + outputs
52 |
53 | return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
--------------------------------------------------------------------------------
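
A forward-pass sketch for the QA head above, using a tiny randomly initialized RobertaConfig rather than a released checkpoint. The config sizes and import paths are illustrative assumptions, and the vendored transformers package under src/ is assumed to be importable as `transformers`.

    import torch
    from transformers import RobertaConfig
    from src.pequod.model.roberta import RobertaForQuestionAnswering  # import path assumed

    # Tiny config so the sketch runs quickly; sizes are illustrative only.
    config = RobertaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                           num_attention_heads=2, intermediate_size=64, num_labels=2)
    model = RobertaForQuestionAnswering(config)

    input_ids = torch.randint(3, 100, (1, 12))   # dummy token ids
    start_positions = torch.tensor([3])
    end_positions = torch.tensor([5])
    loss, start_logits, end_logits = model(input_ids,
                                           start_positions=start_positions,
                                           end_positions=end_positions)[:3]
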
/src/pequod/optim/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/optim/__init__.py
--------------------------------------------------------------------------------
/src/pequod/optim/la.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import torch
4 | from torch.optim.optimizer import Optimizer
5 |
6 |
7 | class LookaheadWrapper(Optimizer):
8 | r"""Implements a Lookahead wrapper around a given optimizer
9 | """
10 |
11 | def __init__(self, optimizer, la_steps, la_alpha=0.5):
12 | self.optimizer = optimizer
13 | self._la_step = 0 # counter for inner optimizer
14 | self.la_alpha = la_alpha
15 | self._total_la_steps = la_steps
16 |
17 | self.state = defaultdict(dict)
18 |
19 | # Cache the current optimizer parameters
20 | for group in optimizer.param_groups:
21 | for p in group['params']:
22 | param_state = self.state[p]
23 | param_state['cached_params'] = torch.zeros_like(p.data)
24 | param_state['cached_params'].copy_(p.data)
25 |
26 | def __getstate__(self):
27 | return self.optimizer.__getstate__()
28 |
29 | def __setstate__(self, state):
30 | self.optimizer.__setstate__(state)
31 |
32 | def zero_grad(self):
33 | self.optimizer.zero_grad()
34 |
35 | def state_dict(self):
36 | return self.optimizer.state_dict()
37 |
38 | def load_state_dict(self, state_dict):
39 | self.optimizer.load_state_dict(state_dict)
40 |
41 | @property
42 | def param_groups(self):
43 | return self.optimizer.param_groups
44 |
45 | def step(self, closure=None):
46 | """Performs a single Lookahead optimization step.
47 | Arguments:
48 | closure (callable, optional): A closure that reevaluates the model
49 | and returns the loss.
50 | """
51 | loss = self.optimizer.step(closure)
52 | self._la_step += 1
53 |
54 | if self._la_step >= self._total_la_steps:
55 | self._la_step = 0
56 | # Lookahead and cache the current optimizer parameters
57 | for group in self.optimizer.param_groups:
58 | for p in group['params']:
59 | param_state = self.state[p]
60 | p.data.mul_(self.la_alpha).add_(1 - self.la_alpha, param_state['cached_params'])
61 | param_state['cached_params'].copy_(p.data)
62 | return loss
63 |
--------------------------------------------------------------------------------
/src/pequod/optim/la0.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import torch
4 | from torch.optim.optimizer import Optimizer
5 |
6 |
7 | class Lookahead0Wrapper(Optimizer):
8 | r"""Implements a Lookahead wrapper around a given optimizer
9 | """
10 |
11 | def __init__(self, optimizer, la_steps, la_alpha=0.5):
12 | self.optimizer = optimizer
13 | self._la_step = 0 # counter for inner optimizer
14 | self.la_alpha = la_alpha
15 | self._total_la_steps = la_steps
16 |
17 | self.state = defaultdict(dict)
18 |
19 | # Cache the current optimizer parameters
20 | for group in optimizer.param_groups:
21 | for p in group['params']:
22 | param_state = self.state[p]
23 | param_state['cached_params'] = torch.zeros_like(p.data)
24 | param_state['cached_params'].copy_(p.data)
25 |
26 | def __getstate__(self):
27 | return self.optimizer.__getstate__()
28 |
29 | def __setstate__(self, state):
30 | self.optimizer.__setstate__(state)
31 |
32 | def zero_grad(self):
33 | self.optimizer.zero_grad()
34 |
35 | def state_dict(self):
36 | return self.optimizer.state_dict()
37 |
38 | def load_state_dict(self, state_dict):
39 | self.optimizer.load_state_dict(state_dict)
40 |
41 | @property
42 | def param_groups(self):
43 | return self.optimizer.param_groups
44 |
45 | def step(self, closure=None):
46 | """Performs a single Lookahead optimization step.
47 | Arguments:
48 | closure (callable, optional): A closure that reevaluates the model
49 | and returns the loss.
50 | """
51 | loss = self.optimizer.step(closure)
52 | self._la_step += 1
53 |
54 | if self._la_step >= self._total_la_steps:
55 | self._la_step = 0
56 | # Lookahead and cache the current optimizer parameters
57 | for group in self.optimizer.param_groups:
58 | for p in group['params']:
59 | param_state = self.state[p]
60 | p.data.mul_(self.la_alpha).add_(1 - self.la_alpha, param_state['cached_params'])
61 | # param_state['cached_params'].copy_(p.data)
62 | return loss
63 |
--------------------------------------------------------------------------------
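
la.py and la0.py implement the same slow-weights interpolation; the only difference is that la0.py leaves the cached parameters fixed instead of refreshing them after each interpolation (the commented-out copy_). A minimal usage sketch, assuming the older PyTorch this repo targets and an illustrative import path:

    import torch
    from torch import nn
    from src.pequod.optim.la import LookaheadWrapper  # import path assumed

    model = nn.Linear(16, 4)
    inner = torch.optim.Adam(model.parameters(), lr=1e-3)
    optimizer = LookaheadWrapper(inner, la_steps=5, la_alpha=0.5)

    for _ in range(20):
        optimizer.zero_grad()
        loss = model(torch.randn(8, 16)).pow(2).mean()
        loss.backward()
        optimizer.step()  # every la_steps updates, weights are pulled toward the cached copy
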
/src/pequod/text/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/text/__init__.py
--------------------------------------------------------------------------------
/src/pequod/text/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/text/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/text/__pycache__/tokenization_sentencepiece.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/text/__pycache__/tokenization_sentencepiece.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/text/tokenization_sentencepiece.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import sentencepiece as spm
4 | from transformers.tokenization_utils import PreTrainedTokenizer
5 |
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 |
10 | class XLMRTokenizer(PreTrainedTokenizer):
11 |
12 | def __init__(self, bpe_file, dict_file, **kwargs):
13 | super(XLMRTokenizer, self).__init__(
14 |             bos_token="<s>",
15 |             eos_token="</s>",
16 |             unk_token="<unk>",
17 |             pad_token="<pad>",
18 |             mask_token="<mask>",
19 |             sep_token="</s>",
20 |             cls_token="<s>",
21 | **kwargs)
22 |
23 | self.max_len_single_sentence = self.max_len - 2
24 | self.max_len_sentences_pair = self.max_len - 4
25 |
26 | self.sp = spm.SentencePieceProcessor()
27 | self.sp.Load(bpe_file)
28 |
29 | self.encoder = {}
30 | self.decoder = []
31 |
32 | for token in [self.bos_token, self.pad_token, self.eos_token, self.unk_token]:
33 | self._add_token(token)
34 |
35 | with open(dict_file, encoding="utf-8") as fp:
36 | for line in fp:
37 | # NOTE DO NOT USE .split()
38 | tokens_cnt = line.rstrip().split(" ")
39 | try:
40 | assert len(tokens_cnt) >= 2, line
41 | except AssertionError:
42 |                     logger.error(
43 |                         "tokenizer line %s asserterror, replaced as <unk-%d>" % (
44 |                             line, len(self.decoder)))
45 | exit(0)
46 | self._add_token(" ".join(tokens_cnt[:-1]))
47 |
48 | def _add_token(self, token):
49 | idx = len(self.encoder)
50 | self.encoder[token] = idx
51 | self.decoder.append(token)
52 |
53 | def _tokenize(self, text):
54 | return self.sp.EncodeAsPieces(text)
55 |
56 | def _convert_id_to_token(self, index):
57 | return self.decoder[index]
58 |
59 | def _convert_token_to_id(self, token):
60 | return self.encoder.get(token, self.encoder.get(self.unk_token))
61 |
62 | def convert_tokens_to_string(self, tokens):
63 | return "".join(tokens).replace('\u2581', ' ').strip()
64 |
65 | @classmethod
66 | def from_pretrained(cls, model_path, **kwargs):
67 | bpe_file = os.path.join(model_path, "sentencepiece.bpe.model")
68 | dict_file = os.path.join(model_path, "dict.txt")
69 | tokenizer = cls(bpe_file, dict_file)
70 | return tokenizer
71 |
72 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
73 | if token_ids_1 is None:
74 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
75 | cls = [self.cls_token_id]
76 | sep = [self.sep_token_id]
77 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep
78 |
79 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
80 | if already_has_special_tokens:
81 | if token_ids_1 is not None:
82 |                 raise ValueError("You should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.")
83 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
84 |
85 | if token_ids_1 is None:
86 | return [1] + ([0] * len(token_ids_0)) + [1]
87 | return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
88 |
89 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
90 | sep = [self.sep_token_id]
91 | cls = [self.cls_token_id]
92 |
93 | if token_ids_1 is None:
94 | return len(cls + token_ids_0 + sep) * [0]
95 | return len(cls + token_ids_0 + sep) * [0] + len(sep + token_ids_1 + sep) * [1]
96 |
97 |
98 | if __name__ == "__main__":
99 | tokenizer = XLMRTokenizer.from_pretrained("/home/v-zechi/data/unilm/zechi/exp/bert_data/xlmr-large")
100 |
101 | for text in ["Hello world!", "你好,世界", "नमस्ते दुनिया", "مرحبا بالعالم", "Bonjour le monde"]:
102 | print(tokenizer.tokenize(text))
103 | print(tokenizer.encode_plus(text, text, add_special_tokens=True))
104 |
--------------------------------------------------------------------------------
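
A loading sketch for the tokenizer above, assuming a local XLM-R directory that contains the two files from_pretrained expects (sentencepiece.bpe.model and dict.txt); the path and import path below are placeholders.

    from src.pequod.text.tokenization_sentencepiece import XLMRTokenizer  # import path assumed

    tokenizer = XLMRTokenizer.from_pretrained("/path/to/xlmr-large")  # placeholder path
    ids = tokenizer.encode("Hello world!", add_special_tokens=True)
    # Sequence pairs follow the RoBERTa/XLM-R layout: <s> a </s> </s> b </s>
    pair_ids = tokenizer.build_inputs_with_special_tokens([10, 11], [12, 13])
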
/src/pequod/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/tools/__init__.py
--------------------------------------------------------------------------------
/src/pequod/tools/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/tools/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/tools/__pycache__/convert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/tools/__pycache__/convert.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/training/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 | import os
4 | import random
5 | import torch
6 | import pickle
7 | import logging
8 | import numpy as np
9 |
10 | # from transformers import (WEIGHTS_NAME,
11 | # BertConfig, BertForSequenceClassification, BertTokenizer,
12 | # RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
13 | # RobertaModel, BertModel, XLMModel,
14 | # XLMConfig, XLMForSequenceClassification, XLMTokenizer,
15 | # XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
16 | # DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer,
17 | # BertForQuestionAnswering)
18 | #
19 | # from src.pequod.model.roberta import RobertaForQuestionAnswering
20 | from transformers import XLMRobertaConfig, XLMRobertaForRetrieval, XLMRobertaTokenizer
21 |
22 | # ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
23 | # for conf in (BertConfig, XLNetConfig, XLMConfig,
24 | # RobertaConfig, DistilBertConfig)), ())
25 |
26 | ALL_MODELS = []
27 |
28 | # # Model classes for classification
29 | # MODEL_CLASSES = {
30 | # 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
31 | # 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
32 | # 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
33 | # 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
34 | # 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
35 | # "xlmr": (RobertaConfig, RobertaForSequenceClassification, XLMRTokenizer)
36 | # }
37 | #
38 | # QA_MODELS = {
39 | # "bert": BertForQuestionAnswering,
40 | # "roberta": RobertaForQuestionAnswering,
41 | # "xlmr": RobertaForQuestionAnswering,
42 | # }
43 |
44 | BERT_CLASSES = {
45 | "xlmr": (XLMRobertaConfig, XLMRobertaForRetrieval, XLMRobertaTokenizer),
46 | }
47 |
48 |
49 | def to_cuda(tup):
50 | return tuple(t.cuda() for t in tup)
51 |
52 |
53 | def set_seed(args):
54 | random.seed(args.seed)
55 | np.random.seed(args.seed)
56 | torch.manual_seed(args.seed)
57 | #TODO multi gpu support
58 | # if args.n_gpu > 0:
59 | # torch.cuda.manual_seed_all(args.seed)
60 |
61 |
62 | def init_exp(args):
63 | # dump parameters
64 | set_dump_path(args)
65 | pickle.dump(args, open(os.path.join(args.dump_path, 'params.pkl'), 'wb'))
66 |
67 | # get running command
68 | command = ["python", sys.argv[0]]
69 | for x in sys.argv[1:]:
70 | if x.startswith('--'):
71 | assert '"' not in x and "'" not in x
72 | command.append(x)
73 | else:
74 | assert "'" not in x
75 | if re.match('^[a-zA-Z0-9_]+$', x):
76 | command.append("%s" % x)
77 | else:
78 | command.append("'%s'" % x)
79 | command = ' '.join(command)
80 | args.command = command + ' --exp_id "%s"' % args.exp_id
81 |
82 | # check experiment name
83 | assert len(args.exp_name.strip()) > 0
84 |
85 | logging.basicConfig(
86 | format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
87 | datefmt = '%m/%d/%Y %H:%M:%S',
88 | level = logging.INFO)
89 | logger = logging.getLogger(__name__)
90 | logger.info("\n".join(
91 | "%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
92 | logger.info("The experiment will be stored in %s\n" % args.dump_path)
93 | logger.info("Running command: %s" % command)
94 | logger.info("")
95 |
96 |
97 | def set_dump_path(args, output_dir=None, exp_name=None):
98 | if output_dir is None: output_dir = args.output_dir
99 | if exp_name is None: exp_name = args.exp_name
100 | chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
101 | while True:
102 | exp_id = ''.join(random.choice(chars) for _ in range(10))
103 | if not os.path.isdir(os.path.join(output_dir, exp_name, exp_id)):
104 | break
105 | args.exp_id = exp_id
106 | dump_path = os.path.join(output_dir, exp_name, exp_id)
107 | os.makedirs(dump_path)
108 | args.dump_path = dump_path
109 |
--------------------------------------------------------------------------------
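
A sketch of the experiment bootstrap above, showing the attributes init_exp and set_seed expect on args. The values are placeholders, the import path is assumed, and the vendored transformers under src/ must be importable (it provides XLMRobertaForRetrieval).

    import argparse
    from src.pequod.training import init_exp, set_seed  # import path assumed

    args = argparse.Namespace(output_dir="/tmp/xtune_exp", exp_name="debug", seed=42)
    init_exp(args)   # creates <output_dir>/<exp_name>/<random exp_id>/ and dumps params.pkl
    set_seed(args)
    print(args.dump_path, args.exp_id)
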
/src/pequod/training/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/training/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/pequod/training/__pycache__/trainer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/pequod/training/__pycache__/trainer.cpython-37.pyc
--------------------------------------------------------------------------------
/src/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/tools/__init__.py
--------------------------------------------------------------------------------
/src/tools/check_many2many_alignment.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | if __name__ == "__main__":
4 | parser = argparse.ArgumentParser()
5 |
6 | # Required parameters
7 | parser.add_argument(
8 | "--translation_path",
9 | default=None,
10 | type=str,
11 | required=True,
12 | help="",
13 | )
14 |
15 | drop_languages = ["en", "zh-CN", "zh", "ja", "ko", "th", "my", "ml", "ta"]
16 | translate_languages = None
17 | args = parser.parse_args()
18 | src2tgt = {}
19 | print("Reading translation from {}".format(args.translation_path))
20 | with open(args.translation_path, encoding="utf-8") as f:
21 | cnt = 0
22 | for line in f:
23 | cnt += 1
24 | if cnt % 10000 == 0:
25 | print("Reading lines {}".format(cnt))
26 | items = line.split("\t")
27 |
28 |             if len(items) == 3:
29 | src_sent, tgt_lang, tgt_sent = line.split("\t")
30 | alignment = None
31 | else:
32 | src_sent, tgt_lang, tgt_sent, alignment_str = line.split("\t")
33 | alignment = []
34 | for x in alignment_str.split(" "):
35 | alignment.append((int(x.split("/")[0]), int(x.split("/")[1])))
36 |
37 | if tgt_lang in drop_languages:
38 | continue
39 | if translate_languages is not None and tgt_lang not in translate_languages:
40 | continue
41 |
42 | cnt_src = {}
43 | cnt_tgt = {}
44 |             for x in (alignment or []):  # lines without alignment info are skipped
45 |
46 | if x[0] not in cnt_src:
47 | cnt_src[x[0]] = 0
48 | cnt_src[x[0]] += 1
49 |
50 | if x[1] not in cnt_tgt:
51 | cnt_tgt[x[1]] = 0
52 | cnt_tgt[x[1]] += 1
53 |
54 | if not (cnt_src[x[0]] <= 1 or cnt_tgt[x[1]] <= 1):
55 | print(cnt_src, cnt_tgt)
56 | print(alignment)
57 | print(src_sent, tgt_sent)
58 |
59 | assert cnt_src[x[0]] <= 1 or cnt_tgt[x[1]] <= 1
60 |
61 |
62 |
--------------------------------------------------------------------------------
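
The script expects tab-separated lines with three or four fields; the optional fourth field is a space-separated list of src/tgt token index pairs. A small parsing sketch over a made-up line:

    # Made-up example line: source sentence, target language, target sentence, alignments.
    line = "a small example\tde\tein kleines Beispiel\t0/0 1/1 2/2"
    src_sent, tgt_lang, tgt_sent, alignment_str = line.split("\t")
    alignment = [(int(p.split("/")[0]), int(p.split("/")[1])) for p in alignment_str.split(" ")]
    print(alignment)  # [(0, 0), (1, 1), (2, 2)]
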
/src/tools/sample_xnli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import random
4 |
5 | if __name__ == "__main__":
6 | parser = argparse.ArgumentParser()
7 |
8 | # Required parameters
9 | parser.add_argument(
10 | "--input_path",
11 | default=None,
12 | type=str,
13 | required=True,
14 | help="input xnli file",
15 | )
16 | parser.add_argument(
17 | "--output_path",
18 | default=None,
19 | type=str,
20 | required=True,
21 | help="output xnli file",
22 | )
23 | parser.add_argument(
24 | "--sample_ratio",
25 | default=None,
26 | type=float,
27 | required=True,
28 | help="sample ratio",
29 | )
30 |
31 | args = parser.parse_args()
32 | lines = open(args.input_path, "r").readlines()
33 | head = lines[0]
34 | lines = lines[1:]
35 | random.seed(0)
36 | random.shuffle(lines)
37 |
38 | n_lines = int(len(lines) * args.sample_ratio)
39 |
40 | fout = open(args.output_path, "w")
41 | fout.write(head)
42 |     for i, line in enumerate(lines[:n_lines]):
43 |         fout.write(line)
44 |     fout.close()
--------------------------------------------------------------------------------
/src/transformers/activations.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | import torch.nn.functional as F
5 |
6 |
7 | def swish(x):
8 | return x * torch.sigmoid(x)
9 |
10 |
11 | def _gelu_python(x):
12 | """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
13 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
14 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
15 | This is now written in C in torch.nn.functional
16 | Also see https://arxiv.org/abs/1606.08415
17 | """
18 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
19 |
20 |
21 | if torch.__version__ < "1.4.0":
22 | gelu = _gelu_python
23 | else:
24 | gelu = F.gelu
25 |
26 |
27 | def gelu_new(x):
28 | """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
29 | Also see https://arxiv.org/abs/1606.08415
30 | """
31 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
32 |
33 |
34 | ACT2FN = {
35 | "relu": F.relu,
36 | "swish": swish,
37 | "gelu": gelu,
38 | "tanh": F.tanh,
39 | "gelu_new": gelu_new,
40 | }
41 |
42 |
43 | def get_activation(activation_string):
44 | if activation_string in ACT2FN:
45 | return ACT2FN[activation_string]
46 | else:
47 | raise KeyError(
48 | "function {} not found in ACT2FN mapping {} or torch.nn.functional".format(
49 | activation_string, list(ACT2FN.keys())
50 | )
51 | )
52 |
--------------------------------------------------------------------------------
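
A short usage sketch of the activation registry defined above:

    import torch
    from transformers.activations import get_activation  # vendored module above

    act = get_activation("gelu_new")
    x = torch.linspace(-2.0, 2.0, steps=5)
    print(act(x))  # element-wise gelu_new
    # get_activation("mish") would raise KeyError, since only the ACT2FN keys are supported.
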
/src/transformers/commands/__init__.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from argparse import ArgumentParser
3 |
4 |
5 | class BaseTransformersCLICommand(ABC):
6 | @staticmethod
7 | @abstractmethod
8 | def register_subcommand(parser: ArgumentParser):
9 | raise NotImplementedError()
10 |
11 | @abstractmethod
12 | def run(self):
13 | raise NotImplementedError()
14 |
--------------------------------------------------------------------------------
/src/transformers/commands/download.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 |
3 | from transformers.commands import BaseTransformersCLICommand
4 |
5 |
6 | def download_command_factory(args):
7 | return DownloadCommand(args.model, args.cache_dir, args.force)
8 |
9 |
10 | class DownloadCommand(BaseTransformersCLICommand):
11 | @staticmethod
12 | def register_subcommand(parser: ArgumentParser):
13 | download_parser = parser.add_parser("download")
14 | download_parser.add_argument(
15 | "--cache-dir", type=str, default=None, help="Path to location to store the models"
16 | )
17 | download_parser.add_argument(
18 |             "--force", action="store_true", help="Force the model to be downloaded even if it is already in cache-dir"
19 | )
20 | download_parser.add_argument("model", type=str, help="Name of the model to download")
21 | download_parser.set_defaults(func=download_command_factory)
22 |
23 | def __init__(self, model: str, cache: str, force: bool):
24 | self._model = model
25 | self._cache = cache
26 | self._force = force
27 |
28 | def run(self):
29 | from transformers import AutoModel, AutoTokenizer
30 |
31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
33 |
--------------------------------------------------------------------------------
/src/transformers/commands/env.py:
--------------------------------------------------------------------------------
1 | import platform
2 | from argparse import ArgumentParser
3 |
4 | from transformers import __version__ as version
5 | from transformers import is_tf_available, is_torch_available
6 | from transformers.commands import BaseTransformersCLICommand
7 |
8 |
9 | def info_command_factory(_):
10 | return EnvironmentCommand()
11 |
12 |
13 | class EnvironmentCommand(BaseTransformersCLICommand):
14 | @staticmethod
15 | def register_subcommand(parser: ArgumentParser):
16 | download_parser = parser.add_parser("env")
17 | download_parser.set_defaults(func=info_command_factory)
18 |
19 | def run(self):
20 | pt_version = "not installed"
21 | pt_cuda_available = "NA"
22 | if is_torch_available():
23 | import torch
24 |
25 | pt_version = torch.__version__
26 | pt_cuda_available = torch.cuda.is_available()
27 |
28 | tf_version = "not installed"
29 | tf_cuda_available = "NA"
30 | if is_tf_available():
31 | import tensorflow as tf
32 |
33 | tf_version = tf.__version__
34 | try:
35 | # deprecated in v2.1
36 | tf_cuda_available = tf.test.is_gpu_available()
37 | except AttributeError:
38 | # returns list of devices, convert to bool
39 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU"))
40 |
41 | info = {
42 | "`transformers` version": version,
43 | "Platform": platform.platform(),
44 | "Python version": platform.python_version(),
45 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available),
46 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available),
47 | "Using GPU in script?": "",
48 | "Using distributed or parallel set-up in script?": "",
49 | }
50 |
51 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
52 | print(self.format_dict(info))
53 |
54 | return info
55 |
56 | @staticmethod
57 | def format_dict(d):
58 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n"
59 |
--------------------------------------------------------------------------------
/src/transformers/commands/run.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from argparse import ArgumentParser
3 |
4 | from transformers.commands import BaseTransformersCLICommand
5 | from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline
6 |
7 |
8 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name
9 |
10 |
11 | def try_infer_format_from_ext(path: str):
12 | if not path:
13 | return "pipe"
14 |
15 | for ext in PipelineDataFormat.SUPPORTED_FORMATS:
16 | if path.endswith(ext):
17 | return ext
18 |
19 | raise Exception(
20 | "Unable to determine file format from file extension {}. "
21 | "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS)
22 | )
23 |
24 |
25 | def run_command_factory(args):
26 | nlp = pipeline(
27 | task=args.task,
28 | model=args.model if args.model else None,
29 | config=args.config,
30 | tokenizer=args.tokenizer,
31 | device=args.device,
32 | )
33 | format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format
34 | reader = PipelineDataFormat.from_str(
35 | format=format,
36 | output_path=args.output,
37 | input_path=args.input,
38 | column=args.column if args.column else nlp.default_input_names,
39 | overwrite=args.overwrite,
40 | )
41 | return RunCommand(nlp, reader)
42 |
43 |
44 | class RunCommand(BaseTransformersCLICommand):
45 | def __init__(self, nlp: Pipeline, reader: PipelineDataFormat):
46 | self._nlp = nlp
47 | self._reader = reader
48 |
49 | @staticmethod
50 | def register_subcommand(parser: ArgumentParser):
51 | run_parser = parser.add_parser("run", help="Run a pipeline through the CLI")
52 | run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run")
53 | run_parser.add_argument("--input", type=str, help="Path to the file to use for inference")
54 |         run_parser.add_argument("--output", type=str, help="Path to the file where results will be written.")
55 | run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.")
56 | run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.")
57 | run_parser.add_argument(
58 | "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)"
59 | )
60 | run_parser.add_argument(
61 | "--column",
62 | type=str,
63 | help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)",
64 | )
65 | run_parser.add_argument(
66 | "--format",
67 | type=str,
68 | default="infer",
69 | choices=PipelineDataFormat.SUPPORTED_FORMATS,
70 | help="Input format to read from",
71 | )
72 | run_parser.add_argument(
73 | "--device",
74 | type=int,
75 | default=-1,
76 | help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
77 | )
78 | run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.")
79 | run_parser.set_defaults(func=run_command_factory)
80 |
81 | def run(self):
82 | nlp, outputs = self._nlp, []
83 |
84 | for entry in self._reader:
85 | output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry)
86 | if isinstance(output, dict):
87 | outputs.append(output)
88 | else:
89 | outputs += output
90 |
91 | # Saving data
92 | if self._nlp.binary_output:
93 | binary_path = self._reader.save_binary(outputs)
94 | logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path))
95 | else:
96 | self._reader.save(outputs)
97 |
--------------------------------------------------------------------------------
/src/transformers/configuration_bart.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ BART configuration """
16 |
17 |
18 | import logging
19 |
20 | from .configuration_utils import PretrainedConfig
21 |
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 | BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
26 | "bart-large": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large/config.json",
27 | "bart-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/config.json",
28 | "bart-large-cnn": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json",
29 | }
30 |
31 |
32 | class BartConfig(PretrainedConfig):
33 | r"""
34 | Configuration class for Bart. Parameters are renamed from the fairseq implementation
35 | """
36 | model_type = "bart"
37 | pretrained_config_archive_map = BART_PRETRAINED_CONFIG_ARCHIVE_MAP
38 |
39 | def __init__(
40 | self,
41 | activation_dropout=0.0,
42 | vocab_size=50265,
43 | pad_token_id=1,
44 | eos_token_id=2,
45 | d_model=1024,
46 | encoder_ffn_dim=4096,
47 | encoder_layers=12,
48 | encoder_attention_heads=16,
49 | decoder_ffn_dim=4096,
50 | decoder_layers=12,
51 | decoder_attention_heads=16,
52 | encoder_layerdrop=0.0,
53 | decoder_layerdrop=0.0,
54 | attention_dropout=0.0,
55 | dropout=0.1,
56 | max_position_embeddings=1024,
57 | init_std=0.02,
58 | classifier_dropout=0.0,
59 | output_past=False,
60 | num_labels=3,
61 | bos_token_id=0,
62 | **common_kwargs
63 | ):
64 | r"""
65 | :class:`~transformers.BartConfig` is the configuration class for `BartModel`.
66 | Examples:
67 | config = BartConfig.from_pretrained('bart-large')
68 | model = BartModel(config)
69 | """
70 | super().__init__(
71 | num_labels=num_labels,
72 | output_past=output_past,
73 | pad_token_id=pad_token_id,
74 | bos_token_id=bos_token_id,
75 | **common_kwargs,
76 | )
77 | self.vocab_size = vocab_size
78 | self.d_model = d_model # encoder_embed_dim and decoder_embed_dim
79 | self.eos_token_id = eos_token_id
80 | self.encoder_ffn_dim = encoder_ffn_dim
81 | self.encoder_layers = self.num_hidden_layers = encoder_layers
82 | self.encoder_attention_heads = encoder_attention_heads
83 | self.encoder_layerdrop = encoder_layerdrop
84 | self.decoder_layerdrop = decoder_layerdrop
85 | self.decoder_ffn_dim = decoder_ffn_dim
86 | self.decoder_layers = decoder_layers
87 | self.decoder_attention_heads = decoder_attention_heads
88 | self.max_position_embeddings = max_position_embeddings
89 | self.init_std = init_std # Normal(0, this parameter)
90 |
91 | # 3 Types of Dropout
92 | self.attention_dropout = attention_dropout
93 | self.activation_dropout = activation_dropout
94 | self.dropout = dropout
95 |
96 | # Classifier stuff
97 | self.classif_dropout = classifier_dropout
98 |
99 | @property
100 | def num_attention_heads(self):
101 | return self.encoder_attention_heads
102 |
103 | @property
104 | def hidden_size(self):
105 | return self.d_model
106 |
--------------------------------------------------------------------------------
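
A construction sketch for the config class above; the overridden sizes are illustrative, and the hidden_size / num_attention_heads properties simply alias d_model / encoder_attention_heads.

    from transformers.configuration_bart import BartConfig  # vendored module above

    config = BartConfig(d_model=256, encoder_layers=3, decoder_layers=3,
                        encoder_attention_heads=4, decoder_attention_heads=4)
    print(config.hidden_size)          # 256 (alias of d_model)
    print(config.num_attention_heads)  # 4   (alias of encoder_attention_heads)
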
/src/transformers/configuration_camembert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ CamemBERT configuration """
17 |
18 |
19 | import logging
20 |
21 | from .configuration_roberta import RobertaConfig
22 |
23 |
24 | logger = logging.getLogger(__name__)
25 |
26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json",
29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json",
30 | }
31 |
32 |
33 | class CamembertConfig(RobertaConfig):
34 | """
35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the
36 | superclass for the appropriate documentation alongside usage examples.
37 | """
38 |
39 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
40 | model_type = "camembert"
41 |
--------------------------------------------------------------------------------
/src/transformers/configuration_mmbt.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # Copyright (c) HuggingFace Inc. team.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ MMBT configuration """
17 |
18 |
19 | import logging
20 |
21 |
22 | logger = logging.getLogger(__name__)
23 |
24 |
25 | class MMBTConfig(object):
26 | """Configuration class to store the configuration of a `MMBT Model`.
27 |
28 | Args:
29 | config (:obj:`~transformers.PreTrainedConfig`):
30 | Config of the underlying Transformer models. Its values are
31 | copied over to use a single config.
32 | num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
33 | Size of final Linear layer for classification.
34 |         modal_hidden_size (:obj:`int`, optional, defaults to 2048):
35 | Embedding dimension of the non-text modality encoder.
36 | """
37 |
38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048):
39 | self.__dict__ = config.__dict__
40 | self.modal_hidden_size = modal_hidden_size
41 | if num_labels:
42 | self.num_labels = num_labels
43 |
--------------------------------------------------------------------------------
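
A sketch of how MMBTConfig wraps a text-model config: it shares the underlying config's __dict__, so attributes like hidden_size are read straight from the wrapped config. The sizes below are illustrative.

    from transformers import BertConfig
    from transformers.configuration_mmbt import MMBTConfig  # vendored module above

    text_config = BertConfig(hidden_size=256, num_hidden_layers=4,
                             num_attention_heads=4, intermediate_size=512)
    mmbt_config = MMBTConfig(text_config, num_labels=2, modal_hidden_size=2048)
    print(mmbt_config.hidden_size, mmbt_config.num_labels, mmbt_config.modal_hidden_size)
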
/src/transformers/configuration_roberta.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ RoBERTa configuration """
17 |
18 |
19 | import logging
20 |
21 | from .configuration_bert import BertConfig
22 |
23 |
24 | logger = logging.getLogger(__name__)
25 |
26 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
27 | "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
28 | "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
29 | "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
30 | "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
31 | "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json",
32 | "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json",
33 | }
34 |
35 |
36 | class RobertaConfig(BertConfig):
37 | r"""
38 | This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`.
39 |     It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
40 |     architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
41 |     the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
42 |
43 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
44 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
45 | for more information.
46 |
47 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
48 | It reuses the same defaults. Please check the parent class for more information.
49 |
50 | Example::
51 |
52 | from transformers import RobertaConfig, RobertaModel
53 |
54 | # Initializing a RoBERTa configuration
55 | configuration = RobertaConfig()
56 |
57 | # Initializing a model from the configuration
58 | model = RobertaModel(configuration)
59 |
60 | # Accessing the model configuration
61 | configuration = model.config
62 |
63 | Attributes:
64 | pretrained_config_archive_map (Dict[str, str]):
65 | A dictionary containing all the available pre-trained checkpoints.
66 | """
67 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
68 | model_type = "roberta"
69 |
--------------------------------------------------------------------------------
/src/transformers/configuration_t5.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2010, The T5 Authors and HuggingFace Inc.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ T5 model configuration """
16 |
17 |
18 | import logging
19 |
20 | from .configuration_utils import PretrainedConfig
21 |
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 | T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
26 | "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
27 | "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
28 | "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
29 | "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
30 | "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
31 | }
32 |
33 |
34 | class T5Config(PretrainedConfig):
35 | r"""
36 | :class:`~transformers.T5Config` is the configuration class to store the configuration of a
37 | `T5Model`.
38 |
39 |
40 | Arguments:
41 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
42 | hidden_size: Size of the encoder layers and the pooler layer.
43 | num_hidden_layers: Number of hidden layers in the Transformer encoder.
44 | num_attention_heads: Number of attention heads for each attention layer in
45 | the Transformer encoder.
46 | intermediate_size: The size of the "intermediate" (i.e., feed-forward)
47 | layer in the Transformer encoder.
48 | hidden_act: The non-linear activation function (function or string) in the
49 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
50 |         hidden_dropout_prob: The dropout probability for all fully connected
51 | layers in the embeddings, encoder, and pooler.
52 | attention_probs_dropout_prob: The dropout ratio for the attention
53 | probabilities.
54 | max_position_embeddings: The maximum sequence length that this model might
55 | ever be used with. Typically set this to something large just in case
56 | (e.g., 512 or 1024 or 2048).
57 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into
58 | `T5Model`.
59 | initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
60 | layer_norm_eps: The epsilon used by LayerNorm.
61 | """
62 | pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
63 | model_type = "t5"
64 |
65 | def __init__(
66 | self,
67 | vocab_size=32128,
68 | n_positions=512,
69 | d_model=512,
70 | d_kv=64,
71 | d_ff=2048,
72 | num_layers=6,
73 | num_heads=8,
74 | relative_attention_num_buckets=32,
75 | dropout_rate=0.1,
76 | layer_norm_epsilon=1e-6,
77 | initializer_factor=1.0,
78 | **kwargs
79 | ):
80 | super().__init__(**kwargs)
81 | self.vocab_size = vocab_size
82 | self.n_positions = n_positions
83 | self.d_model = d_model
84 | self.d_kv = d_kv
85 | self.d_ff = d_ff
86 | self.num_layers = num_layers
87 | self.num_heads = num_heads
88 | self.relative_attention_num_buckets = relative_attention_num_buckets
89 | self.dropout_rate = dropout_rate
90 | self.layer_norm_epsilon = layer_norm_epsilon
91 | self.initializer_factor = initializer_factor
92 |
93 | @property
94 | def max_position_embeddings(self):
95 | return self.n_positions
96 |
97 | @property
98 | def hidden_size(self):
99 | return self.d_model
100 |
101 | @property
102 | def num_attention_heads(self):
103 | return self.num_heads
104 |
105 | @property
106 | def num_hidden_layers(self):
107 | return self.num_layers
108 |
--------------------------------------------------------------------------------
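
A sketch showing that T5Config exposes BERT-style names as read-only aliases of the T5 hyper-parameters; the sizes are illustrative.

    from transformers.configuration_t5 import T5Config  # vendored module above

    config = T5Config(d_model=256, num_layers=4, num_heads=4)
    print(config.hidden_size)              # 256, alias of d_model
    print(config.num_hidden_layers)        # 4,   alias of num_layers
    print(config.max_position_embeddings)  # 512, alias of n_positions (default)
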
/src/transformers/configuration_xlm_roberta.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ XLM-RoBERTa configuration """
17 |
18 |
19 | import logging
20 |
21 | from .configuration_roberta import RobertaConfig
22 |
23 |
24 | logger = logging.getLogger(__name__)
25 |
26 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
27 | "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json",
28 | "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json",
29 | "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json",
30 | "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json",
31 | "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json",
32 | "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json",
33 | }
34 |
35 |
36 | class XLMRobertaConfig(RobertaConfig):
37 | """
38 | This class overrides :class:`~transformers.RobertaConfig`. Please check the
39 | superclass for the appropriate documentation alongside usage examples.
40 | """
41 |
42 | pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
43 | model_type = "xlm-roberta"
44 |
--------------------------------------------------------------------------------
/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert ALBERT checkpoint."""
16 |
17 |
18 | import argparse
19 | import logging
20 |
21 | import torch
22 |
23 | from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
24 |
25 |
26 | logging.basicConfig(level=logging.INFO)
27 |
28 |
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
30 | # Initialise PyTorch model
31 | config = AlbertConfig.from_json_file(albert_config_file)
32 | print("Building PyTorch model from configuration: {}".format(str(config)))
33 | model = AlbertForMaskedLM(config)
34 |
35 | # Load weights from tf checkpoint
36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path)
37 |
38 | # Save pytorch-model
39 | print("Save PyTorch model to {}".format(pytorch_dump_path))
40 | torch.save(model.state_dict(), pytorch_dump_path)
41 |
42 |
43 | if __name__ == "__main__":
44 | parser = argparse.ArgumentParser()
45 | # Required parameters
46 | parser.add_argument(
47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
48 | )
49 | parser.add_argument(
50 | "--albert_config_file",
51 | default=None,
52 | type=str,
53 | required=True,
54 | help="The config json file corresponding to the pre-trained ALBERT model. \n"
55 | "This specifies the model architecture.",
56 | )
57 | parser.add_argument(
58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
59 | )
60 | args = parser.parse_args()
61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path)
62 |
--------------------------------------------------------------------------------
/src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert BART checkpoint."""
16 |
17 |
18 | import argparse
19 | import logging
20 | from pathlib import Path
21 |
22 | import fairseq
23 | import torch
24 | from packaging import version
25 |
26 | from transformers import BartConfig, BartForMaskedLM, BartForSequenceClassification, BartModel, BartTokenizer
27 |
28 |
29 | FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn"]
30 |
31 | if version.parse(fairseq.__version__) < version.parse("0.9.0"):
32 | raise Exception("requires fairseq >= 0.9.0")
33 |
34 |
35 | logging.basicConfig(level=logging.INFO)
36 | logger = logging.getLogger(__name__)
37 |
38 | SAMPLE_TEXT = " Hello world! cécé herlolip"
39 |
40 | rename_keys = [
41 | ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"),
42 | ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"),
43 | ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"),
44 | ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"),
45 | ]
46 | IGNORE_KEYS = ["encoder.version", "decoder.version", "model.encoder.version", "model.decoder.version", "_float_tensor"]
47 |
48 |
49 | def rename_key(dct, old, new):
50 | val = dct.pop(old)
51 | dct[new] = val
52 |
53 |
54 | def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
55 | """
56 | Copy/paste/tweak model's weights to our BERT structure.
57 | """
58 | bart = torch.hub.load("pytorch/fairseq", checkpoint_path)
59 | bart.eval() # disable dropout
60 | bart.model.upgrade_state_dict(bart.model.state_dict())
61 | hf_model_name = checkpoint_path.replace(".", "-")
62 | config = BartConfig.from_pretrained(hf_model_name)
63 | tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
64 | tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
65 | assert torch.eq(tokens, tokens2).all()
66 |
67 | if checkpoint_path in ["bart.large", "bart.large.cnn"]:
68 | state_dict = bart.model.state_dict()
69 | for k in IGNORE_KEYS:
70 | state_dict.pop(k, None)
71 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
72 | model = BartModel(config)
73 | their_output = bart.extract_features(tokens)
74 | else: # MNLI Case
75 | state_dict = bart.state_dict()
76 | for k in IGNORE_KEYS:
77 | state_dict.pop(k, None)
78 | state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
79 | for src, dest in rename_keys:
80 | rename_key(state_dict, src, dest)
81 | model = BartForSequenceClassification(config)
82 | their_output = bart.predict("mnli", tokens, return_logits=True)
83 |
84 | # Load state dict
85 | model.load_state_dict(state_dict)
86 | model.eval()
87 | # Check results
88 |
89 |     if checkpoint_path == "bart.large.cnn":  # generate doesn't work yet
90 | model = BartForMaskedLM(config, base_model=model)
91 | assert "lm_head.weight" in model.state_dict()
92 | assert model.lm_head.out_features == config.max_position_embeddings
93 | model.eval()
94 | our_outputs = model.model.forward(tokens)[0]
95 | else:
96 | our_outputs = model.forward(tokens)[0]
97 | assert their_output.shape == our_outputs.shape
98 | assert (their_output == our_outputs).all().item()
99 | Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
100 | model.save_pretrained(pytorch_dump_folder_path)
101 |
102 |
103 | if __name__ == "__main__":
104 | parser = argparse.ArgumentParser()
105 | # Required parameters
106 | parser.add_argument("fairseq_path", choices=FAIRSEQ_MODELS, type=str, help="")
107 |
108 | parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
109 | args = parser.parse_args()
110 | convert_bart_checkpoint(
111 | args.fairseq_path, args.pytorch_dump_folder_path,
112 | )
113 |
--------------------------------------------------------------------------------
/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert BERT checkpoint."""
16 |
17 |
18 | import argparse
19 | import logging
20 |
21 | import torch
22 |
23 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
24 |
25 |
26 | logging.basicConfig(level=logging.INFO)
27 |
28 |
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
30 | # Initialise PyTorch model
31 | config = BertConfig.from_json_file(bert_config_file)
32 | print("Building PyTorch model from configuration: {}".format(str(config)))
33 | model = BertForPreTraining(config)
34 |
35 | # Load weights from tf checkpoint
36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path)
37 |
38 | # Save pytorch-model
39 | print("Save PyTorch model to {}".format(pytorch_dump_path))
40 | torch.save(model.state_dict(), pytorch_dump_path)
41 |
42 |
43 | if __name__ == "__main__":
44 | parser = argparse.ArgumentParser()
45 | # Required parameters
46 | parser.add_argument(
47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
48 | )
49 | parser.add_argument(
50 | "--bert_config_file",
51 | default=None,
52 | type=str,
53 | required=True,
54 | help="The config json file corresponding to the pre-trained BERT model. \n"
55 | "This specifies the model architecture.",
56 | )
57 | parser.add_argument(
58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
59 | )
60 | args = parser.parse_args()
61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path)
62 |
--------------------------------------------------------------------------------
/src/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint."""
17 |
18 | import argparse
19 | import os
20 |
21 | import numpy as np
22 | import tensorflow as tf
23 | import torch
24 |
25 | from transformers import BertModel
26 |
27 |
28 | def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str):
29 |
30 | """
31 | :param model:BertModel Pytorch model instance to be converted
32 | :param ckpt_dir: Tensorflow model directory
33 | :param model_name: model name
34 | :return:
35 |
36 | Currently supported HF models:
37 | Y BertModel
38 | N BertForMaskedLM
39 | N BertForPreTraining
40 | N BertForMultipleChoice
41 | N BertForNextSentencePrediction
42 | N BertForSequenceClassification
43 | N BertForQuestionAnswering
44 | """
45 |
46 | tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value")
47 |
48 | var_map = (
49 | ("layer.", "layer_"),
50 | ("word_embeddings.weight", "word_embeddings"),
51 | ("position_embeddings.weight", "position_embeddings"),
52 | ("token_type_embeddings.weight", "token_type_embeddings"),
53 | (".", "/"),
54 | ("LayerNorm/weight", "LayerNorm/gamma"),
55 | ("LayerNorm/bias", "LayerNorm/beta"),
56 | ("weight", "kernel"),
57 | )
58 |
59 | if not os.path.isdir(ckpt_dir):
60 | os.makedirs(ckpt_dir)
61 |
62 | state_dict = model.state_dict()
63 |
64 | def to_tf_var_name(name: str):
65 | for patt, repl in iter(var_map):
66 | name = name.replace(patt, repl)
67 | return "bert/{}".format(name)
68 |
69 | def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
70 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
71 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
72 | session.run(tf.variables_initializer([tf_var]))
73 | session.run(tf_var)
74 | return tf_var
75 |
76 | tf.reset_default_graph()
77 | with tf.Session() as session:
78 | for var_name in state_dict:
79 | tf_name = to_tf_var_name(var_name)
80 | torch_tensor = state_dict[var_name].numpy()
81 | if any([x in var_name for x in tensors_to_transpose]):
82 | torch_tensor = torch_tensor.T
83 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
84 | tf.keras.backend.set_value(tf_var, torch_tensor)
85 | tf_weight = session.run(tf_var)
86 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))
87 |
88 | saver = tf.train.Saver(tf.trainable_variables())
89 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
90 |
91 |
92 | def main(raw_args=None):
93 | parser = argparse.ArgumentParser()
94 | parser.add_argument("--model_name", type=str, required=True, help="model name e.g. bert-base-uncased")
95 | parser.add_argument(
96 | "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model"
97 | )
98 | parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin")
99 | parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model")
100 | args = parser.parse_args(raw_args)
101 |
102 | model = BertModel.from_pretrained(
103 | pretrained_model_name_or_path=args.model_name,
104 | state_dict=torch.load(args.pytorch_model_path),
105 | cache_dir=args.cache_dir,
106 | )
107 |
108 | convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name)
109 |
110 |
111 | if __name__ == "__main__":
112 | main()
113 |
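To make the renaming rules above concrete, here is a small self-contained sketch of the var_map translation; the two example keys are typical entries of a standard BertModel.state_dict().

    # Standalone illustration of the to_tf_var_name() mapping defined above.
    var_map = (
        ("layer.", "layer_"),
        ("word_embeddings.weight", "word_embeddings"),
        ("position_embeddings.weight", "position_embeddings"),
        ("token_type_embeddings.weight", "token_type_embeddings"),
        (".", "/"),
        ("LayerNorm/weight", "LayerNorm/gamma"),
        ("LayerNorm/bias", "LayerNorm/beta"),
        ("weight", "kernel"),
    )

    def to_tf_var_name(name):
        for patt, repl in var_map:
            name = name.replace(patt, repl)
        return "bert/{}".format(name)

    print(to_tf_var_name("encoder.layer.0.attention.self.query.weight"))
    # -> bert/encoder/layer_0/attention/self/query/kernel
    print(to_tf_var_name("embeddings.LayerNorm.weight"))
    # -> bert/embeddings/LayerNorm/gamma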
--------------------------------------------------------------------------------
/src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT-2 checkpoint."""
16 |
17 |
18 | import argparse
19 | import logging
20 |
21 | import torch
22 |
23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2
24 |
25 |
26 | logging.basicConfig(level=logging.INFO)
27 |
28 |
29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
30 | # Construct model
31 | if gpt2_config_file == "":
32 | config = GPT2Config()
33 | else:
34 | config = GPT2Config.from_json_file(gpt2_config_file)
35 | model = GPT2Model(config)
36 |
37 | # Load weights from numpy
38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
39 |
40 | # Save pytorch-model
41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
44 | torch.save(model.state_dict(), pytorch_weights_dump_path)
45 | print("Save configuration file to {}".format(pytorch_config_dump_path))
46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
47 | f.write(config.to_json_string())
48 |
49 |
50 | if __name__ == "__main__":
51 | parser = argparse.ArgumentParser()
52 | # Required parameters
53 | parser.add_argument(
54 |         "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint."
55 | )
56 | parser.add_argument(
57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
58 | )
59 | parser.add_argument(
60 | "--gpt2_config_file",
61 | default="",
62 | type=str,
63 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n"
64 | "This specifies the model architecture.",
65 | )
66 | args = parser.parse_args()
67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path)
68 |
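Since the script writes WEIGHTS_NAME (pytorch_model.bin) and CONFIG_NAME (config.json) into the dump folder, the result can be reloaded directly with from_pretrained(); the folder path below is a hypothetical placeholder.

    # Reload a converted dump folder.
    from transformers import GPT2Model

    model = GPT2Model.from_pretrained("/tmp/gpt2-converted")  # placeholder path
    print(model.config)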
--------------------------------------------------------------------------------
/src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint."""
16 |
17 |
18 | import argparse
19 | import logging
20 |
21 | import torch
22 |
23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt
24 |
25 |
26 | logging.basicConfig(level=logging.INFO)
27 |
28 |
29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
30 | # Construct model
31 | if openai_config_file == "":
32 | config = OpenAIGPTConfig()
33 | else:
34 | config = OpenAIGPTConfig.from_json_file(openai_config_file)
35 | model = OpenAIGPTModel(config)
36 |
37 | # Load weights from numpy
38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path)
39 |
40 | # Save pytorch-model
41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
44 | torch.save(model.state_dict(), pytorch_weights_dump_path)
45 | print("Save configuration file to {}".format(pytorch_config_dump_path))
46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
47 | f.write(config.to_json_string())
48 |
49 |
50 | if __name__ == "__main__":
51 | parser = argparse.ArgumentParser()
52 | # Required parameters
53 | parser.add_argument(
54 | "--openai_checkpoint_folder_path",
55 | default=None,
56 | type=str,
57 | required=True,
58 |         help="Path to the TensorFlow checkpoint folder.",
59 | )
60 | parser.add_argument(
61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
62 | )
63 | parser.add_argument(
64 | "--openai_config_file",
65 | default="",
66 | type=str,
67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n"
68 | "This specifies the model architecture.",
69 | )
70 | args = parser.parse_args()
71 | convert_openai_checkpoint_to_pytorch(
72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path
73 | )
74 |
--------------------------------------------------------------------------------
/src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert T5 checkpoint."""
16 |
17 |
18 | import argparse
19 | import logging
20 |
21 | import torch
22 |
23 | from transformers import T5Config, T5Model, load_tf_weights_in_t5
24 |
25 |
26 | logging.basicConfig(level=logging.INFO)
27 |
28 |
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
30 | # Initialise PyTorch model
31 | config = T5Config.from_json_file(config_file)
32 | print("Building PyTorch model from configuration: {}".format(str(config)))
33 | model = T5Model(config)
34 |
35 | # Load weights from tf checkpoint
36 | load_tf_weights_in_t5(model, config, tf_checkpoint_path)
37 |
38 | # Save pytorch-model
39 | print("Save PyTorch model to {}".format(pytorch_dump_path))
40 | torch.save(model.state_dict(), pytorch_dump_path)
41 |
42 |
43 | if __name__ == "__main__":
44 | parser = argparse.ArgumentParser()
45 | # Required parameters
46 | parser.add_argument(
47 |         "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint."
48 | )
49 | parser.add_argument(
50 | "--config_file",
51 | default=None,
52 | type=str,
53 | required=True,
54 | help="The config json file corresponding to the pre-trained T5 model. \n"
55 | "This specifies the model architecture.",
56 | )
57 | parser.add_argument(
58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
59 | )
60 | args = parser.parse_args()
61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
62 |
--------------------------------------------------------------------------------
/src/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert XLM checkpoint."""
16 |
17 |
18 | import argparse
19 | import json
20 | import logging
21 |
22 | import numpy
23 | import torch
24 |
25 | from transformers import CONFIG_NAME, WEIGHTS_NAME
26 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES
27 |
28 |
29 | logging.basicConfig(level=logging.INFO)
30 |
31 |
32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
33 | # Load checkpoint
34 | chkpt = torch.load(xlm_checkpoint_path, map_location="cpu")
35 |
36 | state_dict = chkpt["model"]
37 |
38 | # We have the base model one level deeper than the original XLM repository
39 | two_levels_state_dict = {}
40 | for k, v in state_dict.items():
41 | if "pred_layer" in k:
42 | two_levels_state_dict[k] = v
43 | else:
44 | two_levels_state_dict["transformer." + k] = v
45 |
46 | config = chkpt["params"]
47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
48 |
49 | vocab = chkpt["dico_word2id"]
50 |     vocab = dict((s + "</w>" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items())
51 |
52 | # Save pytorch-model
53 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
54 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"]
56 |
57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path)
59 |
60 | print("Save configuration file to {}".format(pytorch_config_dump_path))
61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
62 | f.write(json.dumps(config, indent=2) + "\n")
63 |
64 |     print("Save vocab file to {}".format(pytorch_vocab_dump_path))
65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
66 | f.write(json.dumps(vocab, indent=2) + "\n")
67 |
68 |
69 | if __name__ == "__main__":
70 | parser = argparse.ArgumentParser()
71 | # Required parameters
72 | parser.add_argument(
73 |         "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path to the official PyTorch dump."
74 | )
75 | parser.add_argument(
76 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
77 | )
78 | args = parser.parse_args()
79 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)
80 |
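A toy illustration of the two transformations above, on fake data: parameters outside the prediction layer are nested under "transformer.", and the BPE vocabulary is rewritten so that word-final tokens get a "</w>" suffix while continuation tokens drop their "@@" marker (the first 14 indices, presumably special symbols, are left untouched).

    # Toy data only; a real checkpoint holds tensors, not strings.
    state_dict = {"embeddings.weight": "w0", "pred_layer.proj.weight": "w1"}
    two_levels = {k if "pred_layer" in k else "transformer." + k: v for k, v in state_dict.items()}
    # {'transformer.embeddings.weight': 'w0', 'pred_layer.proj.weight': 'w1'}

    vocab = {"<s>": 0, "hello": 14, "wor@@": 15, "ld": 16}
    converted = {(s + "</w>" if s.find("@@") == -1 and i > 13 else s.replace("@@", "")): i for s, i in vocab.items()}
    # {'<s>': 0, 'hello</w>': 14, 'wor': 15, 'ld</w>': 16}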
--------------------------------------------------------------------------------
/src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert XLNet checkpoint."""
16 |
17 |
18 | import argparse
19 | import logging
20 | import os
21 |
22 | import torch
23 |
24 | from transformers import (
25 | CONFIG_NAME,
26 | WEIGHTS_NAME,
27 | XLNetConfig,
28 | XLNetForQuestionAnswering,
29 | XLNetForSequenceClassification,
30 | XLNetLMHeadModel,
31 | load_tf_weights_in_xlnet,
32 | )
33 |
34 |
35 | GLUE_TASKS_NUM_LABELS = {
36 | "cola": 2,
37 | "mnli": 3,
38 | "mrpc": 2,
39 | "sst-2": 2,
40 | "sts-b": 1,
41 | "qqp": 2,
42 | "qnli": 2,
43 | "rte": 2,
44 | "wnli": 2,
45 | }
46 |
47 |
48 | logging.basicConfig(level=logging.INFO)
49 |
50 |
51 | def convert_xlnet_checkpoint_to_pytorch(
52 | tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None
53 | ):
54 | # Initialise PyTorch model
55 | config = XLNetConfig.from_json_file(bert_config_file)
56 |
57 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
58 | if finetuning_task in GLUE_TASKS_NUM_LABELS:
59 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
60 | config.finetuning_task = finetuning_task
61 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
62 | model = XLNetForSequenceClassification(config)
63 | elif "squad" in finetuning_task:
64 | config.finetuning_task = finetuning_task
65 | model = XLNetForQuestionAnswering(config)
66 | else:
67 | model = XLNetLMHeadModel(config)
68 |
69 | # Load weights from tf checkpoint
70 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
71 |
72 | # Save pytorch-model
73 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
74 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
75 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
76 | torch.save(model.state_dict(), pytorch_weights_dump_path)
77 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
78 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
79 | f.write(config.to_json_string())
80 |
81 |
82 | if __name__ == "__main__":
83 | parser = argparse.ArgumentParser()
84 | # Required parameters
85 | parser.add_argument(
86 |         "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint."
87 | )
88 | parser.add_argument(
89 | "--xlnet_config_file",
90 | default=None,
91 | type=str,
92 | required=True,
93 | help="The config json file corresponding to the pre-trained XLNet model. \n"
94 | "This specifies the model architecture.",
95 | )
96 | parser.add_argument(
97 | "--pytorch_dump_folder_path",
98 | default=None,
99 | type=str,
100 | required=True,
101 | help="Path to the folder to store the PyTorch model or dataset/vocab.",
102 | )
103 | parser.add_argument(
104 | "--finetuning_task",
105 | default=None,
106 | type=str,
107 |         help="Name of a task on which the XLNet TensorFlow model was fine-tuned",
108 | )
109 | args = parser.parse_args()
110 | print(args)
111 |
112 | convert_xlnet_checkpoint_to_pytorch(
113 | args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task
114 | )
115 |
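To make the head selection above explicit, a small standalone sketch of the same branching; the dict is an abridged copy of GLUE_TASKS_NUM_LABELS and the function returns class names rather than model instances.

    # Abridged copy of the GLUE task -> num_labels mapping defined above.
    GLUE_TASKS_NUM_LABELS = {"cola": 2, "mnli": 3, "sts-b": 1}

    def pick_head(finetuning_task):
        task = finetuning_task.lower() if finetuning_task is not None else ""
        if task in GLUE_TASKS_NUM_LABELS:
            return "XLNetForSequenceClassification"   # classification / regression head
        if "squad" in task:
            return "XLNetForQuestionAnswering"        # span-prediction head
        return "XLNetLMHeadModel"                     # default: language-modeling head

    print(pick_head("STS-B"))   # XLNetForSequenceClassification (1 label, regression)
    print(pick_head("squad2"))  # XLNetForQuestionAnswering
    print(pick_head(None))      # XLNetLMHeadModel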
--------------------------------------------------------------------------------
/src/transformers/data/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module while preserving other warnings, so don't check this module at all.
4 |
5 | from .metrics import is_sklearn_available
6 | from .processors import (
7 | DataProcessor,
8 | InputExample,
9 | InputFeatures,
10 | SingleSentenceClassificationProcessor,
11 | SquadExample,
12 | SquadFeatures,
13 | SquadV1Processor,
14 | SquadV2Processor,
15 | glue_convert_examples_to_features,
16 | glue_output_modes,
17 | glue_processors,
18 | glue_tasks_num_labels,
19 |
20 | xglue_convert_examples_to_features,
21 | xglue_convert_examples_to_vat_features,
22 | xglue_output_modes,
23 | xglue_processors,
24 | xglue_tasks_num_labels,
25 |
26 | xtreme_convert_examples_to_features,
27 | xtreme_output_modes,
28 | xtreme_processors,
29 | xtreme_tasks_num_labels,
30 |
31 | squad_convert_examples_to_features,
32 | xnli_output_modes,
33 | xnli_processors,
34 | xnli_tasks_num_labels,
35 | )
36 |
37 |
38 | if is_sklearn_available():
39 | from .metrics import glue_compute_metrics, xnli_compute_metrics, xglue_compute_metrics, xtreme_compute_metrics
40 |
--------------------------------------------------------------------------------
/src/transformers/data/metrics/evaluate_squad.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Based on the SQuAD evaluation script from:
3 | # https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py
16 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
17 | from __future__ import print_function
18 | from collections import Counter
19 | import string
20 | import re
21 | import argparse
22 | import json
23 | import sys
24 |
25 |
26 | def normalize_answer(s):
27 | """Lower text and remove punctuation, articles and extra whitespace."""
28 | def remove_articles(text):
29 | return re.sub(r'\b(a|an|the)\b', ' ', text)
30 |
31 | def white_space_fix(text):
32 | return ' '.join(text.split())
33 |
34 | def remove_punc(text):
35 | exclude = set(string.punctuation)
36 | return ''.join(ch for ch in text if ch not in exclude)
37 |
38 | def lower(text):
39 | return text.lower()
40 |
41 | return white_space_fix(remove_articles(remove_punc(lower(s))))
42 |
43 |
44 | def f1_score(prediction, ground_truth):
45 | prediction_tokens = normalize_answer(prediction).split()
46 | ground_truth_tokens = normalize_answer(ground_truth).split()
47 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
48 | num_same = sum(common.values())
49 | if num_same == 0:
50 | return 0
51 | precision = 1.0 * num_same / len(prediction_tokens)
52 | recall = 1.0 * num_same / len(ground_truth_tokens)
53 | f1 = (2 * precision * recall) / (precision + recall)
54 | return f1
55 |
56 |
57 | def exact_match_score(prediction, ground_truth):
58 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
59 |
60 |
61 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
62 | scores_for_ground_truths = []
63 | for ground_truth in ground_truths:
64 | score = metric_fn(prediction, ground_truth)
65 | scores_for_ground_truths.append(score)
66 | return max(scores_for_ground_truths)
67 |
68 |
69 | def evaluate(dataset, predictions):
70 | f1 = exact_match = total = 0
71 | for article in dataset:
72 | for paragraph in article['paragraphs']:
73 | for qa in paragraph['qas']:
74 | total += 1
75 | if qa['id'] not in predictions:
76 | message = 'Unanswered question ' + qa['id'] + \
77 | ' will receive score 0.'
78 | print(message, file=sys.stderr)
79 | continue
80 | ground_truths = list(map(lambda x: x['text'], qa['answers']))
81 | prediction = predictions[qa['id']]
82 | exact_match += metric_max_over_ground_truths(
83 | exact_match_score, prediction, ground_truths)
84 | f1 += metric_max_over_ground_truths(
85 | f1_score, prediction, ground_truths)
86 |
87 | exact_match = 100.0 * exact_match / total
88 | f1 = 100.0 * f1 / total
89 |
90 | return {'exact_match': exact_match, 'f1': f1}
91 |
92 |
93 | def evaluate_with_path(dataset_file, prediction_file):
94 | with open(dataset_file) as dataset_file_reader:
95 | dataset_json = json.load(dataset_file_reader)
96 | dataset = dataset_json['data']
97 | with open(prediction_file) as prediction_file_reader:
98 | predictions = json.load(prediction_file_reader)
99 | return evaluate(dataset, predictions)
100 |
101 | if __name__ == '__main__':
102 | expected_version = '1.1'
103 | parser = argparse.ArgumentParser(
104 | description='Evaluation for SQuAD ' + expected_version)
105 | parser.add_argument('dataset_file', help='Dataset file')
106 | parser.add_argument('prediction_file', help='Prediction File')
107 | args = parser.parse_args()
108 | with open(args.dataset_file) as dataset_file:
109 | dataset_json = json.load(dataset_file)
110 | if (dataset_json['version'] != expected_version):
111 | print('Evaluation expects v-' + expected_version +
112 | ', but got dataset with v-' + dataset_json['version'],
113 | file=sys.stderr)
114 | dataset = dataset_json['data']
115 | with open(args.prediction_file) as prediction_file:
116 | predictions = json.load(prediction_file)
117 | print(json.dumps(evaluate(dataset, predictions)))
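The helpers can also be used directly on strings, outside the CLI; a toy example, assuming the module is importable under this repository layout as transformers.data.metrics.evaluate_squad.

    from transformers.data.metrics.evaluate_squad import (
        exact_match_score,
        f1_score,
        metric_max_over_ground_truths,
    )

    prediction = "Denver Broncos"
    ground_truths = ["Denver Broncos", "The Broncos"]

    em = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
    f1 = metric_max_over_ground_truths(f1_score, prediction, ground_truths)
    print(em, f1)  # True 1.0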
--------------------------------------------------------------------------------
/src/transformers/data/processors/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module while preserving other warnings, so don't check this module at all.
4 |
5 | from .xglue import xglue_convert_examples_to_features, xglue_output_modes, xglue_processors, xglue_tasks_num_labels
6 | from .xtreme import xtreme_convert_examples_to_features, xtreme_output_modes, xtreme_processors, xtreme_tasks_num_labels
7 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
8 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
9 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
10 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
11 | from .xglue import xglue_convert_examples_to_vat_features
12 |
--------------------------------------------------------------------------------
/src/transformers/data/processors/xnli.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ XNLI utils (dataset loading and evaluation) """
17 |
18 |
19 | import logging
20 | import os
21 |
22 | from .utils import DataProcessor, InputExample
23 |
24 |
25 | logger = logging.getLogger(__name__)
26 |
27 |
28 | class XnliProcessor(DataProcessor):
29 | """Processor for the XNLI dataset.
30 | Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207"""
31 |
32 | def __init__(self, language, train_language=None):
33 | self.language = language
34 | self.train_language = train_language
35 |
36 | def get_train_examples(self, data_dir):
37 | """See base class."""
38 | lg = self.language if self.train_language is None else self.train_language
39 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg)))
40 | examples = []
41 | for (i, line) in enumerate(lines):
42 | if i == 0:
43 | continue
44 | guid = "%s-%s" % ("train", i)
45 | text_a = line[0]
46 | text_b = line[1]
47 | label = "contradiction" if line[2] == "contradictory" else line[2]
48 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
49 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
50 | return examples
51 |
52 | def get_test_examples(self, data_dir):
53 | """See base class."""
54 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv"))
55 | examples = []
56 | for (i, line) in enumerate(lines):
57 | if i == 0:
58 | continue
59 | language = line[0]
60 | if language != self.language:
61 | continue
62 | guid = "%s-%s" % ("test", i)
63 | text_a = line[6]
64 | text_b = line[7]
65 | label = line[1]
66 | assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
67 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
68 | return examples
69 |
70 | def get_labels(self):
71 | """See base class."""
72 | return ["contradiction", "entailment", "neutral"]
73 |
74 |
75 | xnli_processors = {
76 | "xnli": XnliProcessor,
77 | }
78 |
79 | xnli_output_modes = {
80 | "xnli": "classification",
81 | }
82 |
83 | xnli_tasks_num_labels = {
84 | "xnli": 3,
85 | }
86 |
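A minimal sketch of driving the processor; data_dir is a hypothetical placeholder and must contain the XNLI-MT-1.0/ and XNLI-1.0/ folders referenced by the hard-coded paths above.

    from transformers.data.processors.xnli import XnliProcessor

    processor = XnliProcessor(language="de", train_language="en")
    print(processor.get_labels())  # ['contradiction', 'entailment', 'neutral']

    data_dir = "/path/to/xnli_data"  # hypothetical placeholder
    train_examples = processor.get_train_examples(data_dir)  # English translate-train split
    test_examples = processor.get_test_examples(data_dir)    # German test split
    print(train_examples[0].text_a, train_examples[0].label)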
--------------------------------------------------------------------------------
/src/transformers/modeling_tf_camembert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ TF 2.0 CamemBERT model. """
17 |
18 |
19 | import logging
20 |
21 | from .configuration_camembert import CamembertConfig
22 | from .file_utils import add_start_docstrings
23 | from .modeling_tf_roberta import (
24 | TFRobertaForMaskedLM,
25 | TFRobertaForSequenceClassification,
26 | TFRobertaForTokenClassification,
27 | TFRobertaModel,
28 | )
29 |
30 |
31 | logger = logging.getLogger(__name__)
32 |
33 | TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {}
34 |
35 |
36 | CAMEMBERT_START_DOCSTRING = r"""
37 |
38 | .. note::
39 |
40 |         TF 2.0 models accept two formats as inputs:
41 |
42 | - having all inputs as keyword arguments (like PyTorch models), or
43 | - having all inputs as a list, tuple or dict in the first positional arguments.
44 |
45 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
46 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
47 |
48 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors
49 | in the first positional argument :
50 |
51 |         - a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
52 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
53 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
54 | - a dictionary with one or several input Tensors associated to the input names given in the docstring:
55 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
56 |
57 | Parameters:
58 | config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the
59 | model. Initializing with a config file does not load the weights associated with the model, only the configuration.
60 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
61 | """
62 |
63 |
64 | @add_start_docstrings(
65 | "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
66 | CAMEMBERT_START_DOCSTRING,
67 | )
68 | class TFCamembertModel(TFRobertaModel):
69 | """
70 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the
71 | superclass for the appropriate documentation alongside usage examples.
72 | """
73 |
74 | config_class = CamembertConfig
75 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
76 |
77 |
78 | @add_start_docstrings(
79 | """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING,
80 | )
81 | class TFCamembertForMaskedLM(TFRobertaForMaskedLM):
82 | """
83 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the
84 | superclass for the appropriate documentation alongside usage examples.
85 | """
86 |
87 | config_class = CamembertConfig
88 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
89 |
90 |
91 | @add_start_docstrings(
92 | """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer
93 | on top of the pooled output) e.g. for GLUE tasks. """,
94 | CAMEMBERT_START_DOCSTRING,
95 | )
96 | class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification):
97 | """
98 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the
99 | superclass for the appropriate documentation alongside usage examples.
100 | """
101 |
102 | config_class = CamembertConfig
103 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
104 |
105 |
106 | @add_start_docstrings(
107 | """CamemBERT Model with a token classification head on top (a linear layer on top of
108 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
109 | CAMEMBERT_START_DOCSTRING,
110 | )
111 | class TFCamembertForTokenClassification(TFRobertaForTokenClassification):
112 | """
113 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the
114 | superclass for the appropriate documentation alongside usage examples.
115 | """
116 |
117 | config_class = CamembertConfig
118 | pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
119 |
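The input formats described in the docstring note above can be exercised with a randomly initialised model, so no pretrained checkpoint is needed (the archive map in this copy is empty); the tiny config values are arbitrary and TensorFlow 2.x is assumed.

    import tensorflow as tf
    from transformers import CamembertConfig, TFCamembertModel

    # Deliberately small, arbitrary configuration; weights are random.
    config = CamembertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                             num_attention_heads=2, intermediate_size=64)
    model = TFCamembertModel(config)

    input_ids = tf.constant([[5, 6, 7, 8]])
    attention_mask = tf.constant([[1, 1, 1, 1]])

    out_kwargs = model(input_ids, attention_mask=attention_mask)                   # keyword style
    out_list = model([input_ids, attention_mask])                                  # list, docstring order
    out_dict = model({"input_ids": input_ids, "attention_mask": attention_mask})   # dict keyed by input names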
--------------------------------------------------------------------------------
/src/transformers/modeling_tf_xlm_roberta.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ TF 2.0 XLM-RoBERTa model. """
17 |
18 |
19 | import logging
20 |
21 | from .configuration_xlm_roberta import XLMRobertaConfig
22 | from .file_utils import add_start_docstrings
23 | from .modeling_tf_roberta import (
24 | TFRobertaForMaskedLM,
25 | TFRobertaForSequenceClassification,
26 | TFRobertaForTokenClassification,
27 | TFRobertaModel,
28 | )
29 |
30 |
31 | logger = logging.getLogger(__name__)
32 |
33 | TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {}
34 |
35 |
36 | XLM_ROBERTA_START_DOCSTRING = r"""
37 |
38 | .. note::
39 |
40 |         TF 2.0 models accept two formats as inputs:
41 |
42 | - having all inputs as keyword arguments (like PyTorch models), or
43 | - having all inputs as a list, tuple or dict in the first positional arguments.
44 |
45 | This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
46 | all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
47 |
48 | If you choose this second option, there are three possibilities you can use to gather all the input Tensors
49 | in the first positional argument :
50 |
51 |         - a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
52 | - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
53 | :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
54 | - a dictionary with one or several input Tensors associated to the input names given in the docstring:
55 | :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
56 |
57 | Parameters:
58 | config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
59 | model. Initializing with a config file does not load the weights associated with the model, only the configuration.
60 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
61 | """
62 |
63 |
64 | @add_start_docstrings(
65 | "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
66 | XLM_ROBERTA_START_DOCSTRING,
67 | )
68 | class TFXLMRobertaModel(TFRobertaModel):
69 | """
70 | This class overrides :class:`~transformers.TFRobertaModel`. Please check the
71 | superclass for the appropriate documentation alongside usage examples.
72 | """
73 |
74 | config_class = XLMRobertaConfig
75 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
76 |
77 |
78 | @add_start_docstrings(
79 | """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING,
80 | )
81 | class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM):
82 | """
83 | This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the
84 | superclass for the appropriate documentation alongside usage examples.
85 | """
86 |
87 | config_class = XLMRobertaConfig
88 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
89 |
90 |
91 | @add_start_docstrings(
92 | """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
93 | on top of the pooled output) e.g. for GLUE tasks. """,
94 | XLM_ROBERTA_START_DOCSTRING,
95 | )
96 | class TFXLMRobertaForSequenceClassification(TFRobertaForSequenceClassification):
97 | """
98 | This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the
99 | superclass for the appropriate documentation alongside usage examples.
100 | """
101 |
102 | config_class = XLMRobertaConfig
103 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
104 |
105 |
106 | @add_start_docstrings(
107 | """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of
108 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
109 | XLM_ROBERTA_START_DOCSTRING,
110 | )
111 | class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification):
112 | """
113 | This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the
114 | superclass for the appropriate documentation alongside usage examples.
115 | """
116 |
117 | config_class = XLMRobertaConfig
118 | pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
119 |
--------------------------------------------------------------------------------
/src/transformers/tokenization_bart.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from .tokenization_roberta import RobertaTokenizer
17 |
18 |
19 | # vocab and merges same as roberta
20 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
21 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
22 | _all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"]
23 |
24 |
25 | class BartTokenizer(RobertaTokenizer):
26 | # merges and vocab same as Roberta
27 | max_model_input_sizes = {m: 1024 for m in _all_bart_models}
28 | pretrained_vocab_files_map = {
29 | "vocab_file": {m: vocab_url for m in _all_bart_models},
30 | "merges_file": {m: merges_url for m in _all_bart_models},
31 | }
32 |
--------------------------------------------------------------------------------
/src/transformers/tokenization_distilbert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for DistilBERT."""
16 |
17 |
18 | import logging
19 |
20 | from .tokenization_bert import BertTokenizer, BertTokenizerFast
21 |
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
26 |
27 | PRETRAINED_VOCAB_FILES_MAP = {
28 | "vocab_file": {
29 | "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
30 | "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
31 | "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
32 | "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
33 | "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt",
34 | "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
35 | }
36 | }
37 |
38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
39 | "distilbert-base-uncased": 512,
40 | "distilbert-base-uncased-distilled-squad": 512,
41 | "distilbert-base-cased": 512,
42 | "distilbert-base-cased-distilled-squad": 512,
43 | "distilbert-base-german-cased": 512,
44 | "distilbert-base-multilingual-cased": 512,
45 | }
46 |
47 |
48 | PRETRAINED_INIT_CONFIGURATION = {
49 | "distilbert-base-uncased": {"do_lower_case": True},
50 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True},
51 | "distilbert-base-cased": {"do_lower_case": False},
52 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False},
53 | "distilbert-base-german-cased": {"do_lower_case": False},
54 | "distilbert-base-multilingual-cased": {"do_lower_case": False},
55 | }
56 |
57 |
58 | class DistilBertTokenizer(BertTokenizer):
59 | r"""
60 | Constructs a DistilBertTokenizer.
61 | :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
62 | tokenization: punctuation splitting + wordpiece.
63 |
64 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
65 | parameters.
66 | """
67 |
68 | vocab_files_names = VOCAB_FILES_NAMES
69 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
70 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
71 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
72 |
73 |
74 | class DistilBertTokenizerFast(BertTokenizerFast):
75 | vocab_files_names = VOCAB_FILES_NAMES
76 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
77 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
78 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
79 |
--------------------------------------------------------------------------------
/src/transformers/utils_encoder_decoder.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ Classes to support Encoder-Decoder architectures """
16 |
17 |
18 | def prepare_encoder_decoder_model_kwargs(**kwargs):
19 | """ Prepare the encoder and decoder's keyword arguments.
20 |
21 | Keyword arguments come in 3 flavors:
22 | - encoder-specific (prefixed by `encoder_`)
23 | - decoder-specific (prefixed by `decoder_`)
24 | - those that apply to the model as whole.
25 |
26 | We let the specific kwargs override the common ones in case of
27 | conflict.
28 | """
29 |
30 | kwargs_common = {
31 | argument: value
32 | for argument, value in kwargs.items()
33 | if not argument.startswith("encoder_") and not argument.startswith("decoder_")
34 | }
35 | if "input_ids" in kwargs_common:
36 | kwargs["encoder_input_ids"] = kwargs_common.pop("input_ids")
37 |
38 | decoder_kwargs = kwargs_common.copy()
39 | encoder_kwargs = kwargs_common.copy()
40 | encoder_kwargs.update(
41 | {argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")}
42 | )
43 | decoder_kwargs.update(
44 | {argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")}
45 | )
46 | decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None)
47 | return encoder_kwargs, decoder_kwargs
48 |
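A toy illustration of the kwargs splitting above, using strings instead of tensors; the import path is an assumption based on this repository layout.

    from transformers.utils_encoder_decoder import prepare_encoder_decoder_model_kwargs

    enc_kwargs, dec_kwargs = prepare_encoder_decoder_model_kwargs(
        input_ids="ENC_IDS",           # common kwarg, promoted to encoder_input_ids
        attention_mask="SHARED_MASK",  # common kwarg, copied to both sides
        decoder_input_ids="DEC_IDS",   # decoder-specific override
    )
    print(enc_kwargs)
    # {'attention_mask': 'SHARED_MASK', 'input_ids': 'ENC_IDS'}
    print(dec_kwargs)
    # {'attention_mask': 'SHARED_MASK', 'input_ids': 'DEC_IDS', 'encoder_attention_mask': 'SHARED_MASK'}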
--------------------------------------------------------------------------------
/src/ud-conversion-tools/conllu_to_conll.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from itertools import islice
3 | from pathlib import Path
4 | import argparse
5 | import sys, copy
6 |
7 | from lib.conll import CoNLLReader
8 |
9 | def main():
10 | parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
11 | parser.add_argument('input', help="conllu file")
12 | parser.add_argument('output', help="target file", type=Path)
13 | parser.add_argument('--replace_subtokens_with_fused_forms', help="By default removes fused tokens", default=False, action="store_true")
14 | parser.add_argument('--remove_deprel_suffixes', help="Restrict deprels to the common universal subset, e.g. nmod:tmod becomes nmod", default=False, action="store_true")
15 | parser.add_argument('--remove_node_properties', help="space-separated list of node properties to remove: form, lemma, cpostag, postag, feats", choices=['form', 'lemma', 'cpostag','postag','feats'], metavar='prop', type=str, nargs='+')
16 | parser.add_argument('--lang', help="specify a language 2-letter code", default="default")
17 | parser.add_argument('--output_format', choices=['conll2006', 'conll2009', 'conllu'], default="conll2006")
18 | parser.add_argument('--remove_arabic_diacritics', help="remove Arabic short vowels", default=False, action="store_true")
19 | parser.add_argument('--print_comments',default=False,action="store_true")
20 | parser.add_argument('--print_fused_forms',default=False,action="store_true")
21 |
22 | args = parser.parse_args()
23 |
24 | if sys.version_info < (3,0):
25 | print("Sorry, requires Python 3.x.") #suggestion: install anaconda python
26 | sys.exit(1)
27 |
28 | POSRANKPRECEDENCEDICT = defaultdict(list)
29 | POSRANKPRECEDENCEDICT["default"] = "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CCONJ SCONJ X PUNCT ".split(" ")
30 | # POSRANKPRECEDENCEDICT["de"] = "PROPN ADP DET ".split(" ")
31 | POSRANKPRECEDENCEDICT["es"] = "VERB AUX PRON ADP DET".split(" ")
32 | POSRANKPRECEDENCEDICT["fr"] = "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(" ")
33 | POSRANKPRECEDENCEDICT["it"] = "VERB AUX ADV PRON ADP DET INTJ".split(" ")
34 |
35 | if args.lang in POSRANKPRECEDENCEDICT:
36 | current_pos_precedence_list = POSRANKPRECEDENCEDICT[args.lang]
37 | else:
38 | current_pos_precedence_list = POSRANKPRECEDENCEDICT["default"]
39 |
40 | cio = CoNLLReader()
41 | orig_treebank = cio.read_conll_u(args.input)#, args.keep_fused_forms, args.lang, POSRANKPRECEDENCEDICT)
42 | modif_treebank = copy.copy(orig_treebank)
43 |
44 | # As per Dec 2015 the args.lang variable is redundant once you have current_pos_precedence_list
45 | # We keep it for future modifications, i.e. any language-specific modules
46 | for s in modif_treebank:
47 | # print('sentence', s.get_sentence_as_string(printid=True))
48 | s.filter_sentence_content(args.replace_subtokens_with_fused_forms, args.lang, current_pos_precedence_list,args.remove_node_properties,args.remove_deprel_suffixes,args.remove_arabic_diacritics)
49 |
50 | cio.write_conll(modif_treebank,args.output, args.output_format,print_fused_forms=args.print_fused_forms, print_comments=args.print_comments)
51 |
52 | if __name__ == "__main__":
53 | main()
--------------------------------------------------------------------------------
/src/ud-conversion-tools/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bozheng-hit/xTune/18db15e07ae6dde2f5c56e11d55a36f945e2c15b/src/ud-conversion-tools/lib/__init__.py
--------------------------------------------------------------------------------
/transformers-cli:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from argparse import ArgumentParser
3 |
4 | from transformers.commands.convert import ConvertCommand
5 | from transformers.commands.download import DownloadCommand
6 | from transformers.commands.env import EnvironmentCommand
7 | from transformers.commands.run import RunCommand
8 | from transformers.commands.serving import ServeCommand
9 | from transformers.commands.user import UserCommands
10 |
11 | if __name__ == '__main__':
12 |     parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]')
13 | commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
14 |
15 | # Register commands
16 | ConvertCommand.register_subcommand(commands_parser)
17 | DownloadCommand.register_subcommand(commands_parser)
18 | EnvironmentCommand.register_subcommand(commands_parser)
19 | RunCommand.register_subcommand(commands_parser)
20 | ServeCommand.register_subcommand(commands_parser)
21 | UserCommands.register_subcommand(commands_parser)
22 |
23 | # Let's go
24 | args = parser.parse_args()
25 |
26 | if not hasattr(args, 'func'):
27 | parser.print_help()
28 | exit(1)
29 |
30 | # Run
31 | service = args.func(args)
32 | service.run()
33 |
--------------------------------------------------------------------------------