├── config ├── bitfit_config.json ├── adapter_config.json └── lora_config.json ├── scripts ├── run_glue_finetune.sh ├── run_glue_bitfit.sh ├── run_glue_lora.sh ├── run_glue_adapter.sh ├── run_glue_sora_schedule_dense.sh └── run_glue_sora_no_schedule.sh ├── requirements.txt ├── README.md ├── src ├── util.py ├── glue_tasks.py ├── processor.py ├── sparse_optimizer_multiply_lr.py ├── sparse_optimizer.py └── lora.py ├── glue.py ├── run_glue_adapter.py ├── run_glue_bitfit.py ├── run_glue_finetune.py └── run_glue.py /config/bitfit_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "unfrozen_modules": [ 3 | "deltas", 4 | "layer_norm", 5 | "final_layer_norm" 6 | ] 7 | } -------------------------------------------------------------------------------- /config/adapter_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bottleneck_dim": 12, 3 | "unfrozen_modules": [ 4 | "deltas", 5 | "layer_norm", 6 | "final_layer_norm" 7 | ] 8 | } -------------------------------------------------------------------------------- /config/lora_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_r": 32, 3 | "lora_alpha": 16, 4 | "unfrozen_modules": [ 5 | "deltas", 6 | "layer_norm", 7 | "final_layer_norm" 8 | ] 9 | } -------------------------------------------------------------------------------- /scripts/run_glue_finetune.sh: -------------------------------------------------------------------------------- 1 | cd .. 2 | for seed in 100 3 | do 4 | for lr in 5e-5 5 | do 6 | task=mnli-m # can be cola/mrpc/rte/stsb/qnli/sst2/qqp/mnli-m/mnli-mm 7 | bsz=80 8 | epoch=10 9 | echo $lr 10 | echo $seed 11 | echo $bsz 12 | echo $task 13 | CUDA_VISIBLE_DEVICES=0 \ 14 | python -u run_glue_finetune.py \ 15 | --do_eval \ 16 | --do_predict \ 17 | --do_train \ 18 | --task_name $task \ 19 | --eval_steps 1000 \ 20 | --evaluation_strategy steps \ 21 | --greater_is_better true \ 22 | --learning_rate $lr \ 23 | --max_grad_norm 0.1 \ 24 | --load_best_model_at_end \ 25 | --logging_steps 100 \ 26 | --max_steps -1 \ 27 | --model_name_or_path /root/xtlv/data/models/DeBERTaV3_base \ 28 | --num_train_epochs $epoch \ 29 | --output_dir results/${task}/${task}_lr_${lr}_bsz_${bsz}_epoch_${epoch}_seed_${seed} \ 30 | --overwrite_output_dir \ 31 | --per_device_eval_batch_size $bsz \ 32 | --per_device_train_batch_size $bsz \ 33 | --save_steps 1000 \ 34 | --save_strategy steps \ 35 | --save_total_limit 1 \ 36 | --tokenizer_name /root/xtlv/data/models/DeBERTaV3_base \ 37 | --warmup_ratio 0.06 \ 38 | --warmup_steps 0 \ 39 | --weight_decay 0.1 \ 40 | --disable_tqdm true \ 41 | --load_best_model_at_end \ 42 | --ddp_find_unused_parameters false \ 43 | --seed $seed \ 44 | --max_seq_length 256 > results/${task}/${task}_lr_${lr}_bsz_${bsz}_epoch_${epoch}_seed_${seed}.log 2>&1 45 | done 46 | done 47 | 48 | -------------------------------------------------------------------------------- /scripts/run_glue_bitfit.sh: -------------------------------------------------------------------------------- 1 | cd .. 
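# BitFit baseline: sweeps the chosen GLUE task over learning rates and seeds;
# config/bitfit_config.json lists the modules kept trainable (bias "deltas" and layer norms).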
2 | export WANDB_DISABLED=true 3 | for task in mnli-m # can be cola/mrpc/rte/stsb/qnli/sst2/qqp/mnli-m/mnli-mm 4 | do 5 | for lr in 8e-4 6 | do 7 | for seed in 100 8 | do 9 | bsz=100 10 | epoch=10 11 | echo $lr 12 | echo $seed 13 | echo $bsz 14 | echo $task 15 | CUDA_VISIBLE_DEVICES=0 \ 16 | python -u run_glue_bitfit.py \ 17 | --do_eval \ 18 | --do_predict \ 19 | --do_train \ 20 | --task_name $task \ 21 | --eval_steps 1000 \ 22 | --evaluation_strategy steps \ 23 | --greater_is_better true \ 24 | --learning_rate $lr \ 25 | --max_grad_norm 0.1 \ 26 | --load_best_model_at_end \ 27 | --logging_steps 100 \ 28 | --max_steps -1 \ 29 | --model_name_or_path /root/xtlv/data/models/DeBERTaV3_base \ 30 | --num_train_epochs $epoch \ 31 | --output_dir results/${task}_lr_${lr}_bsz_${bsz}_epoch_${epoch}_seed_${seed} \ 32 | --overwrite_output_dir \ 33 | --per_device_eval_batch_size $bsz \ 34 | --per_device_train_batch_size $bsz \ 35 | --save_steps 1000 \ 36 | --save_strategy steps \ 37 | --save_total_limit 1 \ 38 | --tokenizer_name /root/xtlv/data/models/DeBERTaV3_base \ 39 | --warmup_ratio 0.06 \ 40 | --warmup_steps 0 \ 41 | --weight_decay 0.1 \ 42 | --disable_tqdm true \ 43 | --load_best_model_at_end \ 44 | --ddp_find_unused_parameters false \ 45 | --sparse_lambda 0 \ 46 | --seed $seed \ 47 | --max_seq_length 256 > results/${task}_lr_${lr}_bsz_${bsz}_epoch_${epoch}_seed_${seed}.log 2>&1 48 | wait 49 | done 50 | done 51 | done -------------------------------------------------------------------------------- /scripts/run_glue_lora.sh: -------------------------------------------------------------------------------- 1 | cd .. 2 | for lora_r in 16 3 | do 4 | for seed in 100 5 | do 6 | task=mnli-mm # can be cola/mrpc/rte/stsb/qnli/sst2/qqp/mnli-m/mnli-mm 7 | lr=3e-4 8 | bsz=8 # rte: 32; the other: 8 9 | epoch=10 10 | echo $lr 11 | echo $seed 12 | echo $bsz 13 | echo $task 14 | CUDA_VISIBLE_DEVICES=0 \ 15 | python -u run_glue.py \ 16 | --do_eval \ 17 | --do_predict \ 18 | --do_train \ 19 | --task_name $task \ 20 | --eval_steps 1000 \ 21 | --evaluation_strategy steps \ 22 | --greater_is_better true \ 23 | --learning_rate $lr \ 24 | --max_grad_norm 0.1 \ 25 | --load_best_model_at_end \ 26 | --logging_steps 100 \ 27 | --max_steps -1 \ 28 | --model_name_or_path /root/xtlv/data/models/DeBERTaV3_base \ 29 | --num_train_epochs $epoch \ 30 | --output_dir results/${task}_lora_r_${lora_r}_lr_${lr}_bsz_${bsz}_epoch_${epoch}_seed_${seed} \ 31 | --overwrite_output_dir \ 32 | --per_device_eval_batch_size $bsz \ 33 | --per_device_train_batch_size $bsz \ 34 | --save_steps 1000 \ 35 | --save_strategy steps \ 36 | --save_total_limit 1 \ 37 | --tokenizer_name /root/xtlv/data/models/DeBERTaV3_base \ 38 | --warmup_ratio 0.06 \ 39 | --warmup_steps 0 \ 40 | --weight_decay 0.1 \ 41 | --disable_tqdm true \ 42 | --load_best_model_at_end \ 43 | --ddp_find_unused_parameters false \ 44 | --sparse_lambda 0 \ 45 | --seed $seed \ 46 | --lora_r $lora_r \ 47 | --max_seq_length 256 > results/${task}_lora_r_${lora_r}_lr_${lr}_bsz_${bsz}_epoch_${epoch}_seed_${seed}.log 2>&1 48 | done 49 | done 50 | -------------------------------------------------------------------------------- /scripts/run_glue_adapter.sh: -------------------------------------------------------------------------------- 1 | cd .. 
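# Adapter baseline: inserts bottleneck adapters of dimension $bottleneck_dim and sweeps tasks and seeds
# (cf. config/adapter_config.json).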
2 | export WANDB_DISABLED=true 3 | for task in cola # can be cola/mrpc/rte/stsb/qnli/sst2/qqp/mnli-m/mnli-mm 4 | do 5 | for seed in 0 21 42 81 100 6 | do 7 | bottleneck_dim=16 8 | lr=8e-4 9 | bsz=8 10 | epoch=20 11 | echo $bottleneck_dim 12 | echo $lr 13 | echo $seed 14 | echo $bsz 15 | echo $task 16 | CUDA_VISIBLE_DEVICES=0 \ 17 | python -u run_glue_adapter.py \ 18 | --do_eval \ 19 | --do_predict \ 20 | --do_train \ 21 | --task_name $task \ 22 | --eval_steps 1000 \ 23 | --evaluation_strategy steps \ 24 | --greater_is_better true \ 25 | --learning_rate $lr \ 26 | --max_grad_norm 0.1 \ 27 | --load_best_model_at_end \ 28 | --logging_steps 100 \ 29 | --max_steps -1 \ 30 | --model_name_or_path /root/xtlv/data/models/DeBERTaV3_base \ 31 | --num_train_epochs $epoch \ 32 | --output_dir results/${task}_bottleneckdim_${bottleneck_dim}_lr_${lr}_bsz_${bsz}_epoch_${epoch}_seed_${seed} \ 33 | --overwrite_output_dir \ 34 | --per_device_eval_batch_size $bsz \ 35 | --per_device_train_batch_size $bsz \ 36 | --save_steps 1000 \ 37 | --save_strategy steps \ 38 | --save_total_limit 1 \ 39 | --tokenizer_name /root/xtlv/data/models/DeBERTaV3_base \ 40 | --warmup_ratio 0.06 \ 41 | --warmup_steps 0 \ 42 | --weight_decay 0.1 \ 43 | --disable_tqdm true \ 44 | --load_best_model_at_end \ 45 | --ddp_find_unused_parameters false \ 46 | --sparse_lambda 0 \ 47 | --seed $seed \ 48 | --bottleneck_dim $bottleneck_dim \ 49 | --max_seq_length 128 > results/${task}_bottleneckdim_${bottleneck_dim}_lr_${lr}_bsz_${bsz}_epoch_${epoch}_seed_${seed}.log 2>&1 50 | wait 51 | done 52 | done -------------------------------------------------------------------------------- /scripts/run_glue_sora_schedule_dense.sh: -------------------------------------------------------------------------------- 1 | cd .. 
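# SoRA with the sparsifying scheduler (Algorithm 1 in the paper): --lambda_schedule linear moves the
# indicator threshold from --sparse_lambda toward --max_lambda over --lambda_num stages.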
2 | export WANDB_DISABLED=false 3 | for task in rte # can be cola/mrpc/rte/stsb/qnli/sst2/qqp/mnli-m/mnli-mm 4 | do 5 | lambda=0.001 6 | lambda2=0 7 | max_lambda=7e-4 8 | lambda_num=7 9 | lr=1.2e-3 10 | r=8 11 | epoch=50 12 | seed=48 13 | bsz=32 14 | epoch2=15 15 | echo $task 16 | echo "lambda=" $lambda 17 | echo $seed 18 | CUDA_VISIBLE_DEVICES=0 \ 19 | python -u run_glue.py \ 20 | --do_eval \ 21 | --do_predict \ 22 | --do_train \ 23 | --task_name $task \ 24 | --eval_steps 1000 \ 25 | --evaluation_strategy steps \ 26 | --greater_is_better true \ 27 | --learning_rate $lr \ 28 | --max_grad_norm 0.1 \ 29 | --load_best_model_at_end \ 30 | --logging_steps 100 \ 31 | --max_steps -1 \ 32 | --model_name_or_path /root/xtlv/data/models/DeBERTaV3_base \ 33 | --num_train_epochs $epoch \ 34 | --output_dir results/$task-lambda2_${lambda2}_${max_lambda}_lambda_${lambda}_epoch_${epoch}_seed_${seed}_${epoch2} \ 35 | --overwrite_output_dir \ 36 | --per_device_eval_batch_size $bsz \ 37 | --per_device_train_batch_size $bsz \ 38 | --save_steps 1000 \ 39 | --save_strategy steps \ 40 | --save_total_limit 1 \ 41 | --tokenizer_name /root/xtlv/data/models/DeBERTaV3_base \ 42 | --warmup_ratio 0.06 \ 43 | --warmup_steps 0 \ 44 | --weight_decay 0.1 \ 45 | --disable_tqdm true \ 46 | --load_best_model_at_end \ 47 | --ddp_find_unused_parameters false \ 48 | --sparse_lambda $lambda \ 49 | --sparse_lambda_2 $lambda2 \ 50 | --seed $seed \ 51 | --lora_r $r \ 52 | --max_seq_length 320 \ 53 | --max_lambda $max_lambda \ 54 | --lambda_schedule linear \ 55 | --lambda_num $lambda_num \ 56 | --train_sparse > results/$task-lambda2_${lambda2}_${max_lambda}_lambda_${lambda}_epoch_${epoch}_seed_${seed}_${epoch2}.log 2>&1 57 | wait 58 | done -------------------------------------------------------------------------------- /scripts/run_glue_sora_no_schedule.sh: -------------------------------------------------------------------------------- 1 | cd .. 
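# SoRA without the sparsifying scheduler: a fixed --sparse_lambda / --sparse_lambda_2 pair,
# grid-searched here over lora_r, lambda, lambda2, seed, and learning rate.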
2 | export WANDB_DISABLED=false 3 | for task in stsb # can be cola/mrpc/rte/stsb/qnli/sst2/qqp/mnli-m/mnli-mm 4 | do 5 | for lora_r in 8 6 | do 7 | for lambda in 10 8 | do 9 | for lambda2 in 3e-4 10 | do 11 | for seed in 0 21 42 81 100 12 | do 13 | for lr in 8e-4 14 | do 15 | epoch=20 16 | bsz=8 17 | echo $task 18 | echo "lambda=" $lambda 19 | echo "lambda2=" $lambda2 20 | echo "lora_r=" $lora_r 21 | echo "seed=" $seed 22 | CUDA_VISIBLE_DEVICES=0 \ 23 | python -u run_glue.py \ 24 | --do_eval \ 25 | --do_predict \ 26 | --do_train \ 27 | --task_name $task \ 28 | --eval_steps 1000 \ 29 | --evaluation_strategy steps \ 30 | --greater_is_better true \ 31 | --learning_rate $lr \ 32 | --max_grad_norm 0.1 \ 33 | --load_best_model_at_end \ 34 | --logging_steps 100 \ 35 | --max_steps -1 \ 36 | --model_name_or_path /root/xtlv/data/models/DeBERTaV3_base \ 37 | --num_train_epochs $epoch \ 38 | --output_dir results/${task}_lora_r_${lora_r}_lambda_${lambda}_lambda2_${lambda2}_lr_${lr}_epoch_${epoch}_bsz_${bsz}_seed_${seed} \ 39 | --overwrite_output_dir \ 40 | --per_device_eval_batch_size $bsz \ 41 | --per_device_train_batch_size $bsz \ 42 | --save_steps 1000 \ 43 | --save_strategy steps \ 44 | --save_total_limit 1 \ 45 | --tokenizer_name /root/xtlv/data/models/DeBERTaV3_base \ 46 | --warmup_ratio 0.06 \ 47 | --warmup_steps 0 \ 48 | --weight_decay 0.1 \ 49 | --disable_tqdm true \ 50 | --load_best_model_at_end \ 51 | --ddp_find_unused_parameters false \ 52 | --sparse_lambda $lambda \ 53 | --sparse_lambda_2 $lambda2 \ 54 | --seed $seed \ 55 | --lora_r $lora_r \ 56 | --max_seq_length 128 \ 57 | --train_sparse > results/${task}_lora_r_${lora_r}_lambda_${lambda}_lambda2_${lambda2}_lr_${lr}_epoch_${epoch}_bsz_${bsz}_seed_${seed}.log 2>&1 58 | wait 59 | done 60 | done 61 | done 62 | done 63 | done 64 | done -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.3.0 2 | aiohttp==3.8.3 3 | aiosignal==1.3.1 4 | aliyun-python-sdk-core==2.13.36 5 | aliyun-python-sdk-kms==2.16.0 6 | appdirs==1.4.4 7 | asttokens==2.0.5 8 | async-timeout==4.0.2 9 | attrs==22.1.0 10 | backcall==0.2.0 11 | bigmodelvis==0.0.1 12 | blessings==1.7 13 | Bottleneck==1.3.5 14 | brotlipy==0.7.0 15 | certifi==2023.5.7 16 | cffi==1.15.1 17 | charset-normalizer==2.0.4 18 | cheroot==9.0.0 19 | click==8.0.4 20 | commonmark==0.9.1 21 | contourpy==1.0.5 22 | crcmod==1.7 23 | cryptography==38.0.1 24 | cycler==0.11.0 25 | datasets==1.17.0 26 | debugpy==1.5.1 27 | decorator==5.1.1 28 | delta-center-client==0.0.4 29 | dill==0.3.6 30 | docker-pycreds==0.4.0 31 | entrypoints==0.4 32 | executing==0.8.3 33 | filelock==3.6.0 34 | fonttools==4.25.0 35 | frozenlist==1.3.3 36 | fsspec==2022.11.0 37 | gitdb==4.0.9 38 | GitPython==3.1.31 39 | gpustat==0.6.0 40 | huggingface-hub==0.10.1 41 | idna==3.4 42 | importlib-resources==5.2.0 43 | ipykernel==6.15.2 44 | ipython==8.6.0 45 | jaraco.functools==3.5.2 46 | jedi==0.18.1 47 | jmespath==0.10.0 48 | joblib==1.1.1 49 | jupyter_client==7.4.8 50 | jupyter_core==4.11.2 51 | kiwisolver==1.4.4 52 | loralib==0.1.0 53 | matplotlib==3.7.1 54 | matplotlib-inline==0.1.6 55 | mkl-fft==1.3.1 56 | mkl-random==1.2.2 57 | mkl-service==2.4.0 58 | more-itertools==9.0.0 59 | multidict==6.0.2 60 | multiprocess==0.70.14 61 | munkres==1.1.4 62 | nest-asyncio==1.5.5 63 | nltk==3.7 64 | numexpr==2.8.4 65 | numpy==1.23.4 66 | nvidia-ml-py3==7.352.0 67 | opendelta==0.3.2 68 | oss2==2.15.0 69 | 
packaging==21.3 70 | pandas==1.5.3 71 | parso==0.8.3 72 | pathtools==0.1.2 73 | pexpect==4.8.0 74 | pickleshare==0.7.5 75 | Pillow==9.2.0 76 | pip==22.2.2 77 | ply==3.11 78 | prompt-toolkit==3.0.20 79 | protobuf==4.22.3 80 | psutil==5.9.0 81 | ptyprocess==0.7.0 82 | pure-eval==0.2.2 83 | pyarrow==10.0.0 84 | pycparser==2.21 85 | pycryptodome==3.15.0 86 | Pygments==2.13.0 87 | pyOpenSSL==22.0.0 88 | pyparsing==3.0.9 89 | PyQt5-sip==12.11.0 90 | PySocks==1.7.1 91 | python-dateutil==2.8.2 92 | pytz==2022.7 93 | PyYAML==6.0 94 | pyzmq==23.2.0 95 | regex==2022.7.9 96 | requests==2.28.1 97 | rich==12.6.0 98 | rouge-score==0.1.2 99 | sacremoses==0.0.43 100 | scikit-learn==1.1.3 101 | scipy==1.9.3 102 | seaborn==0.12.2 103 | sentencepiece==0.1.97 104 | sentry-sdk==1.19.1 105 | setproctitle==1.3.2 106 | setuptools==65.5.0 107 | sip==6.6.2 108 | six==1.16.0 109 | sklearn==0.0.post1 110 | smmap==5.0.0 111 | stack-data==0.2.0 112 | tensorboardX==2.6.1 113 | threadpoolctl==2.2.0 114 | tokenizers==0.10.3 115 | toml==0.10.2 116 | torch==1.11.0 117 | torchaudio==0.11.0 118 | torchvision==0.12.0 119 | tornado==6.2 120 | tqdm==4.64.1 121 | traitlets==5.1.1 122 | transformers==4.14.1 123 | typing_extensions==4.3.0 124 | urllib3==1.26.12 125 | wandb==0.14.2 126 | wcwidth==0.2.5 127 | web.py==0.62 128 | wheel==0.37.1 129 | xxhash==3.1.0 130 | yacs==0.1.8 131 | yarl==1.8.1 132 | zipp==3.11.0 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 
3 | # Sparse Low-rank Adaptation of Pre-trained Language Models
4 | 
5 | 
6 | 7 | 🎉 This is the implementation of the EMNLP 2023 paper: [Sparse Low-rank Adaptation of Pre-trained Language Models](https://arxiv.org/abs/2311.11696) 8 | 9 | 10 | ## Requirements 11 | 12 | To run our code, please install all the dependency packages with the following command: 13 | 14 | ``` 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ## Preparation 19 | 20 | ### Prepare the Data and Modify the Data Path 21 | 22 | In the paper/code, we use the GLUE datasets. You can download the data from Hugging Face or from our [Google Drive](https://drive.google.com/drive/folders/1sNoQIp1x-5aXH4r9dOoSdsm5F1kihg_W?usp=sharing). 23 | 24 | After downloading the data, please replace the following data path definitions with your own data path: 25 | 26 | - `main_dir` in Line 27 of `SoRA/src/glue_tasks.py` 27 | - `main_dir` in Line 9 of `SoRA/src/processor.py` 28 | - `data_path` in Line 88 of `SoRA/run_glue.py`, `SoRA/run_glue_adapter.py`, `SoRA/run_glue_bitfit.py` and `SoRA/run_glue_finetune.py` 29 | 30 | ### Prepare the Model 31 | 32 | You can download the base model and the corresponding tokenizer from Hugging Face. After that, do not forget to modify `model_name_or_path` and `tokenizer_name` in the script files (.sh). 33 | 34 | 35 | ## Baseline 36 | 37 | We provide implementations of LoRA, Adapter, BitFit and full-parameter fine-tuning. You can run these baselines with the following commands: 38 | 39 | ```bash 40 | cd scripts 41 | # LoRA 42 | bash run_glue_lora.sh 43 | # Adapter 44 | bash run_glue_adapter.sh 45 | # BitFit 46 | bash run_glue_bitfit.sh 47 | # Full-parameter Fine-Tune 48 | bash run_glue_finetune.sh 49 | ``` 50 | 51 | ## SoRA 52 | 53 | You can apply SoRA by running the following commands: 54 | 55 | ```bash 56 | cd scripts 57 | # without the sparsifying scheduler 58 | bash run_glue_sora_no_schedule.sh 59 | # with the sparsifying scheduler (Algorithm 1) 60 | bash run_glue_sora_schedule_dense.sh 61 | ``` 62 | 63 | We explain some of the arguments as follows: 64 | 65 | - `sparse_lambda`: The hyperparameter $\eta_t$ in the paper. 66 | - `sparse_lambda_2`: The hyperparameter $\xi$ in the paper. 67 | - `lora_r`: The hyperparameter $r_{max}$ in the paper. 68 | - `train_sparse`: The flag that decides whether or not to apply SoRA. 69 | - `lambda_schedule`: The strategy of the sparsifying scheduler. Possible values are `linear`, `log_linear` and `exp_linear`. 70 | - `max_lambda`: The maximum $\xi$ when applying the sparsifying scheduler. 71 | - `lambda_num`: The number of $\xi$ values used by the sparsifying scheduler (see the sketch below). 72 | 73 | 74 | ## Bugs or questions? 75 | 76 | If you have any questions related to the code or the paper, please contact Ning Ding (`dn97@mail.tsinghua.edu.cn`), Xingtai Lv (`lvxt20@mails.tsinghua.edu.cn`), or open an issue.
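For reference, here is a minimal sketch (not part of the released scripts) of what the scheduler arguments control, distilled from `src/sparse_optimizer.py`: the scheduler produces `lambda_num` indicator values between `sparse_lambda` and `max_lambda`, and after each optimizer step the gate vector of every SoRA module is soft-thresholded, so gate entries pushed to exactly zero correspond to pruned ranks. The helper names below are illustrative only.

```python
import numpy as np
import torch

def build_lambda_schedule(sparse_lambda, max_lambda, lambda_num, schedule="linear"):
    # Sequence of indicator thresholds used by the scheduler (cf. SparseAdamW._build_lambda_list).
    if schedule == "linear":
        return np.linspace(sparse_lambda, max_lambda, lambda_num)
    if schedule == "log_linear":
        return np.log(np.linspace(np.exp(sparse_lambda), np.exp(max_lambda), lambda_num))
    if schedule == "exp_linear":
        return np.exp(np.linspace(np.log(sparse_lambda), np.log(max_lambda), lambda_num))
    raise NotImplementedError(schedule)

def soft_threshold_(gate, sparse_lambda):
    # Proximal update applied to the gate vector after each optimizer step (cf. SparseAdamW.step).
    gate[gate > sparse_lambda] -= sparse_lambda
    gate[gate < -sparse_lambda] += sparse_lambda
    gate[gate.abs() < sparse_lambda] = 0.0
    return gate

print(build_lambda_schedule(1e-3, 7e-4, 7))     # the schedule swept in run_glue_sora_schedule_dense.sh
print(soft_threshold_(torch.randn(1, 8), 0.5))  # zero entries correspond to pruned ranks
```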
77 | 78 | ## Citation 79 | 80 | If you find our work useful, please use the following citation: 81 | 82 | ```bibtex 83 | @article{ding2023sparse, 84 | title={Sparse Low-rank Adaptation of Pre-trained Language Models}, 85 | author={Ding, Ning and Lv, Xingtai and Wang, Qiaosen and Chen, Yulin and Zhou, Bowen and Liu, Zhiyuan and Sun, Maosong}, 86 | journal={arXiv preprint arXiv:2311.11696}, 87 | year={2023} 88 | } 89 | ``` 90 | -------------------------------------------------------------------------------- /src/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.parallel import DistributedDataParallel 3 | from transformers.trainer_pt_utils import get_parameter_names 4 | import torch.nn as nn 5 | from transformers import AdamW, get_linear_schedule_with_warmup 6 | GATE_PARAM_NAME= "lora.gate" 7 | 8 | def compute_trainable_sparse_param(model): 9 | if isinstance(model, DistributedDataParallel): 10 | model = model.module 11 | 12 | total_trainable_param = 0 13 | deduct = 0 14 | for n, p in model.named_parameters(): 15 | if p.requires_grad: 16 | if GATE_PARAM_NAME in n: 17 | deduct += (torch.numel(p) - torch.count_nonzero(p)) * model.config.hidden_size * 2 # zero_number * 768 * 2 18 | else: 19 | total_trainable_param += torch.numel(p) 20 | sparse_trainable_param = total_trainable_param - deduct 21 | return sparse_trainable_param, total_trainable_param 22 | 23 | def create_optimizer_and_scheduler(args, model, num_training_steps: int): 24 | """ 25 | Setup the optimizer and the learning rate scheduler. 26 | 27 | We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the 28 | Trainer's init through :obj:`optimizers`, or subclass and override this method (or :obj:`create_optimizer` 29 | and/or :obj:`create_scheduler`) in a subclass. 30 | """ 31 | optimizer = create_optimizer(args, model) 32 | scheduler = create_scheduler(args, num_training_steps=num_training_steps, optimizer=optimizer) 33 | return optimizer, scheduler 34 | 35 | def create_optimizer(args, model): 36 | """ 37 | Setup the optimizer. 38 | 39 | We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the 40 | Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass. 41 | """ 42 | 43 | decay_parameters = get_parameter_names(model, [nn.LayerNorm]) 44 | decay_parameters = [name for name in decay_parameters if "bias" not in name] 45 | print(f"removing {GATE_PARAM_NAME} from standard optimizer") 46 | optimizer_grouped_parameters = [ 47 | { 48 | "params": [p for n, p in model.named_parameters() if n in decay_parameters and GATE_PARAM_NAME not in n and p.requires_grad], 49 | "weight_decay": args.weight_decay, 50 | }, 51 | { 52 | "params": [p for n, p in model.named_parameters() if n not in decay_parameters and GATE_PARAM_NAME not in n and p.requires_grad], 53 | "weight_decay": 0.0, 54 | }, 55 | ] 56 | optimizer_kwargs = { 57 | "betas": (args.adam_beta1, args.adam_beta2), 58 | "eps": args.adam_epsilon, 59 | } 60 | optimizer_kwargs["lr"] = args.learning_rate 61 | optimizer = AdamW(optimizer_grouped_parameters, **optimizer_kwargs) 62 | 63 | return optimizer 64 | 65 | def create_scheduler(args, num_training_steps: int, optimizer: torch.optim.Optimizer = None): 66 | """ 67 | Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or 68 | passed as an argument. 
69 | 70 | Args: 71 | num_training_steps (int): The number of training steps to do. 72 | """ 73 | lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.get_warmup_steps(num_training_steps), 74 | num_training_steps=num_training_steps) 75 | return lr_scheduler 76 | -------------------------------------------------------------------------------- /src/glue_tasks.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import collections 3 | import abc 4 | import functools 5 | from selectors import EpollSelector 6 | from typing import Callable, List, Mapping 7 | import datasets 8 | import logging 9 | import numpy as np 10 | import torch 11 | import re 12 | import itertools 13 | import os 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | from transformers.models.auto.tokenization_auto import tokenizer_class_from_name 19 | 20 | from typing import List, Dict 21 | from collections import defaultdict 22 | import warnings 23 | 24 | 25 | from .processor import AbstractTask 26 | 27 | main_dir = "/root/xtlv/data/sora_datasets/glue_datasets_from_dn" 28 | 29 | ##GLUE 30 | class COLA(AbstractTask): 31 | name = "cola" 32 | split_to_data_split = {"train": "train", 33 | "validation": "validation", 34 | "test": "validation"} 35 | 36 | def load_dataset(self, split): 37 | return datasets.load_from_disk(f"{main_dir}/cola")[split] 38 | 39 | 40 | class SST2(AbstractTask): 41 | name = "sst2" 42 | split_to_data_split = {"train": "train", 43 | "validation": "validation", 44 | "test": "validation"} 45 | 46 | 47 | 48 | 49 | class MRPC(AbstractTask): 50 | name = "mrpc" 51 | split_to_data_split = {"train": "train", 52 | "validation": "validation", 53 | "test": "validation"} 54 | 55 | 56 | class QQP(AbstractTask): 57 | name = "qqp" 58 | split_to_data_split = {"train": "train", 59 | "validation": "validation", 60 | "test": "validation"} 61 | 62 | class STSB(AbstractTask): 63 | name = "stsb" 64 | split_to_data_split = {"train": "train", 65 | "validation": "validation", 66 | "test": "validation"} 67 | 68 | 69 | 70 | class MNLI(AbstractTask): 71 | name = "mnli" 72 | split_to_data_split = {"train": "train", 73 | "validation": "validation_matched", 74 | "test": "validation_matched"} 75 | 76 | class MNLI_M(AbstractTask): 77 | name = "mnli" 78 | split_to_data_split = {"train": "train", 79 | "validation": "validation_matched", 80 | "test": "validation_matched"} 81 | 82 | class MNLI_MM(AbstractTask): 83 | name = "mnli" 84 | split_to_data_split = {"train": "train", 85 | "validation": "validation_mismatched", 86 | "test": "validation_mismatched"} 87 | 88 | 89 | class QNLI(AbstractTask): 90 | name = "qnli" 91 | split_to_data_split = {"train": "train", 92 | "validation": "validation", 93 | "test": "validation"} 94 | 95 | 96 | #Tested 97 | class RTE(AbstractTask): 98 | name = "rte" 99 | split_to_data_split = {"train": "train", 100 | "validation": "validation", 101 | "test": "validation"} 102 | 103 | class WNLI(AbstractTask): 104 | name = "wnli" 105 | split_to_data_split = {"train": "train", 106 | "validation": "validation", 107 | "test": "validation"} 108 | 109 | 110 | TASK_MAPPING = OrderedDict( 111 | [ 112 | ('mrpc', MRPC), 113 | ('cola', COLA), 114 | ('sst2', SST2), 115 | ('qnli', QNLI), 116 | ('rte', RTE), 117 | ('wnli', WNLI), 118 | ('mnli', MNLI), 119 | ('mnli-m', MNLI_M), 120 | ('mnli-mm', MNLI_MM), 121 | ('qqp', QQP), 122 | ('stsb', STSB), 123 | ] 124 | ) 125 | 126 | class AutoTask: 127 | @classmethod 128 | def get(self, 
task, config, data_args, seed=42): 129 | if task in TASK_MAPPING: 130 | return TASK_MAPPING[task](config, data_args, seed) 131 | raise ValueError( 132 | "Unrecognized task {} for AutoTask Model: {}.\n" 133 | "Task name should be one of {}.".format( 134 | ", ".join(c for c in TASK_MAPPING.keys()) 135 | ) 136 | ) 137 | 138 | if __name__ == "__main__": 139 | for name in TASK_MAPPING: 140 | print(name) 141 | task = AutoTask().get(name, None, None) 142 | print(task.split_train_to_make_test) 143 | print(task.split_valid_to_make_test) 144 | train_set = task.get("train", split_validation_test=True) 145 | print(train_set[0]) 146 | -------------------------------------------------------------------------------- /src/processor.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Callable, List, Mapping, Dict 3 | import datasets 4 | import logging 5 | import numpy as np 6 | import torch 7 | logger = logging.getLogger(__name__) 8 | 9 | main_dir = "/root/xtlv/data/sora_datasets/glue_datasets_from_dn" 10 | 11 | class AbstractTask(abc.ABC): 12 | name = NotImplemented 13 | config = NotImplemented 14 | prefix = NotImplemented 15 | split_map = None 16 | split_to_data_split: Mapping[str, str] = \ 17 | {"train": "train", "validation": "validation", "test": "test"} 18 | small_datasets_without_all_splits = ["cola", "wnli", "rte", "superglue-cb", "superglue-copa", "superglue-multirc", 19 | "superglue-wic", "superglue-wsc.fixed", "superglue-rte", "mrpc", "stsb", 20 | "superglue-boolq", "mnli"] 21 | large_data_without_all_splits = ["qqp", "qnli", "superglue-record", "sst2"] 22 | 23 | split_valid_to_make_test = True 24 | split_train_to_make_test = False 25 | keep_fields_after_preprocess = ["label"] # The fields that should be kept even after preprocessiing 26 | 27 | def __init__(self, config, data_args, seed=42, default_max_length=1): 28 | self.config = config 29 | self.seed = seed 30 | self.data_args = data_args 31 | 32 | self.default_max_length = default_max_length 33 | self.__post_init__() 34 | 35 | def __post_init__(self): 36 | self.split_valid_to_make_test = self.name in self.small_datasets_without_all_splits 37 | self.split_train_to_make_test = self.name in self.large_data_without_all_splits 38 | 39 | def load_dataset(self, split): 40 | tmp = datasets.load_from_disk(f"{main_dir}/{self.name}") 41 | 42 | return tmp[split] 43 | 44 | def check_n_obs(self, n_obs, total_size): 45 | if n_obs is not None and n_obs > total_size: 46 | n_obs = total_size 47 | logger.warning("n_obs is set to %s", n_obs) 48 | return n_obs 49 | 50 | def shuffled_indices(self, dataset): 51 | num_samples = len(dataset) 52 | generator = torch.Generator() 53 | generator.manual_seed(self.seed) 54 | return torch.randperm(num_samples, generator=generator).tolist() 55 | 56 | def subsample(self, dataset, n_obs=None, indices=None): 57 | """ 58 | Given a dataset returns the subsampled dataset. 59 | :param n_obs: the number of samples of the subsampled dataset. 60 | :param indices: indices to select the samples from, if not given, indices are computed 61 | from by shuffling the given dataset. 62 | :return: subsampled dataset. 
63 | """ 64 | num_samples = len(dataset) 65 | n_obs = self.check_n_obs(n_obs, num_samples) 66 | if indices is None: 67 | indices = self.shuffled_indices(dataset) 68 | indices = indices[:n_obs] 69 | return dataset.select(indices) 70 | 71 | 72 | def get_split_indices(self, split, dataset, validation_size): 73 | indices = self.shuffled_indices(dataset) 74 | if split == "validation": 75 | return indices[:validation_size] 76 | else: 77 | return indices[validation_size:] 78 | 79 | def preprocessor(self, example): 80 | return example 81 | 82 | def get(self, split, n_obs=None, split_validation_test=False): 83 | # For small datasets (n_samples < 10K) without test set, we divide validation set to 84 | # half, use one half as test set and one half as validation set. 85 | if split in ["eval", "dev", "valid"]: 86 | split = "validation" 87 | if split_validation_test and self.split_valid_to_make_test \ 88 | and split != "train": 89 | mapped_split = self.split_to_data_split["validation"] 90 | dataset = self.load_dataset(split=mapped_split) 91 | indices = self.get_split_indices(split, dataset, validation_size=len(dataset)//2) 92 | dataset = self.subsample(dataset, n_obs, indices) 93 | # For larger datasets (n_samples > 10K), we divide training set into 1K as 94 | # validation and the rest as training set, keeping the original validation 95 | # set as the test set. 96 | elif split_validation_test and self.split_train_to_make_test \ 97 | and split != "test": 98 | dataset = self.load_dataset(split="train") 99 | indices = self.get_split_indices(split, dataset, validation_size=1000) 100 | dataset = self.subsample(dataset, n_obs, indices) 101 | else: 102 | mapped_split = self.split_to_data_split[split] 103 | dataset = self.load_dataset(split=mapped_split) 104 | # shuffles the data and samples it. 
105 | if n_obs is not None: 106 | dataset = self.subsample(dataset, n_obs) 107 | 108 | this_method = getattr(self.__class__, 'preprocessor') 109 | base_method = getattr(AbstractTask, 'preprocessor') 110 | if this_method is not base_method: 111 | return dataset.map(self.preprocessor) 112 | else: 113 | return dataset -------------------------------------------------------------------------------- /src/sparse_optimizer_multiply_lr.py: -------------------------------------------------------------------------------- 1 | from transformers import AdamW 2 | from torch.optim import Optimizer 3 | import torch 4 | import math 5 | import numpy as np 6 | 7 | class SparseAdamW(AdamW): 8 | def __init__(self, 9 | sparse_lambda = 0.1, 10 | lambda_schedule = None, 11 | max_lambda = None, 12 | lambda_num = None, 13 | **kwargs 14 | ): 15 | super().__init__(**kwargs) 16 | self.sparse_lambda = sparse_lambda 17 | print(f"lambda in optimizer={self.sparse_lambda}") 18 | self.lambda_idx = 0 19 | self.lambda_schedule = lambda_schedule 20 | self._build_lambda_list(max_lambda, lambda_num) 21 | 22 | def _build_lambda_list(self, max_lambda, lambda_num): 23 | if self.lambda_schedule is None: 24 | self._lambdas = None 25 | return 26 | if isinstance(self.lambda_schedule, list): 27 | self._lambdas = self.lambda_schedule 28 | if self.lambda_schedule == "linear": 29 | assert max_lambda is not None and lambda_num is not None, print(f"when using linear schedule, max_lambda and lambda_num must be provided, but got ({max_lambda} and {lambda_num})") 30 | self._lambdas = np.linspace(self.sparse_lambda, max_lambda, lambda_num) 31 | elif self.lambda_schedule == "log_linear": 32 | assert max_lambda is not None and lambda_num is not None, print(f"when using log_linear schedule, max_lambda and lambda_num must be provided, but got ({max_lambda} and {lambda_num})") 33 | self._lambdas = np.log(np.linspace(np.exp(self.sparse_lambda), np.exp(max_lambda), lambda_num)) 34 | else: 35 | raise NotImplementedError 36 | 37 | def step_lambda(self): 38 | if self._lambdas is None: 39 | print("no lambda schedule is specified, do nothing") 40 | return 41 | else: 42 | if self.lambda_idx < len(self._lambdas) - 1: 43 | self.lambda_idx += 1 44 | self.sparse_lambda = self._lambdas[self.lambda_idx] 45 | print(f"use lambda={self.sparse_lambda}") 46 | else: 47 | print(f"reach end of self._lambdas, keep using lambda={self.sparse_lambda}") 48 | 49 | 50 | def step(self, closure = None): 51 | """ 52 | Performs a single optimization step. 53 | Arguments: 54 | closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss. 
55 | """ 56 | loss = None 57 | if closure is not None: 58 | loss = closure() 59 | 60 | for group in self.param_groups: 61 | for p in group["params"]: 62 | if p.grad is None: 63 | continue 64 | grad = p.grad.data 65 | if grad.is_sparse: 66 | raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") 67 | 68 | state = self.state[p] 69 | 70 | # State initialization 71 | if len(state) == 0: 72 | state["step"] = 0 73 | # Exponential moving average of gradient values 74 | state["exp_avg"] = torch.zeros_like(p.data) 75 | # Exponential moving average of squared gradient values 76 | state["exp_avg_sq"] = torch.zeros_like(p.data) 77 | 78 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 79 | beta1, beta2 = group["betas"] 80 | 81 | state["step"] += 1 82 | 83 | # Decay the first and second moment running average coefficient 84 | # In-place operations to update the averages at the same time 85 | exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1)) 86 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) 87 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 88 | 89 | step_size = group["lr"] 90 | if group["correct_bias"]: # No bias correction for Bert 91 | bias_correction1 = 1.0 - beta1 ** state["step"] 92 | bias_correction2 = 1.0 - beta2 ** state["step"] 93 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 94 | 95 | # Just adding the square of the weights to the loss function is *not* 96 | # the correct way of using L2 regularization/weight decay with Adam, 97 | # since that will interact with the m and v parameters in strange ways. 98 | # 99 | # Instead we want to decay the weights in a manner that doesn't interact 100 | # with the m/v parameters. This is equivalent to adding the square 101 | # of the weights to the loss with plain (non-momentum) SGD. 102 | # Add weight decay at the end (fixed version) 103 | 104 | # params with sparsity regularization do not need weight decay 105 | # still hard to decide: which quantity stands for $\eta$ in Adam? group['lr] or stepsize? 
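                # The update below combines the standard AdamW step with decoupled weight decay in
                # `to_add`, then (in this *_multiply_lr variant) applies a proximal soft-thresholding
                # step whose threshold is sparse_lambda scaled by the current learning rate,
                # zeroing out small gate entries.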
106 | to_add = torch.div(exp_avg, denom) * (-step_size) 107 | if group["weight_decay"] > 0.0: 108 | # p.data.add_(p.data, alpha=(-group["lr"] * group["weight_decay"])) 109 | to_add = to_add + (-group["lr"] * group["weight_decay"]) * p.data 110 | p.data.add_(to_add) 111 | 112 | if self.sparse_lambda > 0: 113 | p.data[p.data > self.sparse_lambda * group["lr"]] -= self.sparse_lambda * group["lr"] 114 | p.data[p.data < -self.sparse_lambda * group["lr"]] += self.sparse_lambda * group["lr"] 115 | p.data[abs(p.data) < self.sparse_lambda * group["lr"]] = 0.0 116 | print("in sparse optimizer lr=", group["lr"]) 117 | 118 | return loss 119 | -------------------------------------------------------------------------------- /src/sparse_optimizer.py: -------------------------------------------------------------------------------- 1 | from transformers import AdamW 2 | from torch.optim import Optimizer 3 | import torch 4 | import math 5 | import numpy as np 6 | 7 | class SparseAdamW(AdamW): 8 | def __init__(self, 9 | sparse_lambda = 0.1, 10 | lambda_schedule = None, 11 | max_lambda = None, 12 | lambda_num = None, 13 | **kwargs 14 | ): 15 | super().__init__(**kwargs) 16 | self.sparse_lambda = sparse_lambda 17 | print(f"lambda in optimizer={self.sparse_lambda}") 18 | self.lambda_idx = 0 19 | self.lambda_schedule = lambda_schedule 20 | self._build_lambda_list(max_lambda, lambda_num) 21 | 22 | def _build_lambda_list(self, max_lambda, lambda_num): 23 | if self.lambda_schedule is None: 24 | self._lambdas = None 25 | return 26 | if isinstance(self.lambda_schedule, list): 27 | self._lambdas = self.lambda_schedule 28 | if self.lambda_schedule == "linear": 29 | assert max_lambda is not None and lambda_num is not None, print(f"when using linear schedule, max_lambda and lambda_num must be provided, but got ({max_lambda} and {lambda_num})") 30 | self._lambdas = np.linspace(self.sparse_lambda, max_lambda, lambda_num) 31 | elif self.lambda_schedule == "log_linear": 32 | assert max_lambda is not None and lambda_num is not None, print(f"when using log_linear schedule, max_lambda and lambda_num must be provided, but got ({max_lambda} and {lambda_num})") 33 | self._lambdas = np.log(np.linspace(np.exp(self.sparse_lambda), np.exp(max_lambda), lambda_num)) 34 | elif self.lambda_schedule == "exp_linear": 35 | assert max_lambda is not None and lambda_num is not None, print(f"when using exp_linear schedule, max_lambda and lambda_num must be provided, but got ({max_lambda} and {lambda_num})") 36 | self._lambdas = np.exp(np.linspace(np.log(self.sparse_lambda), np.log(max_lambda), lambda_num)) 37 | else: 38 | raise NotImplementedError 39 | 40 | def step_lambda(self): 41 | if self._lambdas is None: 42 | print("no lambda schedule is specified, do nothing") 43 | return 44 | else: 45 | if self.lambda_idx < len(self._lambdas) - 1: 46 | self.lambda_idx += 1 47 | self.sparse_lambda = self._lambdas[self.lambda_idx] 48 | print(f"use lambda={self.sparse_lambda}") 49 | else: 50 | print(f"reach end of self._lambdas, keep using lambda={self.sparse_lambda}") 51 | 52 | 53 | def step(self, closure = None): 54 | """ 55 | Performs a single optimization step. 56 | Arguments: 57 | closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss. 
58 | """ 59 | loss = None 60 | if closure is not None: 61 | loss = closure() 62 | 63 | for group in self.param_groups: 64 | for p in group["params"]: 65 | if p.grad is None: 66 | continue 67 | grad = p.grad.data 68 | if grad.is_sparse: 69 | raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") 70 | 71 | state = self.state[p] 72 | 73 | # State initialization 74 | if len(state) == 0: 75 | state["step"] = 0 76 | # Exponential moving average of gradient values 77 | state["exp_avg"] = torch.zeros_like(p.data) 78 | # Exponential moving average of squared gradient values 79 | state["exp_avg_sq"] = torch.zeros_like(p.data) 80 | 81 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 82 | beta1, beta2 = group["betas"] 83 | 84 | state["step"] += 1 85 | 86 | # Decay the first and second moment running average coefficient 87 | # In-place operations to update the averages at the same time 88 | exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1)) 89 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) 90 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 91 | 92 | step_size = group["lr"] 93 | if group["correct_bias"]: # No bias correction for Bert 94 | bias_correction1 = 1.0 - beta1 ** state["step"] 95 | bias_correction2 = 1.0 - beta2 ** state["step"] 96 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 97 | 98 | # Just adding the square of the weights to the loss function is *not* 99 | # the correct way of using L2 regularization/weight decay with Adam, 100 | # since that will interact with the m and v parameters in strange ways. 101 | # 102 | # Instead we want to decay the weights in a manner that doesn't interact 103 | # with the m/v parameters. This is equivalent to adding the square 104 | # of the weights to the loss with plain (non-momentum) SGD. 105 | # Add weight decay at the end (fixed version) 106 | 107 | # params with sparsity regularization do not need weight decay 108 | # still hard to decide: which quantity stands for $\eta$ in Adam? group['lr] or stepsize? 109 | to_add = torch.div(exp_avg, denom) * (-step_size) 110 | if group["weight_decay"] > 0.0: 111 | # p.data.add_(p.data, alpha=(-group["lr"] * group["weight_decay"])) 112 | to_add = to_add + (-group["lr"] * group["weight_decay"]) * p.data 113 | p.data.add_(to_add) 114 | 115 | 116 | if self.sparse_lambda > 0: 117 | p.data[p.data > self.sparse_lambda] -= self.sparse_lambda 118 | p.data[p.data < -self.sparse_lambda] += self.sparse_lambda 119 | p.data[abs(p.data) < self.sparse_lambda] = 0.0 120 | 121 | return loss 122 | -------------------------------------------------------------------------------- /glue.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ GLUE benchmark metric. 
""" 15 | 16 | from scipy.stats import pearsonr, spearmanr 17 | from sklearn.metrics import f1_score, matthews_corrcoef 18 | 19 | import datasets 20 | 21 | 22 | _CITATION = """\ 23 | @inproceedings{wang2019glue, 24 | title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding}, 25 | author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.}, 26 | note={In the Proceedings of ICLR.}, 27 | year={2019} 28 | } 29 | """ 30 | 31 | _DESCRIPTION = """\ 32 | GLUE, the General Language Understanding Evaluation benchmark 33 | (https://gluebenchmark.com/) is a collection of resources for training, 34 | evaluating, and analyzing natural language understanding systems. 35 | """ 36 | 37 | _KWARGS_DESCRIPTION = """ 38 | Compute GLUE evaluation metric associated to each GLUE dataset. 39 | Args: 40 | predictions: list of predictions to score. 41 | Each translation should be tokenized into a list of tokens. 42 | references: list of lists of references for each translation. 43 | Each reference should be tokenized into a list of tokens. 44 | Returns: depending on the GLUE subset, one or several of: 45 | "accuracy": Accuracy 46 | "f1": F1 score 47 | "pearson": Pearson Correlation 48 | "spearmanr": Spearman Correlation 49 | "matthews_correlation": Matthew Correlation 50 | Examples: 51 | 52 | >>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"] 53 | >>> references = [0, 1] 54 | >>> predictions = [0, 1] 55 | >>> results = glue_metric.compute(predictions=predictions, references=references) 56 | >>> print(results) 57 | {'accuracy': 1.0} 58 | 59 | >>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp' 60 | >>> references = [0, 1] 61 | >>> predictions = [0, 1] 62 | >>> results = glue_metric.compute(predictions=predictions, references=references) 63 | >>> print(results) 64 | {'accuracy': 1.0, 'f1': 1.0} 65 | 66 | >>> glue_metric = datasets.load_metric('glue', 'stsb') 67 | >>> references = [0., 1., 2., 3., 4., 5.] 68 | >>> predictions = [0., 1., 2., 3., 4., 5.] 
69 | >>> results = glue_metric.compute(predictions=predictions, references=references) 70 | >>> print({"pearson": round(results["pearson"], 2), "spearmanr": round(results["spearmanr"], 2)}) 71 | {'pearson': 1.0, 'spearmanr': 1.0} 72 | 73 | >>> glue_metric = datasets.load_metric('glue', 'cola') 74 | >>> references = [0, 1] 75 | >>> predictions = [0, 1] 76 | >>> results = glue_metric.compute(predictions=predictions, references=references) 77 | >>> print(results) 78 | {'matthews_correlation': 1.0} 79 | """ 80 | 81 | 82 | def simple_accuracy(preds, labels): 83 | return float((preds == labels).mean()) 84 | 85 | 86 | def acc_and_f1(preds, labels): 87 | acc = simple_accuracy(preds, labels) 88 | f1 = float(f1_score(y_true=labels, y_pred=preds)) 89 | return { 90 | "accuracy": acc, 91 | "f1": f1, 92 | } 93 | 94 | 95 | def pearson_and_spearman(preds, labels): 96 | pearson_corr = float(pearsonr(preds, labels)[0]) 97 | spearman_corr = float(spearmanr(preds, labels)[0]) 98 | return { 99 | "pearson": pearson_corr, 100 | "spearmanr": spearman_corr, 101 | } 102 | 103 | 104 | @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 105 | class Glue(datasets.Metric): 106 | def _info(self): 107 | if self.config_name not in [ 108 | "sst2", 109 | "mnli", 110 | "mnli_mismatched", 111 | "mnli_matched", 112 | "cola", 113 | "stsb", 114 | "mrpc", 115 | "qqp", 116 | "qnli", 117 | "rte", 118 | "wnli", 119 | "hans", 120 | ]: 121 | raise KeyError( 122 | "You should supply a configuration name selected in " 123 | '["sst2", "mnli", "mnli_mismatched", "mnli_matched", ' 124 | '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]' 125 | ) 126 | return datasets.MetricInfo( 127 | description=_DESCRIPTION, 128 | citation=_CITATION, 129 | inputs_description=_KWARGS_DESCRIPTION, 130 | features=datasets.Features( 131 | { 132 | "predictions": datasets.Value("int64" if self.config_name != "stsb" else "float32"), 133 | "references": datasets.Value("int64" if self.config_name != "stsb" else "float32"), 134 | } 135 | ), 136 | codebase_urls=[], 137 | reference_urls=[], 138 | format="numpy", 139 | ) 140 | 141 | def _compute(self, predictions, references): 142 | if self.config_name == "cola": 143 | return {"matthews_correlation": matthews_corrcoef(references, predictions)} 144 | elif self.config_name == "stsb": 145 | return pearson_and_spearman(predictions, references) 146 | elif self.config_name in ["mrpc", "qqp"]: 147 | return acc_and_f1(predictions, references) 148 | elif self.config_name in ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]: 149 | return {"accuracy": simple_accuracy(predictions, references)} 150 | else: 151 | raise KeyError( 152 | "You should supply a configuration name selected in " 153 | '["sst2", "mnli", "mnli_mismatched", "mnli_matched", ' 154 | '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]' 155 | ) -------------------------------------------------------------------------------- /src/lora.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func 4 | from opendelta.utils.name_based_addressing import * 5 | from opendelta.basemodel import DeltaBase 6 | import torch.nn as nn 7 | from opendelta import BaseDeltaConfig 8 | import math 9 | from dataclasses import dataclass, field 10 | import torch 11 | 12 | """ 13 | implementation of sparse lora 14 | """ 15 | 16 | class 
LowRankLinear(nn.Module): 17 | # ------------------------------------------------------------------------------------------ 18 | # Copyright (c) Microsoft Corporation. All rights reserved. 19 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 20 | # ------------------------------------------------------------------------------------------ 21 | # copy from loralib and do some refactor 22 | def __init__(self, 23 | in_features, 24 | out_features, 25 | weight, 26 | r=8, 27 | lora_alpha=16, 28 | lora_dropout=0.0, 29 | ): 30 | super().__init__() 31 | self.r = r 32 | self.lora_alpha = lora_alpha 33 | self.lora_dropout = lora_dropout 34 | if lora_dropout > 0.: 35 | self.lora_dropout = nn.Dropout(p=lora_dropout) 36 | else: 37 | self.lora_dropout = lambda x: x 38 | if r > 0: 39 | self.lora_A = nn.Parameter(weight.new_zeros((r, in_features))) 40 | self.lora_B = nn.Parameter(weight.new_zeros((out_features, r))) 41 | self.gate = nn.Parameter(torch.randn(1, r)) 42 | self.scaling = self.lora_alpha / self.r 43 | nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) 44 | nn.init.zeros_(self.lora_B) 45 | 46 | def forward(self, x): 47 | return ((self.lora_dropout(x) @ self.lora_A.T).mul(self.gate) @ self.lora_B.T) * self.scaling 48 | 49 | @dataclass 50 | class LoraArguments: 51 | r: int = 8 52 | lora_alpha: int = 16 53 | lora_dropout: float = 0.0 54 | 55 | class LoraConfig(BaseDeltaConfig): 56 | r""" 57 | This is the configuration class to store the configuration of a :py:class:`~LoraModel` 58 | """ 59 | def __init__( 60 | self, 61 | lora_r=8, 62 | lora_alpha=16, 63 | lora_dropout=0.0, 64 | **kwargs 65 | ): 66 | super().__init__(**kwargs) 67 | arg_names = get_arg_names_inside_func(self.__init__) 68 | for arg_name in arg_names: 69 | if not hasattr(self, arg_name): # the arg has not been registered in parent config 70 | setattr(self, arg_name, locals()[arg_name]) 71 | 72 | 73 | class LoraModel(DeltaBase): 74 | r""" The implementation of `LoRA: Low-Rank Adaptation of Large Language Models `_ . 75 | Thanks for their `loralib `_. 76 | 77 | .. note:: 78 | In our implementation, we did not use loralib.linear to replace the linear layer of the backbone model. 79 | Instead, we insert a parallel module into the backbone. 80 | In other words, we treat :math:`(W + A^TB) X` as :math:`WX+ A^TBX`, and insert the :math:`A^TBX` as a parallel insertion module. 81 | If you want to use the original implementation, please refer to `lora_old.py` 82 | 83 | class attributes: 84 | - default_modified_modules = ['attn.q', 'attn.v'] According to the paper, they modify q and v matrix in the 85 | attention layer. However, other linears can also be modified, and may lead to better performance. 86 | 87 | .. note:: 88 | modified_modules should point to linear layer. We currently don't support broadcast to all linears in 89 | a module's child modules. 90 | 91 | - delta_type = "lora" 92 | 93 | 94 | Args: 95 | backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. 96 | lora_r (:obj:`int`, *optional*): the rank of the lora parameters. The smaller lora_r is , the fewer parameters lora has. 97 | lora_alpha (:obj:`int`, *optional*): A hyper-parameter to control the init scale of loralib.linear . 98 | lora_dropout (:obj:`float`, *optional*): The dropout rate in lora.linear. 
99 | modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only 100 | the implemented ones) 101 | unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen 102 | together with the prefix parameters. 103 | common_structure (:obj:`bool`): whether using name-based addressing with a common structure mapping. 104 | 105 | """ 106 | 107 | config_class = LoraConfig 108 | delta_type = "lora" 109 | default_modified_modules = ['attn@.q@', 'attn@.v@', 'attn@.k@', 'attn@.proj@', 'ff@.w1@', 'ff@.w2@'] 110 | _supported_backends = ['hf', 'bmt'] 111 | _need_pseudo_data = False 112 | def __init__(self, 113 | backbone_model: nn.Module, 114 | lora_r=8, 115 | lora_alpha=16, 116 | lora_dropout=0.0, 117 | modified_modules: Optional[List[str]] = None, 118 | unfrozen_modules: Optional[List[str]] = None, 119 | exclude_modules: Optional[List[str]] = None, 120 | common_structure: Optional[bool] = None, 121 | interactive_modify: Optional[Union[bool, int]] = False, 122 | ): 123 | DeltaBase.__init__(self, 124 | backbone_model, 125 | modified_modules=modified_modules, 126 | unfrozen_modules=unfrozen_modules, 127 | common_structure=common_structure, 128 | interactive_modify=interactive_modify, 129 | ) 130 | arg_names = get_arg_names_inside_func(self.__init__) 131 | for arg_name in arg_names: 132 | if not hasattr(self, arg_name): # not registered in parent class 133 | setattr(self, arg_name, locals()[arg_name]) 134 | 135 | self.delta_modules = nn.ModuleList() 136 | 137 | self.add_all_delta_to_backbone(self.backbone_model, 138 | self.modified_modules, 139 | ) 140 | 141 | 142 | def update_module(self, module: nn.Module, key: str): 143 | print("calling update module") 144 | parent_ref, child_name, child_ref = self.find_module(module, key) 145 | print("child ref:", child_ref) 146 | 147 | parallel_module = self.new_module_like(child_module=child_ref) 148 | print("parallel module:", parallel_module) 149 | self.insert_parallel_module(child_ref, delta_module=parallel_module, delta_name="lora") 150 | 151 | def _pseudo_data_to_instantiate(self, module): 152 | # no need to pass pseudo input, so overwrite it 153 | pass 154 | 155 | def new_module_like(self, child_module): 156 | if isinstance(child_module, nn.Linear): 157 | in_features, out_features = child_module.in_features, child_module.out_features 158 | new_module = LowRankLinear(in_features = in_features, 159 | out_features = out_features, 160 | weight = child_module.weight, 161 | r=self.lora_r, 162 | lora_alpha=self.lora_alpha, 163 | lora_dropout=self.lora_dropout) 164 | self.delta_modules.append(new_module) 165 | else: 166 | raise NotImplementedError 167 | return new_module 168 | -------------------------------------------------------------------------------- /run_glue_adapter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Finetuning the library models for sequence classification on GLUE.""" 17 | # You can also adapt this script on your own text classification task. Pointers for this are left as comments. 18 | 19 | import logging 20 | import os 21 | import wandb 22 | os.environ['WANDB_MODE'] = 'offline' 23 | import random 24 | import sys 25 | from dataclasses import dataclass, field 26 | from typing import Optional 27 | 28 | import datasets 29 | import numpy as np 30 | from datasets import load_dataset 31 | 32 | import transformers 33 | from transformers import ( 34 | AutoConfig, 35 | AutoModelForSequenceClassification, 36 | AutoTokenizer, 37 | DataCollatorWithPadding, 38 | EvalPrediction, 39 | HfArgumentParser, 40 | PretrainedConfig, 41 | Trainer, 42 | TrainingArguments, 43 | default_data_collator, 44 | set_seed, 45 | ) 46 | from transformers.trainer_utils import get_last_checkpoint 47 | from transformers.utils import check_min_version 48 | from transformers.utils.versions import require_version 49 | sys.path.append('../') 50 | from src.trainer import SparseTrainer 51 | from src.util import compute_trainable_sparse_param, create_optimizer_and_scheduler 52 | from src.sparse_optimizer import SparseAdamW 53 | from transformers import get_linear_schedule_with_warmup 54 | 55 | 56 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 57 | # check_min_version("4.24.0") 58 | 59 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") 60 | 61 | task_to_keys = { 62 | "cola": ("sentence", None), 63 | "mnli": ("premise", "hypothesis"), 64 | "mnli-m": ("premise", "hypothesis"), 65 | "mnli-mm": ("premise", "hypothesis"), 66 | "mrpc": ("sentence1", "sentence2"), 67 | "qnli": ("question", "sentence"), 68 | "qqp": ("question1", "question2"), 69 | "rte": ("sentence1", "sentence2"), 70 | "sst2": ("sentence", None), 71 | "stsb": ("sentence1", "sentence2"), 72 | "wnli": ("sentence1", "sentence2"), 73 | } 74 | 75 | task_to_best_metric = { 76 | "rte": "eval_accuracy", 77 | "mrpc": "eval_f1", 78 | "cola": "eval_matthews_correlation", 79 | "stsb": "eval_pearson", 80 | "sst2": "eval_accuracy", 81 | "qnli": "eval_accuracy", 82 | "mnli": "eval_accuracy", 83 | "mnli-m": "eval_accuracy", 84 | "mnli-mm": "eval_accuracy", 85 | "qqp": "eval_accuracy", 86 | } 87 | 88 | data_path = '/root/xtlv/data/sora_datasets/glue_datasets_from_dn/' 89 | 90 | logger = logging.getLogger(__name__) 91 | 92 | 93 | @dataclass 94 | class DataTrainingArguments: 95 | """ 96 | Arguments pertaining to what data we are going to input our model for training and eval. 97 | Using `HfArgumentParser` we can turn this class 98 | into argparse arguments to be able to specify them on 99 | the command line. 100 | """ 101 | 102 | task_name: Optional[str] = field( 103 | default=None, 104 | metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, 105 | ) 106 | dataset_name: Optional[str] = field( 107 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 108 | ) 109 | dataset_config_name: Optional[str] = field( 110 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 111 | ) 112 | max_seq_length: int = field( 113 | default=128, 114 | metadata={ 115 | "help": ( 116 | "The maximum total input sequence length after tokenization. 
Sequences longer " 117 | "than this will be truncated, sequences shorter will be padded." 118 | ) 119 | }, 120 | ) 121 | overwrite_cache: bool = field( 122 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} 123 | ) 124 | pad_to_max_length: bool = field( 125 | default=True, 126 | metadata={ 127 | "help": ( 128 | "Whether to pad all samples to `max_seq_length`. " 129 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 130 | ) 131 | }, 132 | ) 133 | max_train_samples: Optional[int] = field( 134 | default=None, 135 | metadata={ 136 | "help": ( 137 | "For debugging purposes or quicker training, truncate the number of training examples to this " 138 | "value if set." 139 | ) 140 | }, 141 | ) 142 | max_eval_samples: Optional[int] = field( 143 | default=None, 144 | metadata={ 145 | "help": ( 146 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 147 | "value if set." 148 | ) 149 | }, 150 | ) 151 | max_predict_samples: Optional[int] = field( 152 | default=None, 153 | metadata={ 154 | "help": ( 155 | "For debugging purposes or quicker training, truncate the number of prediction examples to this " 156 | "value if set." 157 | ) 158 | }, 159 | ) 160 | train_file: Optional[str] = field( 161 | default=None, metadata={"help": "A csv or a json file containing the training data."} 162 | ) 163 | validation_file: Optional[str] = field( 164 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 165 | ) 166 | test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) 167 | 168 | def __post_init__(self): 169 | if self.task_name is not None: 170 | self.task_name = self.task_name.lower() 171 | if self.task_name not in task_to_keys.keys(): 172 | raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) 173 | elif self.dataset_name is not None: 174 | pass 175 | elif self.train_file is None or self.validation_file is None: 176 | raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") 177 | else: 178 | train_extension = self.train_file.split(".")[-1] 179 | assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." 180 | validation_extension = self.validation_file.split(".")[-1] 181 | assert ( 182 | validation_extension == train_extension 183 | ), "`validation_file` should have the same extension (csv or json) as `train_file`." 184 | 185 | 186 | @dataclass 187 | class ModelArguments: 188 | """ 189 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
190 | """ 191 | 192 | model_name_or_path: str = field( 193 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 194 | ) 195 | config_name: Optional[str] = field( 196 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 197 | ) 198 | tokenizer_name: Optional[str] = field( 199 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 200 | ) 201 | cache_dir: Optional[str] = field( 202 | default=None, 203 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 204 | ) 205 | use_fast_tokenizer: bool = field( 206 | default=True, 207 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 208 | ) 209 | model_revision: str = field( 210 | default="main", 211 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 212 | ) 213 | use_auth_token: bool = field( 214 | default=False, 215 | metadata={ 216 | "help": ( 217 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 218 | "with private models)." 219 | ) 220 | }, 221 | ) 222 | ignore_mismatched_sizes: bool = field( 223 | default=False, 224 | metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, 225 | ) 226 | 227 | @dataclass 228 | class SparseArguments: 229 | sparse_lambda: Optional[float] = field( 230 | default=1e-3, metadata={"help": "loss penalty term for gate param"} 231 | ) 232 | sparse_lambda_2: Optional[float] = field( 233 | default=1e-3, metadata={"help": "clipping scale for gate param"} 234 | ) 235 | sparse_lr: Optional[float] = field( 236 | default=None, metadata={"help": "lr for gate parameter in sparse lora, default to same as learning rate for other parameters"} 237 | ) 238 | lora_r: Optional[int] = field( 239 | default=16, metadata={"help": "matrix rank in lora"} 240 | ) 241 | lambda_schedule: Optional[str] = field( 242 | default=None, metadata={"help": "scheduling of lambda_2, {linear, log_linear}"} 243 | ) 244 | max_lambda: Optional[float] = field( 245 | default=10, metadata={"help": "maximum value of lambda_2 in scheduling"} 246 | ) 247 | lambda_num: Optional[int] = field( 248 | default=10, metadata={"help": "total number of lambdas in scheduling"} 249 | ) 250 | bottleneck_dim: Optional[int] = field( 251 | default=12, metadata={"help": "matrix rank in lora"} 252 | ) 253 | 254 | @dataclass 255 | class SparseTrainingArguments(TrainingArguments): 256 | train_sparse: Optional[bool] = field( 257 | default=False, metadata={"help": "whether use sparse lora"} 258 | ) 259 | debug_mode: Optional[bool] = field( 260 | default=False, metadata={"help": "debug mode"} 261 | ) 262 | 263 | 264 | def main(): 265 | # See all possible arguments in src/transformers/training_args.py 266 | # or by passing the --help flag to this script. 267 | # We now keep distinct sets of args, for a cleaner separation of concerns. 268 | 269 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, SparseTrainingArguments, SparseArguments)) 270 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 271 | # If we pass only one argument to the script and it's the path to a json file, 272 | # let's parse it to get our arguments. 
273 | model_args, data_args, training_args, sparse_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 274 | else: 275 | model_args, data_args, training_args, sparse_args = parser.parse_args_into_dataclasses() 276 | 277 | 278 | task_name_for_get = data_args.task_name 279 | if "mnli" in data_args.task_name: 280 | data_args.task_name = "mnli" 281 | 282 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 283 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 284 | # send_example_telemetry("run_glue", model_args, data_args) 285 | training_args.metric_for_best_model = task_to_best_metric[data_args.task_name] 286 | 287 | if os.getenv("LOCAL_RANK"): 288 | training_args.local_rank = int(os.environ["LOCAL_RANK"]) 289 | else: 290 | training_args.local_rank = -1 291 | 292 | if training_args.train_sparse: 293 | if sparse_args.sparse_lr is None: 294 | sparse_args.sparse_lr = training_args.learning_rate 295 | if training_args.debug_mode: 296 | training_args.output_dir += "-debug" 297 | print(f"save model to {training_args.output_dir}") 298 | 299 | # Setup logging 300 | logging.basicConfig( 301 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 302 | datefmt="%m/%d/%Y %H:%M:%S", 303 | handlers=[logging.StreamHandler(sys.stdout)], 304 | ) 305 | 306 | log_level = training_args.get_process_log_level() 307 | logger.setLevel(log_level) 308 | datasets.utils.logging.set_verbosity(log_level) 309 | transformers.utils.logging.set_verbosity(log_level) 310 | transformers.utils.logging.enable_default_handler() 311 | transformers.utils.logging.enable_explicit_format() 312 | 313 | # Log on each process the small summary: 314 | logger.warning( 315 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 316 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 317 | ) 318 | logger.info(f"Training/evaluation parameters {training_args}") 319 | 320 | # Detecting last checkpoint. 321 | last_checkpoint = None 322 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 323 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 324 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 325 | raise ValueError( 326 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 327 | "Use --overwrite_output_dir to overcome." 328 | ) 329 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 330 | logger.info( 331 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 332 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 333 | ) 334 | 335 | # Set seed before initializing model. 336 | set_seed(training_args.seed) 337 | 338 | # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) 339 | # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). 340 | # 341 | # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the 342 | # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named 343 | # label if at least two columns are provided. 
344 | # 345 | # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this 346 | # single column. You can easily tweak this behavior (see below) 347 | # 348 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 349 | # download the dataset. 350 | if data_args.task_name is not None: 351 | # Downloading and loading a dataset from the hub. 352 | from datasets import load_from_disk 353 | from src.glue_tasks import AutoTask 354 | raw_datasets = load_from_disk(data_path + data_args.task_name) 355 | 356 | task = AutoTask().get(data_args.task_name, None, None) 357 | raw_datasets = { 358 | "train": task.get("train", split_validation_test=True), 359 | "validation": task.get("validation", split_validation_test=True), 360 | "test": task.get("test", split_validation_test=True) 361 | } 362 | from datasets import DatasetDict 363 | raw_datasets = DatasetDict(raw_datasets) 364 | 365 | elif data_args.dataset_name is not None: 366 | raise NotImplementedError 367 | 368 | else: 369 | # Loading a dataset from your local files. 370 | # CSV/JSON training and evaluation files are needed. 371 | data_files = {"train": data_args.train_file, "validation": data_args.validation_file} 372 | 373 | # Get the test dataset: you can provide your own CSV/JSON test file (see below) 374 | # when you use `do_predict` without specifying a GLUE benchmark task. 375 | if training_args.do_predict: 376 | if data_args.test_file is not None: 377 | train_extension = data_args.train_file.split(".")[-1] 378 | test_extension = data_args.test_file.split(".")[-1] 379 | assert ( 380 | test_extension == train_extension 381 | ), "`test_file` should have the same extension (csv or json) as `train_file`." 382 | data_files["test"] = data_args.test_file 383 | else: 384 | raise ValueError("Need either a GLUE task or a test file for `do_predict`.") 385 | 386 | for key in data_files.keys(): 387 | logger.info(f"load a local file for {key}: {data_files[key]}") 388 | 389 | if data_args.train_file.endswith(".csv"): 390 | # Loading a dataset from local csv files 391 | raw_datasets = load_dataset( 392 | "csv", 393 | data_files=data_files, 394 | cache_dir=model_args.cache_dir, 395 | use_auth_token=True if model_args.use_auth_token else None, 396 | ) 397 | else: 398 | # Loading a dataset from local json files 399 | raw_datasets = load_dataset( 400 | "json", 401 | data_files=data_files, 402 | cache_dir=model_args.cache_dir, 403 | use_auth_token=True if model_args.use_auth_token else None, 404 | ) 405 | # See more about loading any type of standard or custom dataset at 406 | # https://huggingface.co/docs/datasets/loading_datasets.html. 407 | 408 | # Labels 409 | if data_args.task_name is not None: 410 | is_regression = data_args.task_name == "stsb" 411 | if not is_regression: 412 | label_list = raw_datasets["train"].features["label"].names 413 | num_labels = len(label_list) 414 | else: 415 | num_labels = 1 416 | else: 417 | # Trying to have good defaults here, don't hesitate to tweak to your needs. 
418 | is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] 419 | if is_regression: 420 | num_labels = 1 421 | else: 422 | # A useful fast method: 423 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique 424 | label_list = raw_datasets["train"].unique("label") 425 | label_list.sort() # Let's sort it for determinism 426 | num_labels = len(label_list) 427 | 428 | # Load pretrained model and tokenizer 429 | # 430 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 431 | # download model & vocab. 432 | config = AutoConfig.from_pretrained( 433 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 434 | num_labels=num_labels, 435 | finetuning_task=data_args.task_name, 436 | cache_dir=model_args.cache_dir, 437 | revision=model_args.model_revision, 438 | use_auth_token=True if model_args.use_auth_token else None, 439 | ) 440 | tokenizer = AutoTokenizer.from_pretrained( 441 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 442 | cache_dir=model_args.cache_dir, 443 | use_fast=model_args.use_fast_tokenizer, 444 | revision=model_args.model_revision, 445 | use_auth_token=True if model_args.use_auth_token else None, 446 | ) 447 | model = AutoModelForSequenceClassification.from_pretrained( 448 | model_args.model_name_or_path, 449 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 450 | config=config, 451 | cache_dir=model_args.cache_dir, 452 | revision=model_args.model_revision, 453 | use_auth_token=True if model_args.use_auth_token else None, 454 | ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, 455 | ) 456 | 457 | from opendelta.delta_models.adapter import AdapterModel, AdapterConfig 458 | import json 459 | adapter_config = json.load(open("./adapter_config.json")) 460 | adapter_config["bottleneck_dim"] = sparse_args.bottleneck_dim 461 | adapter_config = AdapterConfig.from_dict(adapter_config) 462 | delta_model = AdapterModel.from_config(adapter_config, backbone_model=model) 463 | delta_model.freeze_module(set_state_dict = True) 464 | delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=False) 465 | 466 | 467 | 468 | # Preprocessing the raw_datasets 469 | if data_args.task_name is not None: 470 | sentence1_key, sentence2_key = task_to_keys[data_args.task_name] 471 | else: 472 | # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. 473 | non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] 474 | if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: 475 | sentence1_key, sentence2_key = "sentence1", "sentence2" 476 | else: 477 | if len(non_label_column_names) >= 2: 478 | sentence1_key, sentence2_key = non_label_column_names[:2] 479 | else: 480 | sentence1_key, sentence2_key = non_label_column_names[0], None 481 | 482 | # Padding strategy 483 | if data_args.pad_to_max_length: 484 | padding = "max_length" 485 | else: 486 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch 487 | padding = False 488 | 489 | # Some models have set the order of the labels to use, so let's make sure we do use it. 
490 |     label_to_id = None
491 |     if (
492 |         model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
493 |         and data_args.task_name is not None
494 |         and not is_regression
495 |     ):
496 |         # Some have all caps in their config, some don't.
497 |         label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
498 |         if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
499 |             label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
500 |         else:
501 |             logger.warning(
502 |                 "Your model seems to have been trained with labels, but they don't match the dataset: "
503 |                 f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
504 |                 "\nIgnoring the model labels as a result.",
505 |             )
506 |     elif data_args.task_name is None and not is_regression:
507 |         label_to_id = {v: i for i, v in enumerate(label_list)}
508 | 
509 |     if label_to_id is not None:
510 |         model.config.label2id = label_to_id
511 |         model.config.id2label = {id: label for label, id in config.label2id.items()}
512 |     elif data_args.task_name is not None and not is_regression:
513 |         model.config.label2id = {l: i for i, l in enumerate(label_list)}
514 |         model.config.id2label = {id: label for label, id in config.label2id.items()}
515 | 
516 |     if data_args.max_seq_length > tokenizer.model_max_length:
517 |         logger.warning(
518 |             f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
519 |             f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
520 |         )
521 |     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
522 | 
523 |     def preprocess_function(examples):
524 |         # Tokenize the texts
525 |         args = (
526 |             (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
527 |         )
528 |         result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)
529 | 
530 |         # Map labels to IDs (not necessary for GLUE tasks)
531 |         if label_to_id is not None and "label" in examples:
532 |             result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
533 |         return result
534 | 
535 |     with training_args.main_process_first(desc="dataset map pre-processing"):
536 |         raw_datasets = raw_datasets.map(
537 |             preprocess_function,
538 |             batched=True,
539 |             load_from_cache_file=not data_args.overwrite_cache,
540 |             desc="Running tokenizer on dataset",
541 |         )
542 |     if training_args.do_train:
543 |         if "train" not in raw_datasets:
544 |             raise ValueError("--do_train requires a train dataset")
545 |         train_dataset = raw_datasets["train"]
546 |         if data_args.max_train_samples is not None:
547 |             max_train_samples = min(len(train_dataset), data_args.max_train_samples)
548 |             train_dataset = train_dataset.select(range(max_train_samples))
549 | 
550 |     if training_args.do_eval:
551 |         if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
552 |             raise ValueError("--do_eval requires a validation dataset")
553 |         eval_dataset = raw_datasets["validation"]
554 |         if data_args.max_eval_samples is not None:
555 |             max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
556 |             eval_dataset = eval_dataset.select(range(max_eval_samples))
557 | 
558 |     if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
559 |         if "test" not in raw_datasets and "test_matched" not in raw_datasets:
560 |             raise ValueError("--do_predict requires a test
dataset") 561 | predict_dataset = raw_datasets["test"] 562 | if data_args.max_predict_samples is not None: 563 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 564 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 565 | 566 | # Log a few random samples from the training set: 567 | if training_args.do_train: 568 | for index in random.sample(range(len(train_dataset)), 3): 569 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 570 | 571 | # Get the metric function 572 | from datasets import load_metric 573 | if data_args.task_name is not None: 574 | metric = load_metric("./glue.py", data_args.task_name) 575 | else: 576 | metric = load_metric("accuracy") 577 | 578 | # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a 579 | # predictions and label_ids field) and has to return a dictionary string to float. 580 | def compute_metrics(mode, p: EvalPrediction): 581 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 582 | preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) 583 | if data_args.task_name is not None: 584 | result = metric.compute(predictions=preds, references=p.label_ids) 585 | if len(result) > 1: 586 | result["combined_score"] = np.mean(list(result.values())).item() 587 | return result 588 | elif is_regression: 589 | return {"mse": ((preds - p.label_ids) ** 2).mean().item()} 590 | else: 591 | return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} 592 | 593 | # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if 594 | # we already did the padding. 595 | if data_args.pad_to_max_length: 596 | data_collator = default_data_collator 597 | elif training_args.fp16: 598 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 599 | else: 600 | data_collator = None 601 | 602 | 603 | # Initialize our Trainer 604 | optimizer, lr_scheduler = create_optimizer_and_scheduler(training_args, model, num_training_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size))) 605 | sparse_optimizer = None 606 | sparse_scheduler = None 607 | if training_args.train_sparse: 608 | print("building sparse optimizer and scheduler") 609 | from src.trainer import GATE_PARAM_NAME 610 | valid_param_name = [] 611 | for n, p in model.named_parameters(): 612 | print(n) 613 | if GATE_PARAM_NAME in n: 614 | valid_param_name.append(n) 615 | print("valid param name:", valid_param_name) 616 | sparse_optimizer = SparseAdamW(sparse_lambda=sparse_args.sparse_lambda_2, lambda_schedule=sparse_args.lambda_schedule, max_lambda=sparse_args.max_lambda, lambda_num=sparse_args.lambda_num, params=[p for n, p in model.named_parameters() if GATE_PARAM_NAME in n and p.requires_grad], lr=sparse_args.sparse_lr) 617 | sparse_scheduler = get_linear_schedule_with_warmup(sparse_optimizer, 618 | num_warmup_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size)*training_args.warmup_ratio), 619 | num_training_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size))) 620 | 621 | if training_args.debug_mode: 622 | train_dataset = eval_dataset 623 | 624 | # Initialize our Trainer 625 | trainer = SparseTrainer( 626 | model=model, 627 | args=training_args, 628 | train_dataset=train_dataset if training_args.do_train else None, 629 | 
eval_dataset=eval_dataset if training_args.do_eval else None, 630 | compute_metrics=compute_metrics, 631 | tokenizer=tokenizer, 632 | data_collator=data_collator, 633 | optimizers = (optimizer, lr_scheduler), 634 | sparse_lambda = sparse_args.sparse_lambda, 635 | sparse_optimizer = (sparse_optimizer, sparse_scheduler) 636 | ) 637 | 638 | # Training 639 | if training_args.do_train: 640 | checkpoint = None 641 | if training_args.resume_from_checkpoint is not None: 642 | checkpoint = training_args.resume_from_checkpoint 643 | elif last_checkpoint is not None: 644 | checkpoint = last_checkpoint 645 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 646 | metrics = train_result.metrics 647 | max_train_samples = ( 648 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 649 | ) 650 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 651 | 652 | trainer.save_model() # Saves the tokenizer too for easy upload 653 | 654 | trainer.log_metrics("train", metrics) 655 | trainer.save_metrics("train", metrics) 656 | trainer.save_state() 657 | 658 | sparse_param, total_param = compute_trainable_sparse_param(model) 659 | 660 | 661 | # eval on 1000 samples train set 662 | train_dataset_for_eval = train_dataset.shuffle(seed=42).select(range(1000)) 663 | logger.info("*** Evaluate on training subset ***") 664 | metrics = trainer.evaluate(eval_dataset=train_dataset_for_eval, metric_key_prefix = "eval_train") 665 | trainer.log_metrics("eval_train", metrics) 666 | trainer.save_metrics("eval_train", metrics) 667 | BEST_TRAIN_METRIC = metrics["eval_train_" + "_".join(task_to_best_metric[data_args.task_name].split("_")[1:])] 668 | 669 | 670 | # Evaluation 671 | if training_args.do_eval: 672 | logger.info("*** Evaluate ***") 673 | 674 | # Loop to handle MNLI double evaluation (matched, mis-matched) 675 | tasks = [data_args.task_name] 676 | eval_datasets = [eval_dataset] 677 | 678 | 679 | for eval_dataset, task in zip(eval_datasets, tasks): 680 | metrics = trainer.evaluate(eval_dataset=eval_dataset) 681 | 682 | max_eval_samples = ( 683 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 684 | ) 685 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 686 | 687 | 688 | trainer.log_metrics("eval", metrics) 689 | trainer.save_metrics("eval", metrics) 690 | 691 | BEST_EVAL_METRIC = metrics[task_to_best_metric[data_args.task_name]] 692 | 693 | if training_args.do_predict: 694 | logger.info("*** Predict ***") 695 | 696 | # Loop to handle MNLI double evaluation (matched, mis-matched) 697 | tasks = [data_args.task_name] 698 | predict_datasets = [predict_dataset] 699 | 700 | 701 | for predict_dataset, task in zip(predict_datasets, tasks): 702 | metrics = trainer.evaluate(eval_dataset=predict_dataset) 703 | 704 | max_eval_samples = ( 705 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 706 | ) 707 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 708 | 709 | trainer.log_metrics("test", metrics) 710 | trainer.save_metrics("test", metrics) 711 | 712 | logger.info("***** Final Model ******\nAdapter bottleneck_dim: %d\nNumber of trainable full param: %d\nNumber of trainable sparse param: %d, Ratio: %.4f%%\n**********" % (adapter_config.bottleneck_dim, total_param, sparse_param, sparse_param / total_param * 100)) 713 | 714 | if __name__ == "__main__": 715 | main() 
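
Note: a minimal illustrative invocation of run_glue_adapter.py is sketched below. The task, model path, output directory, and hyperparameter values are placeholders rather than values taken from this repository; the script additionally expects adapter_config.json in the current working directory and the GLUE splits under the hard-coded data_path.

    python run_glue_adapter.py \
        --do_train --do_eval \
        --task_name rte \
        --model_name_or_path /path/to/DeBERTaV3_base \
        --tokenizer_name /path/to/DeBERTaV3_base \
        --bottleneck_dim 12 \
        --learning_rate 8e-4 \
        --num_train_epochs 10 \
        --per_device_train_batch_size 32 \
        --per_device_eval_batch_size 32 \
        --max_seq_length 256 \
        --output_dir results/rte_adapter_example \
        --overwrite_output_dir
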
-------------------------------------------------------------------------------- /run_glue_bitfit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Finetuning the library models for sequence classification on GLUE.""" 17 | # You can also adapt this script on your own text classification task. Pointers for this are left as comments. 18 | 19 | import logging 20 | import os 21 | import wandb 22 | os.environ['WANDB_MODE'] = 'offline' 23 | import random 24 | import sys 25 | from dataclasses import dataclass, field 26 | from typing import Optional 27 | 28 | import datasets 29 | import numpy as np 30 | from datasets import load_dataset 31 | 32 | import transformers 33 | from transformers import ( 34 | AutoConfig, 35 | AutoModelForSequenceClassification, 36 | AutoTokenizer, 37 | DataCollatorWithPadding, 38 | EvalPrediction, 39 | HfArgumentParser, 40 | PretrainedConfig, 41 | Trainer, 42 | TrainingArguments, 43 | default_data_collator, 44 | set_seed, 45 | ) 46 | from transformers.trainer_utils import get_last_checkpoint 47 | from transformers.utils import check_min_version 48 | from transformers.utils.versions import require_version 49 | sys.path.append('../') 50 | from src.trainer import SparseTrainer 51 | from src.util import compute_trainable_sparse_param, create_optimizer_and_scheduler 52 | from src.sparse_optimizer import SparseAdamW 53 | from transformers import get_linear_schedule_with_warmup 54 | 55 | 56 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 57 | # check_min_version("4.24.0") 58 | 59 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") 60 | 61 | task_to_keys = { 62 | "cola": ("sentence", None), 63 | "mnli": ("premise", "hypothesis"), 64 | "mnli-m": ("premise", "hypothesis"), 65 | "mnli-mm": ("premise", "hypothesis"), 66 | "mrpc": ("sentence1", "sentence2"), 67 | "qnli": ("question", "sentence"), 68 | "qqp": ("question1", "question2"), 69 | "rte": ("sentence1", "sentence2"), 70 | "sst2": ("sentence", None), 71 | "stsb": ("sentence1", "sentence2"), 72 | "wnli": ("sentence1", "sentence2"), 73 | } 74 | 75 | task_to_best_metric = { 76 | "rte": "eval_accuracy", 77 | "mrpc": "eval_f1", 78 | "cola": "eval_matthews_correlation", 79 | "stsb": "eval_pearson", 80 | "sst2": "eval_accuracy", 81 | "qnli": "eval_accuracy", 82 | "mnli": "eval_accuracy", 83 | "mnli-m": "eval_accuracy", 84 | "mnli-mm": "eval_accuracy", 85 | "qqp": "eval_accuracy", 86 | } 87 | 88 | data_path = '/root/xtlv/data/sora_datasets/glue_datasets_from_dn/' 89 | 90 | logger = logging.getLogger(__name__) 91 | 92 | 93 | @dataclass 94 | class DataTrainingArguments: 95 | """ 96 | Arguments pertaining to what data we are going to input our model for training and eval. 
97 | Using `HfArgumentParser` we can turn this class 98 | into argparse arguments to be able to specify them on 99 | the command line. 100 | """ 101 | 102 | task_name: Optional[str] = field( 103 | default=None, 104 | metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, 105 | ) 106 | dataset_name: Optional[str] = field( 107 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 108 | ) 109 | dataset_config_name: Optional[str] = field( 110 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 111 | ) 112 | max_seq_length: int = field( 113 | default=128, 114 | metadata={ 115 | "help": ( 116 | "The maximum total input sequence length after tokenization. Sequences longer " 117 | "than this will be truncated, sequences shorter will be padded." 118 | ) 119 | }, 120 | ) 121 | overwrite_cache: bool = field( 122 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} 123 | ) 124 | pad_to_max_length: bool = field( 125 | default=True, 126 | metadata={ 127 | "help": ( 128 | "Whether to pad all samples to `max_seq_length`. " 129 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 130 | ) 131 | }, 132 | ) 133 | max_train_samples: Optional[int] = field( 134 | default=None, 135 | metadata={ 136 | "help": ( 137 | "For debugging purposes or quicker training, truncate the number of training examples to this " 138 | "value if set." 139 | ) 140 | }, 141 | ) 142 | max_eval_samples: Optional[int] = field( 143 | default=None, 144 | metadata={ 145 | "help": ( 146 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 147 | "value if set." 148 | ) 149 | }, 150 | ) 151 | max_predict_samples: Optional[int] = field( 152 | default=None, 153 | metadata={ 154 | "help": ( 155 | "For debugging purposes or quicker training, truncate the number of prediction examples to this " 156 | "value if set." 157 | ) 158 | }, 159 | ) 160 | train_file: Optional[str] = field( 161 | default=None, metadata={"help": "A csv or a json file containing the training data."} 162 | ) 163 | validation_file: Optional[str] = field( 164 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 165 | ) 166 | test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) 167 | 168 | def __post_init__(self): 169 | if self.task_name is not None: 170 | self.task_name = self.task_name.lower() 171 | if self.task_name not in task_to_keys.keys(): 172 | raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) 173 | elif self.dataset_name is not None: 174 | pass 175 | elif self.train_file is None or self.validation_file is None: 176 | raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") 177 | else: 178 | train_extension = self.train_file.split(".")[-1] 179 | assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." 180 | validation_extension = self.validation_file.split(".")[-1] 181 | assert ( 182 | validation_extension == train_extension 183 | ), "`validation_file` should have the same extension (csv or json) as `train_file`." 184 | 185 | 186 | @dataclass 187 | class ModelArguments: 188 | """ 189 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
190 | """ 191 | 192 | model_name_or_path: str = field( 193 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 194 | ) 195 | config_name: Optional[str] = field( 196 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 197 | ) 198 | tokenizer_name: Optional[str] = field( 199 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 200 | ) 201 | cache_dir: Optional[str] = field( 202 | default=None, 203 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 204 | ) 205 | use_fast_tokenizer: bool = field( 206 | default=True, 207 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 208 | ) 209 | model_revision: str = field( 210 | default="main", 211 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 212 | ) 213 | use_auth_token: bool = field( 214 | default=False, 215 | metadata={ 216 | "help": ( 217 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 218 | "with private models)." 219 | ) 220 | }, 221 | ) 222 | ignore_mismatched_sizes: bool = field( 223 | default=False, 224 | metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, 225 | ) 226 | 227 | @dataclass 228 | class SparseArguments: 229 | sparse_lambda: Optional[float] = field( 230 | default=1e-3, metadata={"help": "loss penalty term for gate param"} 231 | ) 232 | sparse_lambda_2: Optional[float] = field( 233 | default=1e-3, metadata={"help": "clipping scale for gate param"} 234 | ) 235 | sparse_lr: Optional[float] = field( 236 | default=None, metadata={"help": "lr for gate parameter in sparse lora, default to same as learning rate for other parameters"} 237 | ) 238 | lora_r: Optional[int] = field( 239 | default=16, metadata={"help": "matrix rank in lora"} 240 | ) 241 | lambda_schedule: Optional[str] = field( 242 | default=None, metadata={"help": "scheduling of lambda_2, {linear, log_linear}"} 243 | ) 244 | max_lambda: Optional[float] = field( 245 | default=10, metadata={"help": "maximum value of lambda_2 in scheduling"} 246 | ) 247 | lambda_num: Optional[int] = field( 248 | default=10, metadata={"help": "total number of lambdas in scheduling"} 249 | ) 250 | 251 | bottleneck_dim: Optional[int] = field( 252 | default=12, metadata={"help": "matrix rank in lora"} 253 | ) 254 | 255 | @dataclass 256 | class SparseTrainingArguments(TrainingArguments): 257 | train_sparse: Optional[bool] = field( 258 | default=False, metadata={"help": "whether use sparse lora"} 259 | ) 260 | debug_mode: Optional[bool] = field( 261 | default=False, metadata={"help": "debug mode"} 262 | ) 263 | 264 | 265 | def main(): 266 | # See all possible arguments in src/transformers/training_args.py 267 | # or by passing the --help flag to this script. 268 | # We now keep distinct sets of args, for a cleaner separation of concerns. 269 | 270 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, SparseTrainingArguments, SparseArguments)) 271 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 272 | # If we pass only one argument to the script and it's the path to a json file, 273 | # let's parse it to get our arguments. 
274 | model_args, data_args, training_args, sparse_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 275 | else: 276 | model_args, data_args, training_args, sparse_args = parser.parse_args_into_dataclasses() 277 | 278 | 279 | task_name_for_get = data_args.task_name 280 | if "mnli" in data_args.task_name: 281 | data_args.task_name = "mnli" 282 | 283 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 284 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 285 | # send_example_telemetry("run_glue", model_args, data_args) 286 | training_args.metric_for_best_model = task_to_best_metric[data_args.task_name] 287 | 288 | if os.getenv("LOCAL_RANK"): 289 | training_args.local_rank = int(os.environ["LOCAL_RANK"]) 290 | else: 291 | training_args.local_rank = -1 292 | 293 | if training_args.train_sparse: 294 | if sparse_args.sparse_lr is None: 295 | sparse_args.sparse_lr = training_args.learning_rate 296 | if training_args.debug_mode: 297 | training_args.output_dir += "-debug" 298 | print(f"save model to {training_args.output_dir}") 299 | 300 | # Setup logging 301 | logging.basicConfig( 302 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 303 | datefmt="%m/%d/%Y %H:%M:%S", 304 | handlers=[logging.StreamHandler(sys.stdout)], 305 | ) 306 | 307 | log_level = training_args.get_process_log_level() 308 | logger.setLevel(log_level) 309 | datasets.utils.logging.set_verbosity(log_level) 310 | transformers.utils.logging.set_verbosity(log_level) 311 | transformers.utils.logging.enable_default_handler() 312 | transformers.utils.logging.enable_explicit_format() 313 | 314 | # Log on each process the small summary: 315 | logger.warning( 316 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 317 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 318 | ) 319 | logger.info(f"Training/evaluation parameters {training_args}") 320 | 321 | # Detecting last checkpoint. 322 | last_checkpoint = None 323 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 324 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 325 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 326 | raise ValueError( 327 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 328 | "Use --overwrite_output_dir to overcome." 329 | ) 330 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 331 | logger.info( 332 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 333 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 334 | ) 335 | 336 | # Set seed before initializing model. 337 | set_seed(training_args.seed) 338 | 339 | # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) 340 | # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). 341 | # 342 | # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the 343 | # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named 344 | # label if at least two columns are provided. 
345 | # 346 | # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this 347 | # single column. You can easily tweak this behavior (see below) 348 | # 349 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 350 | # download the dataset. 351 | if data_args.task_name is not None: 352 | # Downloading and loading a dataset from the hub. 353 | from datasets import load_from_disk 354 | from src.glue_tasks import AutoTask 355 | raw_datasets = load_from_disk(data_path + data_args.task_name) 356 | 357 | task = AutoTask().get(data_args.task_name, None, None) 358 | raw_datasets = { 359 | "train": task.get("train", split_validation_test=True), 360 | "validation": task.get("validation", split_validation_test=True), 361 | "test": task.get("test", split_validation_test=True) 362 | } 363 | from datasets import DatasetDict 364 | raw_datasets = DatasetDict(raw_datasets) 365 | 366 | elif data_args.dataset_name is not None: 367 | raise NotImplementedError 368 | 369 | else: 370 | # Loading a dataset from your local files. 371 | # CSV/JSON training and evaluation files are needed. 372 | data_files = {"train": data_args.train_file, "validation": data_args.validation_file} 373 | 374 | # Get the test dataset: you can provide your own CSV/JSON test file (see below) 375 | # when you use `do_predict` without specifying a GLUE benchmark task. 376 | if training_args.do_predict: 377 | if data_args.test_file is not None: 378 | train_extension = data_args.train_file.split(".")[-1] 379 | test_extension = data_args.test_file.split(".")[-1] 380 | assert ( 381 | test_extension == train_extension 382 | ), "`test_file` should have the same extension (csv or json) as `train_file`." 383 | data_files["test"] = data_args.test_file 384 | else: 385 | raise ValueError("Need either a GLUE task or a test file for `do_predict`.") 386 | 387 | for key in data_files.keys(): 388 | logger.info(f"load a local file for {key}: {data_files[key]}") 389 | 390 | if data_args.train_file.endswith(".csv"): 391 | # Loading a dataset from local csv files 392 | raw_datasets = load_dataset( 393 | "csv", 394 | data_files=data_files, 395 | cache_dir=model_args.cache_dir, 396 | use_auth_token=True if model_args.use_auth_token else None, 397 | ) 398 | else: 399 | # Loading a dataset from local json files 400 | raw_datasets = load_dataset( 401 | "json", 402 | data_files=data_files, 403 | cache_dir=model_args.cache_dir, 404 | use_auth_token=True if model_args.use_auth_token else None, 405 | ) 406 | # See more about loading any type of standard or custom dataset at 407 | # https://huggingface.co/docs/datasets/loading_datasets.html. 408 | 409 | # Labels 410 | if data_args.task_name is not None: 411 | is_regression = data_args.task_name == "stsb" 412 | if not is_regression: 413 | label_list = raw_datasets["train"].features["label"].names 414 | num_labels = len(label_list) 415 | else: 416 | num_labels = 1 417 | else: 418 | # Trying to have good defaults here, don't hesitate to tweak to your needs. 
419 | is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] 420 | if is_regression: 421 | num_labels = 1 422 | else: 423 | # A useful fast method: 424 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique 425 | label_list = raw_datasets["train"].unique("label") 426 | label_list.sort() # Let's sort it for determinism 427 | num_labels = len(label_list) 428 | 429 | # Load pretrained model and tokenizer 430 | # 431 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 432 | # download model & vocab. 433 | config = AutoConfig.from_pretrained( 434 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 435 | num_labels=num_labels, 436 | finetuning_task=data_args.task_name, 437 | cache_dir=model_args.cache_dir, 438 | revision=model_args.model_revision, 439 | use_auth_token=True if model_args.use_auth_token else None, 440 | ) 441 | tokenizer = AutoTokenizer.from_pretrained( 442 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 443 | cache_dir=model_args.cache_dir, 444 | use_fast=model_args.use_fast_tokenizer, 445 | revision=model_args.model_revision, 446 | use_auth_token=True if model_args.use_auth_token else None, 447 | ) 448 | model = AutoModelForSequenceClassification.from_pretrained( 449 | model_args.model_name_or_path, 450 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 451 | config=config, 452 | cache_dir=model_args.cache_dir, 453 | revision=model_args.model_revision, 454 | use_auth_token=True if model_args.use_auth_token else None, 455 | ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, 456 | ) 457 | 458 | from opendelta.delta_models.bitfit import BitFitConfig, BitFitModel 459 | import json 460 | bitfit_config = json.load(open("./bitfit_config.json")) 461 | bitfit_config = BitFitConfig.from_dict(bitfit_config) 462 | delta_model = BitFitModel.from_config(bitfit_config, backbone_model=model) 463 | delta_model.freeze_module(set_state_dict = True) 464 | delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=False) 465 | 466 | 467 | 468 | # Preprocessing the raw_datasets 469 | if data_args.task_name is not None: 470 | sentence1_key, sentence2_key = task_to_keys[data_args.task_name] 471 | else: 472 | # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. 473 | non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] 474 | if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: 475 | sentence1_key, sentence2_key = "sentence1", "sentence2" 476 | else: 477 | if len(non_label_column_names) >= 2: 478 | sentence1_key, sentence2_key = non_label_column_names[:2] 479 | else: 480 | sentence1_key, sentence2_key = non_label_column_names[0], None 481 | 482 | # Padding strategy 483 | if data_args.pad_to_max_length: 484 | padding = "max_length" 485 | else: 486 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch 487 | padding = False 488 | 489 | # Some models have set the order of the labels to use, so let's make sure we do use it. 490 | label_to_id = None 491 | if ( 492 | model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id 493 | and data_args.task_name is not None 494 | and not is_regression 495 | ): 496 | # Some have all caps in their config, some don't. 
497 |         label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
498 |         if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
499 |             label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
500 |         else:
501 |             logger.warning(
502 |                 "Your model seems to have been trained with labels, but they don't match the dataset: "
503 |                 f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
504 |                 "\nIgnoring the model labels as a result.",
505 |             )
506 |     elif data_args.task_name is None and not is_regression:
507 |         label_to_id = {v: i for i, v in enumerate(label_list)}
508 | 
509 |     if label_to_id is not None:
510 |         model.config.label2id = label_to_id
511 |         model.config.id2label = {id: label for label, id in config.label2id.items()}
512 |     elif data_args.task_name is not None and not is_regression:
513 |         model.config.label2id = {l: i for i, l in enumerate(label_list)}
514 |         model.config.id2label = {id: label for label, id in config.label2id.items()}
515 | 
516 |     if data_args.max_seq_length > tokenizer.model_max_length:
517 |         logger.warning(
518 |             f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
519 |             f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
520 |         )
521 |     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
522 | 
523 |     def preprocess_function(examples):
524 |         # Tokenize the texts
525 |         args = (
526 |             (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
527 |         )
528 |         result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)
529 | 
530 |         # Map labels to IDs (not necessary for GLUE tasks)
531 |         if label_to_id is not None and "label" in examples:
532 |             result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
533 |         return result
534 | 
535 |     with training_args.main_process_first(desc="dataset map pre-processing"):
536 |         raw_datasets = raw_datasets.map(
537 |             preprocess_function,
538 |             batched=True,
539 |             load_from_cache_file=not data_args.overwrite_cache,
540 |             desc="Running tokenizer on dataset",
541 |         )
542 |     if training_args.do_train:
543 |         if "train" not in raw_datasets:
544 |             raise ValueError("--do_train requires a train dataset")
545 |         train_dataset = raw_datasets["train"]
546 |         if data_args.max_train_samples is not None:
547 |             max_train_samples = min(len(train_dataset), data_args.max_train_samples)
548 |             train_dataset = train_dataset.select(range(max_train_samples))
549 | 
550 |     if training_args.do_eval:
551 |         if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
552 |             raise ValueError("--do_eval requires a validation dataset")
553 |         eval_dataset = raw_datasets["validation"]
554 |         if data_args.max_eval_samples is not None:
555 |             max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
556 |             eval_dataset = eval_dataset.select(range(max_eval_samples))
557 | 
558 |     if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
559 |         if "test" not in raw_datasets and "test_matched" not in raw_datasets:
560 |             raise ValueError("--do_predict requires a test dataset")
561 |         predict_dataset = raw_datasets["test"]
562 |         if data_args.max_predict_samples is not None:
563 |             max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
564 |             predict_dataset =
predict_dataset.select(range(max_predict_samples)) 565 | 566 | # Log a few random samples from the training set: 567 | if training_args.do_train: 568 | for index in random.sample(range(len(train_dataset)), 3): 569 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 570 | 571 | # Get the metric function 572 | from datasets import load_metric 573 | if data_args.task_name is not None: 574 | metric = load_metric("./glue.py", data_args.task_name) 575 | else: 576 | metric = load_metric("accuracy") 577 | 578 | # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a 579 | # predictions and label_ids field) and has to return a dictionary string to float. 580 | def compute_metrics(mode, p: EvalPrediction): 581 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 582 | preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) 583 | if data_args.task_name is not None: 584 | result = metric.compute(predictions=preds, references=p.label_ids) 585 | if len(result) > 1: 586 | result["combined_score"] = np.mean(list(result.values())).item() 587 | return result 588 | elif is_regression: 589 | return {"mse": ((preds - p.label_ids) ** 2).mean().item()} 590 | else: 591 | return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} 592 | 593 | # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if 594 | # we already did the padding. 595 | if data_args.pad_to_max_length: 596 | data_collator = default_data_collator 597 | elif training_args.fp16: 598 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 599 | else: 600 | data_collator = None 601 | 602 | # Initialize our Trainer 603 | optimizer, lr_scheduler = create_optimizer_and_scheduler(training_args, model, num_training_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size))) 604 | sparse_optimizer = None 605 | sparse_scheduler = None 606 | if training_args.train_sparse: 607 | print("building sparse optimizer and scheduler") 608 | from src.trainer import GATE_PARAM_NAME 609 | valid_param_name = [] 610 | for n, p in model.named_parameters(): 611 | print(n) 612 | if GATE_PARAM_NAME in n: 613 | valid_param_name.append(n) 614 | print("valid param name:", valid_param_name) 615 | sparse_optimizer = SparseAdamW(sparse_lambda=sparse_args.sparse_lambda_2, lambda_schedule=sparse_args.lambda_schedule, max_lambda=sparse_args.max_lambda, lambda_num=sparse_args.lambda_num, params=[p for n, p in model.named_parameters() if GATE_PARAM_NAME in n and p.requires_grad], lr=sparse_args.sparse_lr) 616 | sparse_scheduler = get_linear_schedule_with_warmup(sparse_optimizer, 617 | num_warmup_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size)*training_args.warmup_ratio), 618 | num_training_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size))) 619 | 620 | if training_args.debug_mode: 621 | train_dataset = eval_dataset 622 | 623 | # Initialize our Trainer 624 | trainer = SparseTrainer( 625 | model=model, 626 | args=training_args, 627 | train_dataset=train_dataset if training_args.do_train else None, 628 | eval_dataset=eval_dataset if training_args.do_eval else None, 629 | compute_metrics=compute_metrics, 630 | tokenizer=tokenizer, 631 | data_collator=data_collator, 632 | optimizers = (optimizer, lr_scheduler), 633 | sparse_lambda = 
sparse_args.sparse_lambda, 634 | sparse_optimizer = (sparse_optimizer, sparse_scheduler) 635 | ) 636 | 637 | # Training 638 | if training_args.do_train: 639 | checkpoint = None 640 | if training_args.resume_from_checkpoint is not None: 641 | checkpoint = training_args.resume_from_checkpoint 642 | elif last_checkpoint is not None: 643 | checkpoint = last_checkpoint 644 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 645 | metrics = train_result.metrics 646 | max_train_samples = ( 647 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 648 | ) 649 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 650 | 651 | trainer.save_model() # Saves the tokenizer too for easy upload 652 | 653 | trainer.log_metrics("train", metrics) 654 | trainer.save_metrics("train", metrics) 655 | trainer.save_state() 656 | 657 | sparse_param, total_param = compute_trainable_sparse_param(model) 658 | 659 | 660 | # eval on 1000 samples train set 661 | train_dataset_for_eval = train_dataset.shuffle(seed=42).select(range(1000)) 662 | logger.info("*** Evaluate on training subset ***") 663 | metrics = trainer.evaluate(eval_dataset=train_dataset_for_eval, metric_key_prefix = "eval_train") 664 | trainer.log_metrics("eval_train", metrics) 665 | trainer.save_metrics("eval_train", metrics) 666 | BEST_TRAIN_METRIC = metrics["eval_train_" + "_".join(task_to_best_metric[data_args.task_name].split("_")[1:])] 667 | 668 | 669 | # Evaluation 670 | if training_args.do_eval: 671 | logger.info("*** Evaluate ***") 672 | 673 | # Loop to handle MNLI double evaluation (matched, mis-matched) 674 | tasks = [data_args.task_name] 675 | eval_datasets = [eval_dataset] 676 | 677 | 678 | for eval_dataset, task in zip(eval_datasets, tasks): 679 | metrics = trainer.evaluate(eval_dataset=eval_dataset) 680 | 681 | max_eval_samples = ( 682 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 683 | ) 684 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 685 | 686 | 687 | trainer.log_metrics("eval", metrics) 688 | # trainer.save_metrics("eval", combined if task is not None and "mnli" in task else metrics) 689 | trainer.save_metrics("eval", metrics) 690 | 691 | BEST_EVAL_METRIC = metrics[task_to_best_metric[data_args.task_name]] 692 | 693 | if training_args.do_predict: 694 | logger.info("*** Predict ***") 695 | 696 | # Loop to handle MNLI double evaluation (matched, mis-matched) 697 | tasks = [data_args.task_name] 698 | predict_datasets = [predict_dataset] 699 | 700 | 701 | for predict_dataset, task in zip(predict_datasets, tasks): 702 | metrics = trainer.evaluate(eval_dataset=predict_dataset) 703 | 704 | max_eval_samples = ( 705 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 706 | ) 707 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 708 | 709 | 710 | trainer.log_metrics("test", metrics) 711 | 712 | trainer.save_metrics("test", metrics) 713 | 714 | logger.info("***** Final Model ******\nNumber of trainable full param: %d\nNumber of trainable sparse param: %d, Ratio: %.4f%%\n**********" % (total_param, sparse_param, sparse_param / total_param * 100)) 715 | 716 | if __name__ == "__main__": 717 | main() -------------------------------------------------------------------------------- /run_glue_finetune.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 
The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Finetuning the library models for sequence classification on GLUE.""" 17 | # You can also adapt this script on your own text classification task. Pointers for this are left as comments. 18 | 19 | import logging 20 | import os 21 | import wandb 22 | os.environ['WANDB_MODE'] = 'offline' 23 | import random 24 | import sys 25 | from dataclasses import dataclass, field 26 | from typing import Optional 27 | 28 | import datasets 29 | import numpy as np 30 | from datasets import load_dataset 31 | 32 | import transformers 33 | from transformers import ( 34 | AutoConfig, 35 | AutoModelForSequenceClassification, 36 | AutoTokenizer, 37 | DataCollatorWithPadding, 38 | EvalPrediction, 39 | HfArgumentParser, 40 | PretrainedConfig, 41 | Trainer, 42 | TrainingArguments, 43 | default_data_collator, 44 | set_seed, 45 | ) 46 | from transformers.trainer_utils import get_last_checkpoint 47 | from transformers.utils import check_min_version 48 | from transformers.utils.versions import require_version 49 | sys.path.append('../') 50 | from src.trainer import SparseTrainer 51 | from src.util import compute_trainable_sparse_param, create_optimizer_and_scheduler 52 | from src.sparse_optimizer import SparseAdamW 53 | from transformers import get_linear_schedule_with_warmup 54 | 55 | 56 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 57 | # check_min_version("4.24.0") 58 | 59 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") 60 | 61 | task_to_keys = { 62 | "cola": ("sentence", None), 63 | "mnli": ("premise", "hypothesis"), 64 | "mnli-m": ("premise", "hypothesis"), 65 | "mnli-mm": ("premise", "hypothesis"), 66 | "mrpc": ("sentence1", "sentence2"), 67 | "qnli": ("question", "sentence"), 68 | "qqp": ("question1", "question2"), 69 | "rte": ("sentence1", "sentence2"), 70 | "sst2": ("sentence", None), 71 | "stsb": ("sentence1", "sentence2"), 72 | "wnli": ("sentence1", "sentence2"), 73 | } 74 | 75 | task_to_best_metric = { 76 | "rte": "eval_accuracy", 77 | "mrpc": "eval_f1", 78 | "cola": "eval_matthews_correlation", 79 | "stsb": "eval_pearson", 80 | "sst2": "eval_accuracy", 81 | "qnli": "eval_accuracy", 82 | "mnli": "eval_accuracy", 83 | "mnli-m": "eval_accuracy", 84 | "mnli-mm": "eval_accuracy", 85 | "qqp": "eval_accuracy", 86 | } 87 | 88 | data_path = '/root/xtlv/data/sora_datasets/glue_datasets_from_dn/' 89 | 90 | logger = logging.getLogger(__name__) 91 | 92 | 93 | @dataclass 94 | class DataTrainingArguments: 95 | """ 96 | Arguments pertaining to what data we are going to input our model for training and eval. 97 | Using `HfArgumentParser` we can turn this class 98 | into argparse arguments to be able to specify them on 99 | the command line. 
100 | """ 101 | 102 | task_name: Optional[str] = field( 103 | default=None, 104 | metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, 105 | ) 106 | dataset_name: Optional[str] = field( 107 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 108 | ) 109 | dataset_config_name: Optional[str] = field( 110 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 111 | ) 112 | max_seq_length: int = field( 113 | default=128, 114 | metadata={ 115 | "help": ( 116 | "The maximum total input sequence length after tokenization. Sequences longer " 117 | "than this will be truncated, sequences shorter will be padded." 118 | ) 119 | }, 120 | ) 121 | overwrite_cache: bool = field( 122 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} 123 | ) 124 | pad_to_max_length: bool = field( 125 | default=True, 126 | metadata={ 127 | "help": ( 128 | "Whether to pad all samples to `max_seq_length`. " 129 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 130 | ) 131 | }, 132 | ) 133 | max_train_samples: Optional[int] = field( 134 | default=None, 135 | metadata={ 136 | "help": ( 137 | "For debugging purposes or quicker training, truncate the number of training examples to this " 138 | "value if set." 139 | ) 140 | }, 141 | ) 142 | max_eval_samples: Optional[int] = field( 143 | default=None, 144 | metadata={ 145 | "help": ( 146 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 147 | "value if set." 148 | ) 149 | }, 150 | ) 151 | max_predict_samples: Optional[int] = field( 152 | default=None, 153 | metadata={ 154 | "help": ( 155 | "For debugging purposes or quicker training, truncate the number of prediction examples to this " 156 | "value if set." 157 | ) 158 | }, 159 | ) 160 | train_file: Optional[str] = field( 161 | default=None, metadata={"help": "A csv or a json file containing the training data."} 162 | ) 163 | validation_file: Optional[str] = field( 164 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 165 | ) 166 | test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) 167 | 168 | def __post_init__(self): 169 | if self.task_name is not None: 170 | self.task_name = self.task_name.lower() 171 | if self.task_name not in task_to_keys.keys(): 172 | raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) 173 | elif self.dataset_name is not None: 174 | pass 175 | elif self.train_file is None or self.validation_file is None: 176 | raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") 177 | else: 178 | train_extension = self.train_file.split(".")[-1] 179 | assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." 180 | validation_extension = self.validation_file.split(".")[-1] 181 | assert ( 182 | validation_extension == train_extension 183 | ), "`validation_file` should have the same extension (csv or json) as `train_file`." 184 | 185 | 186 | @dataclass 187 | class ModelArguments: 188 | """ 189 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
190 | """ 191 | 192 | model_name_or_path: str = field( 193 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 194 | ) 195 | config_name: Optional[str] = field( 196 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 197 | ) 198 | tokenizer_name: Optional[str] = field( 199 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 200 | ) 201 | cache_dir: Optional[str] = field( 202 | default=None, 203 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 204 | ) 205 | use_fast_tokenizer: bool = field( 206 | default=True, 207 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 208 | ) 209 | model_revision: str = field( 210 | default="main", 211 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 212 | ) 213 | use_auth_token: bool = field( 214 | default=False, 215 | metadata={ 216 | "help": ( 217 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 218 | "with private models)." 219 | ) 220 | }, 221 | ) 222 | ignore_mismatched_sizes: bool = field( 223 | default=False, 224 | metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, 225 | ) 226 | 227 | @dataclass 228 | class SparseArguments: 229 | sparse_lambda: Optional[float] = field( 230 | default=1e-3, metadata={"help": "loss penalty term for gate param"} 231 | ) 232 | sparse_lambda_2: Optional[float] = field( 233 | default=1e-3, metadata={"help": "clipping scale for gate param"} 234 | ) 235 | sparse_lr: Optional[float] = field( 236 | default=None, metadata={"help": "lr for gate parameter in sparse lora, default to same as learning rate for other parameters"} 237 | ) 238 | lora_r: Optional[int] = field( 239 | default=16, metadata={"help": "matrix rank in lora"} 240 | ) 241 | lambda_schedule: Optional[str] = field( 242 | default=None, metadata={"help": "scheduling of lambda_2, {linear, log_linear}"} 243 | ) 244 | max_lambda: Optional[float] = field( 245 | default=10, metadata={"help": "maximum value of lambda_2 in scheduling"} 246 | ) 247 | lambda_num: Optional[int] = field( 248 | default=10, metadata={"help": "total number of lambdas in scheduling"} 249 | ) 250 | 251 | 252 | @dataclass 253 | class SparseTrainingArguments(TrainingArguments): 254 | train_sparse: Optional[bool] = field( 255 | default=False, metadata={"help": "whether use sparse lora"} 256 | ) 257 | debug_mode: Optional[bool] = field( 258 | default=False, metadata={"help": "debug mode"} 259 | ) 260 | 261 | 262 | def main(): 263 | # See all possible arguments in src/transformers/training_args.py 264 | # or by passing the --help flag to this script. 265 | # We now keep distinct sets of args, for a cleaner separation of concerns. 266 | 267 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, SparseTrainingArguments, SparseArguments)) 268 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 269 | # If we pass only one argument to the script and it's the path to a json file, 270 | # let's parse it to get our arguments. 271 | model_args, data_args, training_args, sparse_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 272 | else: 273 | model_args, data_args, training_args, sparse_args = parser.parse_args_into_dataclasses() 274 | 275 | # Sending telemetry. 
Tracking the example usage helps us better allocate resources to maintain them. The 276 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 277 | # send_example_telemetry("run_glue", model_args, data_args) 278 | 279 | task_name_for_get = data_args.task_name 280 | if "mnli" in data_args.task_name: 281 | data_args.task_name = "mnli" 282 | 283 | training_args.metric_for_best_model = task_to_best_metric[data_args.task_name] 284 | 285 | if os.getenv("LOCAL_RANK"): 286 | training_args.local_rank = int(os.environ["LOCAL_RANK"]) 287 | else: 288 | training_args.local_rank = -1 289 | 290 | if training_args.train_sparse: 291 | if sparse_args.sparse_lr is None: 292 | sparse_args.sparse_lr = training_args.learning_rate 293 | 294 | if training_args.debug_mode: 295 | training_args.output_dir += "-debug" 296 | print(f"save model to {training_args.output_dir}") 297 | 298 | # Setup logging 299 | logging.basicConfig( 300 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 301 | datefmt="%m/%d/%Y %H:%M:%S", 302 | handlers=[logging.StreamHandler(sys.stdout)], 303 | ) 304 | 305 | log_level = training_args.get_process_log_level() 306 | logger.setLevel(log_level) 307 | datasets.utils.logging.set_verbosity(log_level) 308 | transformers.utils.logging.set_verbosity(log_level) 309 | transformers.utils.logging.enable_default_handler() 310 | transformers.utils.logging.enable_explicit_format() 311 | 312 | # Log on each process the small summary: 313 | logger.warning( 314 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 315 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 316 | ) 317 | logger.info(f"Training/evaluation parameters {training_args}") 318 | 319 | # Detecting last checkpoint. 320 | last_checkpoint = None 321 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 322 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 323 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 324 | raise ValueError( 325 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 326 | "Use --overwrite_output_dir to overcome." 327 | ) 328 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 329 | logger.info( 330 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 331 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 332 | ) 333 | 334 | # Set seed before initializing model. 335 | set_seed(training_args.seed) 336 | 337 | # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) 338 | # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). 339 | # 340 | # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the 341 | # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named 342 | # label if at least two columns are provided. 343 | # 344 | # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this 345 | # single column. 
You can easily tweak this behavior (see below) 346 | # 347 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 348 | # download the dataset. 349 | if data_args.task_name is not None: 350 | # Downloading and loading a dataset from the hub. 351 | from datasets import load_from_disk 352 | from src.glue_tasks import AutoTask 353 | raw_datasets = load_from_disk(data_path + data_args.task_name) 354 | 355 | task = AutoTask().get(task_name_for_get, None, None) 356 | raw_datasets = { 357 | "train": task.get("train", split_validation_test=True), 358 | "validation": task.get("validation", split_validation_test=True), 359 | "test": task.get("test", split_validation_test=True) 360 | } 361 | from datasets import DatasetDict 362 | raw_datasets = DatasetDict(raw_datasets) 363 | 364 | 365 | 366 | elif data_args.dataset_name is not None: 367 | raise NotImplementedError 368 | 369 | else: 370 | # Loading a dataset from your local files. 371 | # CSV/JSON training and evaluation files are needed. 372 | data_files = {"train": data_args.train_file, "validation": data_args.validation_file} 373 | 374 | # Get the test dataset: you can provide your own CSV/JSON test file (see below) 375 | # when you use `do_predict` without specifying a GLUE benchmark task. 376 | if training_args.do_predict: 377 | if data_args.test_file is not None: 378 | train_extension = data_args.train_file.split(".")[-1] 379 | test_extension = data_args.test_file.split(".")[-1] 380 | assert ( 381 | test_extension == train_extension 382 | ), "`test_file` should have the same extension (csv or json) as `train_file`." 383 | data_files["test"] = data_args.test_file 384 | else: 385 | raise ValueError("Need either a GLUE task or a test file for `do_predict`.") 386 | 387 | for key in data_files.keys(): 388 | logger.info(f"load a local file for {key}: {data_files[key]}") 389 | 390 | if data_args.train_file.endswith(".csv"): 391 | # Loading a dataset from local csv files 392 | raw_datasets = load_dataset( 393 | "csv", 394 | data_files=data_files, 395 | cache_dir=model_args.cache_dir, 396 | use_auth_token=True if model_args.use_auth_token else None, 397 | ) 398 | else: 399 | # Loading a dataset from local json files 400 | raw_datasets = load_dataset( 401 | "json", 402 | data_files=data_files, 403 | cache_dir=model_args.cache_dir, 404 | use_auth_token=True if model_args.use_auth_token else None, 405 | ) 406 | # See more about loading any type of standard or custom dataset at 407 | # https://huggingface.co/docs/datasets/loading_datasets.html. 408 | 409 | # Labels 410 | if data_args.task_name is not None: 411 | is_regression = data_args.task_name == "stsb" 412 | if not is_regression: 413 | label_list = raw_datasets["train"].features["label"].names 414 | num_labels = len(label_list) 415 | else: 416 | num_labels = 1 417 | else: 418 | # Trying to have good defaults here, don't hesitate to tweak to your needs. 
419 | is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] 420 | if is_regression: 421 | num_labels = 1 422 | else: 423 | # A useful fast method: 424 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique 425 | label_list = raw_datasets["train"].unique("label") 426 | label_list.sort() # Let's sort it for determinism 427 | num_labels = len(label_list) 428 | 429 | # Load pretrained model and tokenizer 430 | # 431 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 432 | # download model & vocab. 433 | config = AutoConfig.from_pretrained( 434 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 435 | num_labels=num_labels, 436 | finetuning_task=data_args.task_name, 437 | cache_dir=model_args.cache_dir, 438 | revision=model_args.model_revision, 439 | use_auth_token=True if model_args.use_auth_token else None, 440 | ) 441 | tokenizer = AutoTokenizer.from_pretrained( 442 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 443 | cache_dir=model_args.cache_dir, 444 | use_fast=model_args.use_fast_tokenizer, 445 | revision=model_args.model_revision, 446 | use_auth_token=True if model_args.use_auth_token else None, 447 | ) 448 | model = AutoModelForSequenceClassification.from_pretrained( 449 | model_args.model_name_or_path, 450 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 451 | config=config, 452 | cache_dir=model_args.cache_dir, 453 | revision=model_args.model_revision, 454 | use_auth_token=True if model_args.use_auth_token else None, 455 | ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, 456 | ) 457 | 458 | # Preprocessing the raw_datasets 459 | if data_args.task_name is not None: 460 | sentence1_key, sentence2_key = task_to_keys[data_args.task_name] 461 | else: 462 | # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. 463 | non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] 464 | if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: 465 | sentence1_key, sentence2_key = "sentence1", "sentence2" 466 | else: 467 | if len(non_label_column_names) >= 2: 468 | sentence1_key, sentence2_key = non_label_column_names[:2] 469 | else: 470 | sentence1_key, sentence2_key = non_label_column_names[0], None 471 | 472 | # Padding strategy 473 | if data_args.pad_to_max_length: 474 | padding = "max_length" 475 | else: 476 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch 477 | padding = False 478 | 479 | # Some models have set the order of the labels to use, so let's make sure we do use it. 480 | label_to_id = None 481 | if ( 482 | model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id 483 | and data_args.task_name is not None 484 | and not is_regression 485 | ): 486 | # Some have all caps in their config, some don't. 487 | label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} 488 | if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): 489 | label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} 490 | else: 491 | logger.warning( 492 | "Your model seems to have been trained with labels, but they don't match the dataset: ", 493 | f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
494 | "\nIgnoring the model labels as a result.", 495 | ) 496 | elif data_args.task_name is None and not is_regression: 497 | label_to_id = {v: i for i, v in enumerate(label_list)} 498 | 499 | if label_to_id is not None: 500 | model.config.label2id = label_to_id 501 | model.config.id2label = {id: label for label, id in config.label2id.items()} 502 | elif data_args.task_name is not None and not is_regression: 503 | model.config.label2id = {l: i for i, l in enumerate(label_list)} 504 | model.config.id2label = {id: label for label, id in config.label2id.items()} 505 | 506 | 507 | if data_args.max_seq_length > tokenizer.model_max_length: 508 | logger.warning( 509 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 510 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 511 | ) 512 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 513 | 514 | def preprocess_function(examples): 515 | # Tokenize the texts 516 | args = ( 517 | (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) 518 | ) 519 | result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) 520 | 521 | # Map labels to IDs (not necessary for GLUE tasks) 522 | if label_to_id is not None and "label" in examples: 523 | result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] 524 | return result 525 | 526 | with training_args.main_process_first(desc="dataset map pre-processing"): 527 | raw_datasets = raw_datasets.map( 528 | preprocess_function, 529 | batched=True, 530 | load_from_cache_file=not data_args.overwrite_cache, 531 | desc="Running tokenizer on dataset", 532 | ) 533 | if training_args.do_train: 534 | if "train" not in raw_datasets: 535 | raise ValueError("--do_train requires a train dataset") 536 | train_dataset = raw_datasets["train"] 537 | if data_args.max_train_samples is not None: 538 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 539 | train_dataset = train_dataset.select(range(max_train_samples)) 540 | 541 | if training_args.do_eval: 542 | if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: 543 | raise ValueError("--do_eval requires a validation dataset") 544 | # eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] 545 | eval_dataset = raw_datasets["validation"] 546 | if data_args.max_eval_samples is not None: 547 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 548 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 549 | 550 | if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: 551 | if "test" not in raw_datasets and "test_matched" not in raw_datasets: 552 | raise ValueError("--do_predict requires a test dataset") 553 | # predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"] 554 | predict_dataset = raw_datasets["test"] 555 | if data_args.max_predict_samples is not None: 556 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 557 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 558 | 559 | # Log a few random samples from the training set: 560 | if training_args.do_train: 561 | for index in random.sample(range(len(train_dataset)), 3): 562 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 563 | 564 | # Get 
the metric function 565 | from datasets import load_metric 566 | if data_args.task_name is not None: 567 | metric = load_metric("./glue.py", data_args.task_name) 568 | else: 569 | metric = load_metric("accuracy") 570 | 571 | # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a 572 | # predictions and label_ids field) and has to return a dictionary string to float. 573 | def compute_metrics(mode, p: EvalPrediction): 574 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 575 | preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) 576 | if data_args.task_name is not None: 577 | result = metric.compute(predictions=preds, references=p.label_ids) 578 | if len(result) > 1: 579 | result["combined_score"] = np.mean(list(result.values())).item() 580 | return result 581 | elif is_regression: 582 | return {"mse": ((preds - p.label_ids) ** 2).mean().item()} 583 | else: 584 | return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} 585 | 586 | # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if 587 | # we already did the padding. 588 | if data_args.pad_to_max_length: 589 | data_collator = default_data_collator 590 | elif training_args.fp16: 591 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 592 | else: 593 | data_collator = None 594 | 595 | 596 | # Initialize our Trainer 597 | optimizer, lr_scheduler = create_optimizer_and_scheduler(training_args, model, num_training_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size))) 598 | sparse_optimizer = None 599 | sparse_scheduler = None 600 | if training_args.train_sparse: 601 | print("building sparse optimizer and scheduler") 602 | from src.trainer import GATE_PARAM_NAME 603 | valid_param_name = [] 604 | for n, p in model.named_parameters(): 605 | print(n) 606 | if GATE_PARAM_NAME in n: 607 | valid_param_name.append(n) 608 | print("valid param name:", valid_param_name) 609 | sparse_optimizer = SparseAdamW(sparse_lambda=sparse_args.sparse_lambda_2, lambda_schedule=sparse_args.lambda_schedule, max_lambda=sparse_args.max_lambda, lambda_num=sparse_args.lambda_num, params=[p for n, p in model.named_parameters() if GATE_PARAM_NAME in n and p.requires_grad], lr=sparse_args.sparse_lr) 610 | sparse_scheduler = get_linear_schedule_with_warmup(sparse_optimizer, 611 | num_warmup_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size)*training_args.warmup_ratio), 612 | num_training_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size))) 613 | 614 | if training_args.debug_mode: 615 | train_dataset = eval_dataset 616 | 617 | # Initialize our Trainer 618 | trainer = SparseTrainer( 619 | model=model, 620 | args=training_args, 621 | train_dataset=train_dataset if training_args.do_train else None, 622 | eval_dataset=eval_dataset if training_args.do_eval else None, 623 | compute_metrics=compute_metrics, 624 | tokenizer=tokenizer, 625 | data_collator=data_collator, 626 | optimizers = (optimizer, lr_scheduler), 627 | sparse_lambda = sparse_args.sparse_lambda, 628 | sparse_optimizer = (sparse_optimizer, sparse_scheduler) 629 | ) 630 | 631 | # Training 632 | if training_args.do_train: 633 | checkpoint = None 634 | if training_args.resume_from_checkpoint is not None: 635 | checkpoint = training_args.resume_from_checkpoint 636 | elif 
last_checkpoint is not None: 637 | checkpoint = last_checkpoint 638 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 639 | metrics = train_result.metrics 640 | max_train_samples = ( 641 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 642 | ) 643 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 644 | 645 | trainer.save_model() # Saves the tokenizer too for easy upload 646 | 647 | trainer.log_metrics("train", metrics) 648 | trainer.save_metrics("train", metrics) 649 | trainer.save_state() 650 | 651 | sparse_param, total_param = compute_trainable_sparse_param(model) 652 | 653 | 654 | # eval on 1000 samples train set 655 | train_dataset_for_eval = train_dataset.shuffle(seed=42).select(range(1000)) 656 | logger.info("*** Evaluate on training subset ***") 657 | metrics = trainer.evaluate(eval_dataset=train_dataset_for_eval, metric_key_prefix = "eval_train") 658 | trainer.log_metrics("eval_train", metrics) 659 | trainer.save_metrics("eval_train", metrics) 660 | BEST_TRAIN_METRIC = metrics["eval_train_" + "_".join(task_to_best_metric[data_args.task_name].split("_")[1:])] 661 | 662 | 663 | # Evaluation 664 | 665 | if training_args.do_eval: 666 | logger.info("*** Evaluate ***") 667 | 668 | # Loop to handle MNLI double evaluation (matched, mis-matched) 669 | tasks = [data_args.task_name] 670 | eval_datasets = [eval_dataset] 671 | 672 | 673 | for eval_dataset, task in zip(eval_datasets, tasks): 674 | metrics = trainer.evaluate(eval_dataset=eval_dataset) 675 | 676 | max_eval_samples = ( 677 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 678 | ) 679 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 680 | 681 | 682 | 683 | trainer.log_metrics("eval", metrics) 684 | trainer.save_metrics("eval", metrics) 685 | 686 | BEST_EVAL_METRIC = metrics[task_to_best_metric[data_args.task_name]] 687 | 688 | if training_args.do_predict: 689 | logger.info("*** Predict ***") 690 | 691 | # Loop to handle MNLI double evaluation (matched, mis-matched) 692 | tasks = [data_args.task_name] 693 | predict_datasets = [predict_dataset] 694 | 695 | 696 | for predict_dataset, task in zip(predict_datasets, tasks): 697 | metrics = trainer.evaluate(eval_dataset=predict_dataset) 698 | 699 | max_eval_samples = ( 700 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 701 | ) 702 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 703 | 704 | 705 | 706 | trainer.log_metrics("test", metrics) 707 | 708 | trainer.save_metrics("test", metrics) 709 | 710 | logger.info("***** Final Model ******\nNumber of trainable full param: %d\nNumber of trainable sparse param: %d, Ratio: %.4f%%\n**********" % (total_param, sparse_param, sparse_param / total_param * 100)) 711 | 712 | 713 | def compute_metrics_in_schedule(mode, p: EvalPrediction): 714 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 715 | preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) 716 | if data_args.task_name is not None: 717 | result = metric.compute(predictions=preds, references=p.label_ids) 718 | if len(result) > 1: 719 | result["combined_score"] = np.mean(list(result.values())).item() 720 | if mode == "eval": 721 | result["generalization"] = result["_".join(task_to_best_metric[data_args.task_name].split("_")[1:])] / BEST_EVAL_METRIC * 100 722 | elif mode == "eval_train": 723 | result["memorization"] = 
result["_".join(task_to_best_metric[data_args.task_name].split("_")[1:])] / BEST_TRAIN_METRIC * 100 724 | elif mode == "test": 725 | pass 726 | else: 727 | raise NotImplementedError 728 | return result 729 | elif is_regression: 730 | raise NotImplementedError 731 | else: 732 | raise NotImplementedError 733 | 734 | 735 | # schedule 736 | if sparse_args.lambda_schedule is not None: 737 | logger.info("*****Start lambda_2 scheduling***") 738 | from transformers import EarlyStoppingCallback 739 | for _ in range(sparse_args.lambda_num - 1): 740 | training_args.num_train_epochs = 15 741 | training_args.load_best_model_at_end = False 742 | sparse_optimizer.step_lambda() 743 | trainer = SparseTrainer( 744 | model=model, 745 | args=training_args, 746 | train_dataset=train_dataset if training_args.do_train else None, 747 | eval_dataset=[eval_dataset if training_args.do_eval else None, train_dataset_for_eval], 748 | compute_metrics=compute_metrics_in_schedule, 749 | tokenizer=tokenizer, 750 | data_collator=data_collator, 751 | optimizers = (optimizer, lr_scheduler), 752 | sparse_lambda = sparse_args.sparse_lambda, 753 | sparse_optimizer = (sparse_optimizer, sparse_scheduler), 754 | ) 755 | 756 | trainer.train() 757 | 758 | if training_args.do_predict: 759 | logger.info("*** Predict ***") 760 | 761 | # Loop to handle MNLI double evaluation (matched, mis-matched) 762 | tasks = [data_args.task_name] 763 | predict_datasets = [predict_dataset] 764 | 765 | 766 | for predict_dataset, task in zip(predict_datasets, tasks): 767 | metrics = trainer.evaluate(eval_dataset=predict_dataset, metric_key_prefix="test") 768 | 769 | max_eval_samples = ( 770 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 771 | ) 772 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 773 | 774 | 775 | 776 | trainer.log_metrics("test", metrics) 777 | 778 | trainer.save_metrics("test", metrics) 779 | 780 | 781 | 782 | 783 | 784 | sparse_param, total_param = compute_trainable_sparse_param(model) 785 | 786 | logger.info("***** Lambda=%f Final Model ******\nLora rank: %d\nNumber of trainable full param: %d\nNumber of trainable sparse param: %d, Ratio: %.4f%%\n**********" % (sparse_optimizer.sparse_lambda, lora_config.lora_r, total_param, sparse_param, sparse_param / total_param * 100)) 787 | 788 | 789 | 790 | def _mp_fn(index): 791 | # For xla_spawn (TPUs) 792 | main() 793 | 794 | 795 | if __name__ == "__main__": 796 | main() -------------------------------------------------------------------------------- /run_glue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Finetuning the library models for sequence classification on GLUE.""" 17 | # You can also adapt this script on your own text classification task. 
Pointers for this are left as comments. 18 | 19 | import logging 20 | import os 21 | import wandb 22 | os.environ['WANDB_MODE'] = 'offline' 23 | import random 24 | import sys 25 | from dataclasses import dataclass, field 26 | from typing import Optional 27 | 28 | import datasets 29 | import numpy as np 30 | from datasets import load_dataset 31 | 32 | import transformers 33 | from transformers import ( 34 | AutoConfig, 35 | AutoModelForSequenceClassification, 36 | AutoTokenizer, 37 | DataCollatorWithPadding, 38 | EvalPrediction, 39 | HfArgumentParser, 40 | PretrainedConfig, 41 | Trainer, 42 | TrainingArguments, 43 | default_data_collator, 44 | set_seed, 45 | ) 46 | from transformers.trainer_utils import get_last_checkpoint 47 | from transformers.utils import check_min_version 48 | from transformers.utils.versions import require_version 49 | sys.path.append('../') 50 | from src.trainer import SparseTrainer 51 | from src.util import compute_trainable_sparse_param, create_optimizer_and_scheduler 52 | from src.sparse_optimizer import SparseAdamW 53 | from transformers import get_linear_schedule_with_warmup 54 | 55 | 56 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 57 | # check_min_version("4.24.0") 58 | 59 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") 60 | 61 | task_to_keys = { 62 | "cola": ("sentence", None), 63 | "mnli": ("premise", "hypothesis"), 64 | "mnli-m": ("premise", "hypothesis"), 65 | "mnli-mm": ("premise", "hypothesis"), 66 | "mrpc": ("sentence1", "sentence2"), 67 | "qnli": ("question", "sentence"), 68 | "qqp": ("question1", "question2"), 69 | "rte": ("sentence1", "sentence2"), 70 | "sst2": ("sentence", None), 71 | "stsb": ("sentence1", "sentence2"), 72 | "wnli": ("sentence1", "sentence2"), 73 | } 74 | 75 | task_to_best_metric = { 76 | "rte": "eval_accuracy", 77 | "mrpc": "eval_f1", 78 | "cola": "eval_matthews_correlation", 79 | "stsb": "eval_pearson", 80 | "sst2": "eval_accuracy", 81 | "qnli": "eval_accuracy", 82 | "mnli": "eval_accuracy", 83 | "mnli-m": "eval_accuracy", 84 | "mnli-mm": "eval_accuracy", 85 | "qqp": "eval_accuracy", 86 | } 87 | 88 | data_path = '/root/xtlv/data/sora_datasets/glue_datasets_from_dn/' 89 | 90 | logger = logging.getLogger(__name__) 91 | 92 | 93 | @dataclass 94 | class DataTrainingArguments: 95 | """ 96 | Arguments pertaining to what data we are going to input our model for training and eval. 97 | Using `HfArgumentParser` we can turn this class 98 | into argparse arguments to be able to specify them on 99 | the command line. 100 | """ 101 | 102 | task_name: Optional[str] = field( 103 | default=None, 104 | metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, 105 | ) 106 | dataset_name: Optional[str] = field( 107 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 108 | ) 109 | dataset_config_name: Optional[str] = field( 110 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 111 | ) 112 | max_seq_length: int = field( 113 | default=128, 114 | metadata={ 115 | "help": ( 116 | "The maximum total input sequence length after tokenization. Sequences longer " 117 | "than this will be truncated, sequences shorter will be padded." 
118 | ) 119 | }, 120 | ) 121 | overwrite_cache: bool = field( 122 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} 123 | ) 124 | pad_to_max_length: bool = field( 125 | default=True, 126 | metadata={ 127 | "help": ( 128 | "Whether to pad all samples to `max_seq_length`. " 129 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 130 | ) 131 | }, 132 | ) 133 | max_train_samples: Optional[int] = field( 134 | default=None, 135 | metadata={ 136 | "help": ( 137 | "For debugging purposes or quicker training, truncate the number of training examples to this " 138 | "value if set." 139 | ) 140 | }, 141 | ) 142 | max_eval_samples: Optional[int] = field( 143 | default=None, 144 | metadata={ 145 | "help": ( 146 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 147 | "value if set." 148 | ) 149 | }, 150 | ) 151 | max_predict_samples: Optional[int] = field( 152 | default=None, 153 | metadata={ 154 | "help": ( 155 | "For debugging purposes or quicker training, truncate the number of prediction examples to this " 156 | "value if set." 157 | ) 158 | }, 159 | ) 160 | train_file: Optional[str] = field( 161 | default=None, metadata={"help": "A csv or a json file containing the training data."} 162 | ) 163 | validation_file: Optional[str] = field( 164 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 165 | ) 166 | test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) 167 | 168 | def __post_init__(self): 169 | if self.task_name is not None: 170 | self.task_name = self.task_name.lower() 171 | if self.task_name not in task_to_keys.keys(): 172 | raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) 173 | elif self.dataset_name is not None: 174 | pass 175 | elif self.train_file is None or self.validation_file is None: 176 | raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") 177 | else: 178 | train_extension = self.train_file.split(".")[-1] 179 | assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." 180 | validation_extension = self.validation_file.split(".")[-1] 181 | assert ( 182 | validation_extension == train_extension 183 | ), "`validation_file` should have the same extension (csv or json) as `train_file`." 184 | 185 | 186 | @dataclass 187 | class ModelArguments: 188 | """ 189 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
190 | """ 191 | 192 | model_name_or_path: str = field( 193 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 194 | ) 195 | config_name: Optional[str] = field( 196 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 197 | ) 198 | tokenizer_name: Optional[str] = field( 199 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 200 | ) 201 | cache_dir: Optional[str] = field( 202 | default=None, 203 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 204 | ) 205 | use_fast_tokenizer: bool = field( 206 | default=True, 207 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 208 | ) 209 | model_revision: str = field( 210 | default="main", 211 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 212 | ) 213 | use_auth_token: bool = field( 214 | default=False, 215 | metadata={ 216 | "help": ( 217 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 218 | "with private models)." 219 | ) 220 | }, 221 | ) 222 | ignore_mismatched_sizes: bool = field( 223 | default=False, 224 | metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, 225 | ) 226 | 227 | @dataclass 228 | class SparseArguments: 229 | sparse_lambda: Optional[float] = field( 230 | default=1e-3, metadata={"help": "loss penalty term for gate param"} 231 | ) 232 | sparse_lambda_2: Optional[float] = field( 233 | default=1e-3, metadata={"help": "clipping scale for gate param"} 234 | ) 235 | sparse_lr: Optional[float] = field( 236 | default=None, metadata={"help": "lr for gate parameter in sparse lora, default to same as learning rate for other parameters"} 237 | ) 238 | lora_r: Optional[int] = field( 239 | default=16, metadata={"help": "matrix rank in lora"} 240 | ) 241 | lambda_schedule: Optional[str] = field( 242 | default=None, metadata={"help": "scheduling of lambda_2, {linear, log_linear}"} 243 | ) 244 | max_lambda: Optional[float] = field( 245 | default=10, metadata={"help": "maximum value of lambda_2 in scheduling"} 246 | ) 247 | lambda_num: Optional[int] = field( 248 | default=10, metadata={"help": "total number of lambdas in scheduling"} 249 | ) 250 | 251 | @dataclass 252 | class SparseTrainingArguments(TrainingArguments): 253 | train_sparse: Optional[bool] = field( 254 | default=False, metadata={"help": "whether use sparse lora"} 255 | ) 256 | debug_mode: Optional[bool] = field( 257 | default=False, metadata={"help": "debug mode"} 258 | ) 259 | 260 | 261 | def main(): 262 | # See all possible arguments in src/transformers/training_args.py 263 | # or by passing the --help flag to this script. 264 | # We now keep distinct sets of args, for a cleaner separation of concerns. 265 | 266 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, SparseTrainingArguments, SparseArguments)) 267 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 268 | # If we pass only one argument to the script and it's the path to a json file, 269 | # let's parse it to get our arguments. 270 | model_args, data_args, training_args, sparse_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 271 | else: 272 | model_args, data_args, training_args, sparse_args = parser.parse_args_into_dataclasses() 273 | 274 | # Sending telemetry. 
Tracking the example usage helps us better allocate resources to maintain them. The 275 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 276 | # send_example_telemetry("run_glue", model_args, data_args) 277 | 278 | task_name_for_get = data_args.task_name 279 | if "mnli" in data_args.task_name: 280 | data_args.task_name = "mnli" 281 | 282 | training_args.metric_for_best_model = task_to_best_metric[data_args.task_name] 283 | 284 | if os.getenv("LOCAL_RANK"): 285 | training_args.local_rank = int(os.environ["LOCAL_RANK"]) 286 | else: 287 | training_args.local_rank = -1 288 | 289 | if training_args.train_sparse: 290 | if sparse_args.sparse_lr is None: 291 | sparse_args.sparse_lr = training_args.learning_rate 292 | 293 | if training_args.debug_mode: 294 | training_args.output_dir += "-debug" 295 | print(f"save model to {training_args.output_dir}") 296 | 297 | # Setup logging 298 | logging.basicConfig( 299 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 300 | datefmt="%m/%d/%Y %H:%M:%S", 301 | handlers=[logging.StreamHandler(sys.stdout)], 302 | ) 303 | 304 | log_level = training_args.get_process_log_level() 305 | logger.setLevel(log_level) 306 | datasets.utils.logging.set_verbosity(log_level) 307 | transformers.utils.logging.set_verbosity(log_level) 308 | transformers.utils.logging.enable_default_handler() 309 | transformers.utils.logging.enable_explicit_format() 310 | 311 | # Log on each process the small summary: 312 | logger.warning( 313 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 314 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 315 | ) 316 | logger.info(f"Training/evaluation parameters {training_args}") 317 | 318 | # Detecting last checkpoint. 319 | last_checkpoint = None 320 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 321 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 322 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 323 | raise ValueError( 324 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 325 | "Use --overwrite_output_dir to overcome." 326 | ) 327 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 328 | logger.info( 329 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 330 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 331 | ) 332 | 333 | # Set seed before initializing model. 334 | set_seed(training_args.seed) 335 | 336 | # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) 337 | # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). 338 | # 339 | # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the 340 | # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named 341 | # label if at least two columns are provided. 342 | # 343 | # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this 344 | # single column. 
You can easily tweak this behavior (see below) 345 | # 346 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 347 | # download the dataset. 348 | if data_args.task_name is not None: 349 | # Downloading and loading a dataset from the hub. 350 | from datasets import load_from_disk 351 | from src.glue_tasks import AutoTask 352 | raw_datasets = load_from_disk(data_path + data_args.task_name) 353 | 354 | task = AutoTask().get(task_name_for_get, None, None) 355 | raw_datasets = { 356 | "train": task.get("train", split_validation_test=True), 357 | "validation": task.get("validation", split_validation_test=True), 358 | "test": task.get("test", split_validation_test=True) 359 | } 360 | from datasets import DatasetDict 361 | raw_datasets = DatasetDict(raw_datasets) 362 | 363 | elif data_args.dataset_name is not None: 364 | raise NotImplementedError 365 | else: 366 | # Loading a dataset from your local files. 367 | # CSV/JSON training and evaluation files are needed. 368 | data_files = {"train": data_args.train_file, "validation": data_args.validation_file} 369 | 370 | # Get the test dataset: you can provide your own CSV/JSON test file (see below) 371 | # when you use `do_predict` without specifying a GLUE benchmark task. 372 | if training_args.do_predict: 373 | if data_args.test_file is not None: 374 | train_extension = data_args.train_file.split(".")[-1] 375 | test_extension = data_args.test_file.split(".")[-1] 376 | assert ( 377 | test_extension == train_extension 378 | ), "`test_file` should have the same extension (csv or json) as `train_file`." 379 | data_files["test"] = data_args.test_file 380 | else: 381 | raise ValueError("Need either a GLUE task or a test file for `do_predict`.") 382 | 383 | for key in data_files.keys(): 384 | logger.info(f"load a local file for {key}: {data_files[key]}") 385 | 386 | if data_args.train_file.endswith(".csv"): 387 | # Loading a dataset from local csv files 388 | raw_datasets = load_dataset( 389 | "csv", 390 | data_files=data_files, 391 | cache_dir=model_args.cache_dir, 392 | use_auth_token=True if model_args.use_auth_token else None, 393 | ) 394 | else: 395 | # Loading a dataset from local json files 396 | raw_datasets = load_dataset( 397 | "json", 398 | data_files=data_files, 399 | cache_dir=model_args.cache_dir, 400 | use_auth_token=True if model_args.use_auth_token else None, 401 | ) 402 | # See more about loading any type of standard or custom dataset at 403 | # https://huggingface.co/docs/datasets/loading_datasets.html. 404 | 405 | # Labels 406 | if data_args.task_name is not None: 407 | is_regression = data_args.task_name == "stsb" 408 | if not is_regression: 409 | label_list = raw_datasets["train"].features["label"].names 410 | num_labels = len(label_list) 411 | else: 412 | num_labels = 1 413 | else: 414 | # Trying to have good defaults here, don't hesitate to tweak to your needs. 
415 | is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] 416 | if is_regression: 417 | num_labels = 1 418 | else: 419 | # A useful fast method: 420 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique 421 | label_list = raw_datasets["train"].unique("label") 422 | label_list.sort() # Let's sort it for determinism 423 | num_labels = len(label_list) 424 | 425 | # Load pretrained model and tokenizer 426 | # 427 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 428 | # download model & vocab. 429 | config = AutoConfig.from_pretrained( 430 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 431 | num_labels=num_labels, 432 | finetuning_task=data_args.task_name, 433 | cache_dir=model_args.cache_dir, 434 | revision=model_args.model_revision, 435 | use_auth_token=True if model_args.use_auth_token else None, 436 | ) 437 | tokenizer = AutoTokenizer.from_pretrained( 438 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 439 | cache_dir=model_args.cache_dir, 440 | use_fast=model_args.use_fast_tokenizer, 441 | revision=model_args.model_revision, 442 | use_auth_token=True if model_args.use_auth_token else None, 443 | ) 444 | model = AutoModelForSequenceClassification.from_pretrained( 445 | model_args.model_name_or_path, 446 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 447 | config=config, 448 | cache_dir=model_args.cache_dir, 449 | revision=model_args.model_revision, 450 | use_auth_token=True if model_args.use_auth_token else None, 451 | ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, 452 | ) 453 | 454 | if training_args.train_sparse: 455 | print("loading from src.lora") 456 | from src.lora import LoraModel, LoraConfig 457 | else: 458 | from opendelta.delta_models import LoraModel, LoraConfig 459 | 460 | import json 461 | lora_config = json.load(open("config/lora_config.json")) 462 | lora_config["lora_r"] = sparse_args.lora_r 463 | lora_config = LoraConfig.from_dict(lora_config) 464 | delta_model = LoraModel.from_config(lora_config, backbone_model=model) 465 | delta_model.freeze_module(set_state_dict = True) 466 | delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=False) 467 | 468 | 469 | # Preprocessing the raw_datasets 470 | if data_args.task_name is not None: 471 | sentence1_key, sentence2_key = task_to_keys[data_args.task_name] 472 | else: 473 | # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. 474 | non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] 475 | if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: 476 | sentence1_key, sentence2_key = "sentence1", "sentence2" 477 | else: 478 | if len(non_label_column_names) >= 2: 479 | sentence1_key, sentence2_key = non_label_column_names[:2] 480 | else: 481 | sentence1_key, sentence2_key = non_label_column_names[0], None 482 | 483 | # Padding strategy 484 | if data_args.pad_to_max_length: 485 | padding = "max_length" 486 | else: 487 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch 488 | padding = False 489 | 490 | # Some models have set the order of the labels to use, so let's make sure we do use it. 
491 | label_to_id = None 492 | if ( 493 | model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id 494 | and data_args.task_name is not None 495 | and not is_regression 496 | ): 497 | # Some have all caps in their config, some don't. 498 | label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} 499 | if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): 500 | label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} 501 | else: 502 | logger.warning( 503 | "Your model seems to have been trained with labels, but they don't match the dataset: ", 504 | f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 505 | "\nIgnoring the model labels as a result.", 506 | ) 507 | elif data_args.task_name is None and not is_regression: 508 | label_to_id = {v: i for i, v in enumerate(label_list)} 509 | 510 | if label_to_id is not None: 511 | model.config.label2id = label_to_id 512 | model.config.id2label = {id: label for label, id in config.label2id.items()} 513 | elif data_args.task_name is not None and not is_regression: 514 | model.config.label2id = {l: i for i, l in enumerate(label_list)} 515 | model.config.id2label = {id: label for label, id in config.label2id.items()} 516 | 517 | 518 | if data_args.max_seq_length > tokenizer.model_max_length: 519 | logger.warning( 520 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 521 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 522 | ) 523 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 524 | 525 | def preprocess_function(examples): 526 | # Tokenize the texts 527 | args = ( 528 | (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) 529 | ) 530 | result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) 531 | 532 | # Map labels to IDs (not necessary for GLUE tasks) 533 | if label_to_id is not None and "label" in examples: 534 | result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] 535 | return result 536 | 537 | with training_args.main_process_first(desc="dataset map pre-processing"): 538 | raw_datasets = raw_datasets.map( 539 | preprocess_function, 540 | batched=True, 541 | load_from_cache_file=not data_args.overwrite_cache, 542 | desc="Running tokenizer on dataset", 543 | ) 544 | if training_args.do_train: 545 | if "train" not in raw_datasets: 546 | raise ValueError("--do_train requires a train dataset") 547 | train_dataset = raw_datasets["train"] 548 | if data_args.max_train_samples is not None: 549 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 550 | train_dataset = train_dataset.select(range(max_train_samples)) 551 | 552 | if training_args.do_eval: 553 | if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: 554 | raise ValueError("--do_eval requires a validation dataset") 555 | eval_dataset = raw_datasets["validation"] 556 | if data_args.max_eval_samples is not None: 557 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 558 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 559 | 560 | if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: 561 | if "test" not in raw_datasets and "test_matched" not in raw_datasets: 562 | raise ValueError("--do_predict requires a test 
dataset") 563 | predict_dataset = raw_datasets["test"] 564 | if data_args.max_predict_samples is not None: 565 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 566 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 567 | 568 | # Log a few random samples from the training set: 569 | if training_args.do_train: 570 | for index in random.sample(range(len(train_dataset)), 3): 571 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 572 | 573 | # Get the metric function 574 | from datasets import load_metric 575 | if data_args.task_name is not None: 576 | metric = load_metric("./glue.py", data_args.task_name) 577 | else: 578 | metric = load_metric("accuracy") 579 | 580 | # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a 581 | # predictions and label_ids field) and has to return a dictionary string to float. 582 | def compute_metrics(mode, p: EvalPrediction): 583 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 584 | preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) 585 | if data_args.task_name is not None: 586 | result = metric.compute(predictions=preds, references=p.label_ids) 587 | if len(result) > 1: 588 | result["combined_score"] = np.mean(list(result.values())).item() 589 | return result 590 | elif is_regression: 591 | return {"mse": ((preds - p.label_ids) ** 2).mean().item()} 592 | else: 593 | return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} 594 | 595 | # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if 596 | # we already did the padding. 597 | if data_args.pad_to_max_length: 598 | data_collator = default_data_collator 599 | elif training_args.fp16: 600 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 601 | else: 602 | data_collator = None 603 | 604 | 605 | # Initialize our Trainer 606 | optimizer, lr_scheduler = create_optimizer_and_scheduler(training_args, model, num_training_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size))) 607 | sparse_optimizer = None 608 | sparse_scheduler = None 609 | if training_args.train_sparse: 610 | print("building sparse optimizer and scheduler") 611 | from src.trainer import GATE_PARAM_NAME 612 | valid_param_name = [] 613 | for n, p in model.named_parameters(): 614 | print(n) 615 | if GATE_PARAM_NAME in n: 616 | valid_param_name.append(n) 617 | print("valid param name:", valid_param_name) 618 | sparse_optimizer = SparseAdamW(sparse_lambda=sparse_args.sparse_lambda_2, lambda_schedule=sparse_args.lambda_schedule, max_lambda=sparse_args.max_lambda, lambda_num=sparse_args.lambda_num, params=[p for n, p in model.named_parameters() if GATE_PARAM_NAME in n and p.requires_grad], lr=sparse_args.sparse_lr) 619 | sparse_scheduler = get_linear_schedule_with_warmup(sparse_optimizer, 620 | num_warmup_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size)*training_args.warmup_ratio), 621 | num_training_steps=int(training_args.num_train_epochs*(len(train_dataset) / training_args.train_batch_size))) 622 | 623 | if training_args.debug_mode: 624 | train_dataset = eval_dataset 625 | 626 | # Initialize our Trainer 627 | trainer = SparseTrainer( 628 | model=model, 629 | args=training_args, 630 | train_dataset=train_dataset if training_args.do_train else None, 631 | 
eval_dataset=eval_dataset if training_args.do_eval else None, 632 | compute_metrics=compute_metrics, 633 | tokenizer=tokenizer, 634 | data_collator=data_collator, 635 | optimizers = (optimizer, lr_scheduler), 636 | sparse_lambda = sparse_args.sparse_lambda, 637 | sparse_optimizer = (sparse_optimizer, sparse_scheduler) 638 | ) 639 | 640 | # Training 641 | if training_args.do_train: 642 | checkpoint = None 643 | if training_args.resume_from_checkpoint is not None: 644 | checkpoint = training_args.resume_from_checkpoint 645 | elif last_checkpoint is not None: 646 | checkpoint = last_checkpoint 647 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 648 | metrics = train_result.metrics 649 | max_train_samples = ( 650 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 651 | ) 652 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 653 | 654 | trainer.save_model() # Saves the tokenizer too for easy upload 655 | 656 | trainer.log_metrics("train", metrics) 657 | trainer.save_metrics("train", metrics) 658 | trainer.save_state() 659 | 660 | sparse_param, total_param = compute_trainable_sparse_param(model) 661 | 662 | 663 | # eval on 1000 samples train set 664 | train_dataset_for_eval = train_dataset.shuffle(seed=42).select(range(1000)) 665 | logger.info("*** Evaluate on training subset ***") 666 | metrics = trainer.evaluate(eval_dataset=train_dataset_for_eval, metric_key_prefix = "eval_train") 667 | trainer.log_metrics("eval_train", metrics) 668 | trainer.save_metrics("eval_train", metrics) 669 | BEST_TRAIN_METRIC = metrics["eval_train_" + "_".join(task_to_best_metric[data_args.task_name].split("_")[1:])] 670 | 671 | 672 | # Evaluation 673 | if training_args.do_eval: 674 | logger.info("*** Evaluate ***") 675 | 676 | # Loop to handle MNLI double evaluation (matched, mis-matched) 677 | tasks = [data_args.task_name] 678 | eval_datasets = [eval_dataset] 679 | 680 | for eval_dataset, task in zip(eval_datasets, tasks): 681 | metrics = trainer.evaluate(eval_dataset=eval_dataset) 682 | 683 | max_eval_samples = ( 684 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 685 | ) 686 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 687 | 688 | trainer.log_metrics("eval", metrics) 689 | trainer.save_metrics("eval", metrics) 690 | 691 | BEST_EVAL_METRIC = metrics[task_to_best_metric[data_args.task_name]] 692 | 693 | if training_args.do_predict: 694 | logger.info("*** Predict ***") 695 | 696 | # Loop to handle MNLI double evaluation (matched, mis-matched) 697 | tasks = [data_args.task_name] 698 | predict_datasets = [predict_dataset] 699 | 700 | for predict_dataset, task in zip(predict_datasets, tasks): 701 | metrics = trainer.evaluate(eval_dataset=predict_dataset) 702 | 703 | max_eval_samples = ( 704 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 705 | ) 706 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 707 | 708 | trainer.log_metrics("test", metrics) 709 | 710 | trainer.save_metrics("test", metrics) 711 | 712 | logger.info("***** Final Model ******\nLora rank: %d\nNumber of trainable full param: %d\nNumber of trainable sparse param: %d, Ratio: %.4f%%\n**********" % (lora_config.lora_r, total_param, sparse_param, sparse_param / total_param * 100)) 713 | 714 | 715 | def compute_metrics_in_schedule(mode, p: EvalPrediction): 716 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 
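# Descriptive note (not in the upstream script): this metrics function is used during the lambda_2
# schedule below, where scores are normalized against the values recorded after the initial training
# run: "memorization" = train-subset score / BEST_TRAIN_METRIC * 100 and
# "generalization" = dev score / BEST_EVAL_METRIC * 100.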
717 | preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) 718 | if data_args.task_name is not None: 719 | result = metric.compute(predictions=preds, references=p.label_ids) 720 | if len(result) > 1: 721 | result["combined_score"] = np.mean(list(result.values())).item() 722 | if mode == "eval": 723 | result["generalization"] = result["_".join(task_to_best_metric[data_args.task_name].split("_")[1:])] / BEST_EVAL_METRIC * 100 724 | elif mode == "eval_train": 725 | result["memorization"] = result["_".join(task_to_best_metric[data_args.task_name].split("_")[1:])] / BEST_TRAIN_METRIC * 100 726 | elif mode == "test": 727 | pass 728 | else: 729 | raise NotImplementedError 730 | return result 731 | elif is_regression: 732 | raise NotImplementedError 733 | 734 | else: 735 | raise NotImplementedError 736 | 737 | 738 | # schedule 739 | if sparse_args.lambda_schedule is not None: 740 | logger.info("*****Start lambda_2 scheduling***") 741 | from transformers import EarlyStoppingCallback 742 | for _ in range(sparse_args.lambda_num - 1): 743 | training_args.num_train_epochs = 15 744 | training_args.load_best_model_at_end = False 745 | sparse_optimizer.step_lambda() 746 | trainer = SparseTrainer( 747 | model=model, 748 | args=training_args, 749 | train_dataset=train_dataset if training_args.do_train else None, 750 | eval_dataset=[eval_dataset if training_args.do_eval else None, train_dataset_for_eval], 751 | compute_metrics=compute_metrics_in_schedule, 752 | tokenizer=tokenizer, 753 | data_collator=data_collator, 754 | optimizers = (optimizer, lr_scheduler), 755 | sparse_lambda = sparse_args.sparse_lambda, 756 | sparse_optimizer = (sparse_optimizer, sparse_scheduler), 757 | ) 758 | 759 | trainer.train() 760 | 761 | if training_args.do_predict: 762 | logger.info("*** Predict ***") 763 | 764 | # Loop to handle MNLI double evaluation (matched, mis-matched) 765 | tasks = [data_args.task_name] 766 | predict_datasets = [predict_dataset] 767 | 768 | 769 | for predict_dataset, task in zip(predict_datasets, tasks): 770 | metrics = trainer.evaluate(eval_dataset=predict_dataset, metric_key_prefix="test") 771 | 772 | max_eval_samples = ( 773 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 774 | ) 775 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 776 | 777 | 778 | trainer.log_metrics("test", metrics) 779 | 780 | trainer.save_metrics("test", metrics) 781 | 782 | 783 | sparse_param, total_param = compute_trainable_sparse_param(model) 784 | 785 | logger.info("***** Lambda=%f Final Model ******\nLora rank: %d\nNumber of trainable full param: %d\nNumber of trainable sparse param: %d, Ratio: %.4f%%\n**********" % (sparse_optimizer.sparse_lambda, lora_config.lora_r, total_param, sparse_param, sparse_param / total_param * 100)) 786 | 787 | 788 | 789 | def _mp_fn(index): 790 | # For xla_spawn (TPUs) 791 | main() 792 | 793 | 794 | if __name__ == "__main__": 795 | main() --------------------------------------------------------------------------------