├── scripts ├── ft │ ├── llama_2_7b_alpaca_gpt4.sh │ ├── bert_base_squad.sh │ ├── t5_base_lm_adapt_sst2.sh │ ├── t5_base_xsum.sh │ ├── t5_base_lm_adapt_cnndm.sh │ ├── bert_base_mnli.sh │ ├── roberta_base_squad.sh │ ├── roberta_base_sst2.sh │ ├── bert_base_sst2.sh │ ├── roberta_base_mnli.sh │ ├── t5_xl_lm_adapt_sst2.sh │ └── roberta_base_squadv2.sh ├── adaptpruning │ └── roberta_base_squad_momentum.sh ├── eval │ ├── query_alpaca_eval.sh │ ├── truthfulqa.sh │ ├── alpaca_eval.sh │ ├── mmlu.sh │ ├── wmt_enro.sh │ ├── cnndm.sh │ └── xsum.sh ├── tradeoff │ ├── mt5_base_lora_tradeoff.sh │ ├── t5_base_lm_adapt_lora_tradeoff.sh │ ├── roberta_base_ft_mask_tuning.sh │ ├── roberta_base_mask_tuning.sh │ ├── roberta_base_sst2_tuning.sh │ └── roberta_base_sst2.sh ├── eval.sh ├── eval_multiple_lora_roberta_mnli.sh ├── test_pruning_efficiency.sh ├── merge_lora.sh ├── sbatch_scripts │ ├── submit_job.sbatch │ ├── ft │ │ ├── bert_base_sst2.sbatch │ │ └── roberta_base_sst2.sbatch │ ├── lora │ │ ├── bert_base_sst2.sbatch │ │ ├── roberta_base_mnli.sbatch │ │ ├── roberta_base_squad.sbatch │ │ └── roberta_base_sst2.sbatch │ ├── elastictuning │ │ ├── roberta_base_mnli_selfmomentum.sh │ │ ├── t5_xl_lm_adapt_sst2_selfmomentum.sh │ │ ├── roberta_base_squadv2_selfmomentum.sh │ │ ├── t5_base_lm_adapt_sst2_selfmomentum.sh │ │ ├── bert_base_squad_selfmomentum_noffnstart.sh │ │ └── roberta_base_sst2_selfmomentum_noffnstart.sh │ └── submit_job_a100.sbatch ├── post_training_prune.sh ├── post_training_squad_prune.sh ├── main_results │ └── bert_glue_big_momentum.sh ├── post_training_cnndm_prune.sh ├── test_fisher_prune.sh ├── test_random_prune.sh ├── hyperparameter_searching │ ├── test_cutoff_prune_step.sh │ ├── test_throughout_prune.sh │ ├── test_once_rescaled.sh │ ├── test_once_prune_step.sh │ ├── test_distill.sh │ ├── test_distill_fisher.sh │ ├── test_distill_shorter.sh │ └── test_training_hypers.sh ├── prepare_data.sh ├── eval_lora_roberta_mnli.sh ├── post_training_wmt_prune.sh ├── ablation │ ├── 
roberta_base_sst2_distillation.sh │ └── roberta_base_mnli_distillation.sh ├── merge_llama_lora.sh ├── lora │ ├── bert_base_squad.sh │ ├── t5_base_lm_adapt_sst2.sh │ ├── t5_base_lm_adapt_mnli.sh │ ├── t5_xl_lm_adapt_sst2.sh │ ├── bert_base_mnli.sh │ ├── bert_base_sst2.sh │ ├── t5_xl_lm_adapt_cnndm.sh │ ├── roberta_base_cola.sh │ ├── roberta_base_mrpc.sh │ ├── roberta_base_rte.sh │ ├── roberta_base_stsb.sh │ ├── roberta_base_mnli.sh │ ├── roberta_base_squad.sh │ ├── roberta_base_sst2.sh │ ├── t5_base_xsum.sh │ ├── t5_base_lm_adapt_cnndm.sh │ ├── roberta_base_squadv2.sh │ ├── llama_13b_alpaca_cleaned.sh │ ├── llama_2_7b_alpaca_gpt4.sh │ ├── llama_7b_alpaca_cleaned.sh │ ├── llama_2_13b_alpaca_gpt4.sh │ ├── roberta_base_qnli.sh │ ├── roberta_base_qqp.sh │ ├── mt5_base_wmt_enro.sh │ └── mt5_base_wmt_roen.sh ├── efficiency_testing.sh ├── post_training_sft_prune.sh ├── efficiency_testing_llama.sh ├── train_ft_seq2seq.sh ├── train_ft.sh ├── train_lora_squad.sh ├── train_lora_squadv2.sh ├── train_lora.sh ├── train_lora_seq2seq.sh ├── train_lora_sft.sh ├── train_lora_wmt.sh ├── adaptpruning_nodistill │ ├── t5_base_lm_adapt_mnli.sh │ ├── t5_base_lm_adapt_sst2.sh │ ├── bert_base_mnli.sh │ ├── bert_base_squad.sh │ ├── roberta_base_squad.sh │ └── bert_base_sst2.sh ├── train_ft_distill_seq2seq.sh ├── train_ft_distill.sh ├── train_lora_distill_squadv2.sh ├── train_lora_distill.sh └── train_lora_distill_seq2seq.sh ├── test ├── test_optimizer_state_passing.py ├── test_salience.py ├── test_rewarmup_lr_scheduling.py ├── test_gpu_base_speed.py ├── test_t5_efficiency.py ├── test_t5_prune_consistency.py ├── test_param_tuning.py ├── test_pruned_teacher_training.py ├── test_param_controller.py └── test_deepspeed_profiler.py ├── figures └── APT_arch.png ├── loralib ├── __init__.py └── utils.py ├── utils └── fisher_utils │ ├── schedule.py │ ├── meter.py │ ├── efficiency │ ├── mem.py │ └── latency.py │ ├── linalg.py │ └── timer.py ├── .gitignore ├── run.sh ├── requirements.txt ├── 
run_glue_multigpu.sh ├── check_param_num.py ├── LICENSE ├── models ├── modeling_outputs.py └── model_args.py ├── eval ├── mmlu │ └── categories.py └── dispatch_openai_requests.py ├── merge_model_lora.py └── evaluate.py /scripts/ft/llama_2_7b_alpaca_gpt4.sh: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/test_optimizer_state_passing.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/adaptpruning/roberta_base_squad_momentum.sh: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /figures/APT_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROIM1998/APT/HEAD/figures/APT_arch.png -------------------------------------------------------------------------------- /loralib/__init__.py: -------------------------------------------------------------------------------- 1 | name = "lora" 2 | 3 | from .layers import * 4 | from .utils import * -------------------------------------------------------------------------------- /scripts/eval/query_alpaca_eval.sh: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY=$NEW_OPENAI_KEY 2 | echo $OPENAI_API_KEY 3 | model_output_path=$1 4 | 5 | alpaca_eval --model_outputs $model_output_path -------------------------------------------------------------------------------- /scripts/tradeoff/mt5_base_lora_tradeoff.sh: -------------------------------------------------------------------------------- 1 | lora_rs=(102 64 32 16 8) 2 | for lora_r in ${lora_rs[@]}; do 3 | echo "lora_r: $lora_r" 4 | bash scripts/lora/mt5_base_wmt_enro.sh 2 16 $lora_r $(($lora_r * 4)) 
import math


def get_pruning_schedule(target, num_iter):
    """Return a geometric schedule of ``num_iter`` values that ends at ``target``.

    With ``p = target ** (1 / num_iter)``, the schedule is
    ``[p ** 1, p ** 2, ..., p ** (num_iter - 1), target]``.  The last entry is
    ``target`` itself rather than ``p ** num_iter`` so the schedule ends on the
    requested value exactly, without floating-point drift.

    Args:
        target: Final value of the schedule.
        num_iter: Number of entries to generate; must be at least 1.

    Returns:
        A list of ``num_iter`` floats.

    Raises:
        ValueError: If ``num_iter`` is smaller than 1.
    """
    # Previously num_iter == 0 surfaced as a bare ZeroDivisionError and a
    # negative num_iter silently produced the one-element list [target].
    if num_iter < 1:
        raise ValueError(f"num_iter must be >= 1, got {num_iter}")
    ratio = math.pow(target, 1 / num_iter)
    return [ratio ** step for step in range(1, num_iter)] + [target]
scripts/adaptpruning_nodistill/llama_2_13b_alpaca_gpt4_preprune.sh 3 | bash scripts/lora/llama_2_7b_alpaca_gpt4.sh 4 | bash scripts/lora/llama_2_13b_alpaca_gpt4.sh 5 | bash scripts/lora/llama_7b_alpaca_cleaned.sh 6 | bash scripts/lora/llama_13b_alpaca_cleaned.sh -------------------------------------------------------------------------------- /scripts/eval_multiple_lora_roberta_mnli.sh: -------------------------------------------------------------------------------- 1 | for mac_constraint in 0.2 0.4 0.6 2 | do 3 | for pruning_frequency in 0.1 0.5 1.5 4 | do 5 | echo "Using mac_constraint ${mac_constraint}, pruning_frequency ${pruning_frequency}" 6 | bash scripts/eval_lora_roberta_mnli.sh ${pruning_frequency} 64 ${mac_constraint} 7 | done 8 | done -------------------------------------------------------------------------------- /scripts/tradeoff/roberta_base_ft_mask_tuning.sh: -------------------------------------------------------------------------------- 1 | mac_constraints=(0.45 0.5 0.55 0.6 0.65 0.7 0.75 0.8 0.85 0.9) 2 | 3 | for mac_constraint in ${mac_constraints[@]}; do 4 | echo "mac_constraint: $mac_constraint" 5 | bash scripts/post_training_prune.sh 'output/roberta-base/sst2/bz32/ft/epoch60/lr2e-5/seed42/best_model' sst2 $mac_constraint 64 6 | done -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.10.0 2 | deepspeed==0.8.0 3 | matplotlib==3.7.1 4 | numpy==1.24.3 5 | ortools==9.6.2534 6 | pandas==1.5.2 7 | scikit_learn==1.1.3 8 | scipy==1.10.1 9 | seaborn==0.12.2 10 | tqdm==4.65.0 11 | transformers==4.28.1 12 | nltk==3.8.1 13 | rouge-score==0.1.2 14 | torch==1.10.2+cu113 15 | --extra-index-url https://download.pytorch.org/whl/cu113 -------------------------------------------------------------------------------- /scripts/tradeoff/roberta_base_mask_tuning.sh: 
-------------------------------------------------------------------------------- 1 | mac_constraints=(0.45 0.5 0.55 0.6 0.65 0.7 0.75 0.8 0.85 0.9) 2 | 3 | for mac_constraint in ${mac_constraints[@]}; do 4 | echo "mac_constraint: $mac_constraint" 5 | bash scripts/post_training_prune.sh 'output/roberta-base/sst2/bz32/lora/epoch60/lora_r8/lora_alpha16/lr2e-4/seed42/best_model' sst2 $mac_constraint 64 6 | done -------------------------------------------------------------------------------- /scripts/eval/truthfulqa.sh: -------------------------------------------------------------------------------- 1 | # # export CUDA_VISIBLE_DEVICES=0 2 | 3 | # zero-shot 4 | python -m eval.truthfulqa.run_eval \ 5 | --ntrain 0 \ 6 | --data_dir data/eval/truthfulqa \ 7 | --save_dir results/truthfulqa/llama-7B-0shot/ \ 8 | --model_name_or_path /mmfs1/gscratch/cse/yizhongw/llama_checkpoints/7B/ \ 9 | --tokenizer_name_or_path /mmfs1/gscratch/cse/yizhongw/llama_checkpoints/7B/ \ 10 | --eval_batch_size 2 \ 11 | --load_in_8bit \ 12 | --use_chat_format -------------------------------------------------------------------------------- /scripts/test_pruning_efficiency.sh: -------------------------------------------------------------------------------- 1 | output_dir="output/efficiency_testing" 2 | mkdir -p $output_dir 3 | 4 | python test_pruning_efficiency.py \ 5 | --output_dir ${output_dir}\ 6 | --task_name mnli \ 7 | --model_name_or_path roberta-base \ 8 | --do_train \ 9 | --do_eval \ 10 | --max_seq_length 128 \ 11 | --per_device_train_batch_size 16 \ 12 | --per_device_eval_batch_size 16 \ 13 | --apply_lora \ 14 | --lora_alpha 16 \ 15 | --lora_r 8 \ 16 | --report_to none\ -------------------------------------------------------------------------------- /scripts/merge_lora.sh: -------------------------------------------------------------------------------- 1 | model_path=$1 2 | output_dir=$2 3 | task_name=$3 4 | lora_r=$4 5 | lora_alpha=$5 6 | 7 | python merge_model_lora.py \ 8 | --output_dir 
class AverageMeter:
    """Track the most recent value and the running weighted mean of a metric.

    Attributes set by ``reset``/``update``:
        val: The value passed to the latest ``update`` call.
        sum: Weighted sum of all values seen since the last reset.
        count: Total weight (sum of ``n``) seen since the last reset.
        avg: ``sum / count``, or 0 while ``count`` is 0.
    """

    def __init__(self, name):
        """Create a meter labelled ``name`` with all statistics zeroed."""
        self.name = name
        self.reset()

    def reset(self):
        """Zero every running statistic."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean.

        Args:
            val: The new observation (e.g. a per-batch mean).
            n: Weight of the observation (e.g. the batch size).
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. an empty batch recorded first) used to
        # raise ZeroDivisionError; keep the mean at 0 until real data arrives.
        self.avg = self.sum / self.count if self.count else 0

    def __str__(self):
        """Format as ``"<name>: <avg>"`` with four decimal places."""
        return f"{self.name}: {self.avg:.4f}"
--cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=32G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | # Execute the run.sh script 12 | bash scripts/ft/bert_base_sst2.sh -------------------------------------------------------------------------------- /scripts/sbatch_scripts/ft/roberta_base_sst2.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=32G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | # Execute the run.sh script 12 | bash scripts/ft/roberta_base_sst2.sh -------------------------------------------------------------------------------- /scripts/sbatch_scripts/lora/bert_base_sst2.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=32G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | # Execute the run.sh script 12 | bash scripts/lora/bert_base_sst2.sh -------------------------------------------------------------------------------- /scripts/sbatch_scripts/lora/roberta_base_mnli.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH 
--ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=32G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | # Execute the run.sh script 12 | bash scripts/lora/roberta_base_mnli.sh -------------------------------------------------------------------------------- /scripts/sbatch_scripts/lora/roberta_base_squad.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | # Execute the run.sh script 12 | bash scripts/lora/roberta_base_squad.sh -------------------------------------------------------------------------------- /scripts/sbatch_scripts/lora/roberta_base_sst2.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=32G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | # Execute the run.sh script 12 | bash scripts/lora/roberta_base_sst2.sh -------------------------------------------------------------------------------- /run_glue_multigpu.sh: -------------------------------------------------------------------------------- 1 | model_name=roberta_base 2 | 3 | 
gpu_available="0,1,2,3,4,5,6"
# Split the gpu_available string into an array
gpu_ids=(${gpu_available//,/ })

# One GLUE task per GPU, paired by index with gpu_ids.
task_names=(sst2 stsb qqp mnli cola mrpc qnli)
# MAC constraint shared by every launched run (first argument of the task script).
mac_constraint=0.4

# For each GPU, launch a different task in the background with the same mac_constraint.
# The per-iteration task is kept in its own scalar: the original reused the array's
# name, which only worked because bash maps an unsubscripted name to element 0.
for i in "${!gpu_ids[@]}"; do
    gpu_id=${gpu_ids[$i]}
    task_name=${task_names[$i]}
    echo "Running ${task_name} on GPU $gpu_id with mac_constraint $mac_constraint"
    bash "scripts/adaptpruning_nodistill/roberta_base_${task_name}.sh" "$mac_constraint" 8 16 cubic_gradual global free_inout "$gpu_id" &
done
-------------------------------------------------------------------------------- /scripts/sbatch_scripts/elastictuning/roberta_base_squadv2_selfmomentum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=48:00:00 # Walltime (hh:mm:ss) 10 | 11 | # Execute the run.sh script 12 | bash scripts/adaptpruning/roberta_base_squadv2_momentum.sh -------------------------------------------------------------------------------- /scripts/sbatch_scripts/elastictuning/t5_base_lm_adapt_sst2_selfmomentum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p ckpt 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gpus=a100:1 # Number of GPUs requested 9 | #SBATCH --time=400:00:00 # Walltime (hh:mm:ss) 10 | 11 | # Execute the run.sh script 12 | bash scripts/adaptpruning/t5_base_lm_adapt_sst2_momentum.sh -------------------------------------------------------------------------------- /scripts/sbatch_scripts/elastictuning/bert_base_squad_selfmomentum_noffnstart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-a100 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total 
from typing import List, Optional

MB = 1024 * 1024


def bert_forward(
    batch_size: int = 32,
    seq_len: int = 128,
    num_heads: Optional[List[int]] = None,
    num_neurons: Optional[List[int]] = None,
    hidden_size: int = 768,
    intermediate_size: int = 3072,
    attn_head_size: int = 64,
    output_hidden_states: bool = True,
    output_attention: bool = False,
    dtype=32,
) -> float:
    """Estimate a size in MiB from per-layer head and neuron counts.

    The estimate is built only from ``num_heads``, ``num_neurons``,
    ``hidden_size``, ``attn_head_size`` and ``dtype``:

    * attention: ``sum(num_heads) * (hidden_size * attn_head_size + 1) * 4``
    * feed-forward: ``sum(num_neurons) * hidden_size * 2 + sum(num_neurons)``

    The element count is multiplied by ``dtype`` (bits per element), divided
    by 8 to get bytes, then by ``MB``.

    NOTE(review): ``batch_size``, ``seq_len``, ``intermediate_size``,
    ``output_hidden_states`` and ``output_attention`` are accepted but never
    read, so despite the name this looks like a weight-size estimate rather
    than an activation-memory estimate -- confirm the intent with the author.

    Args:
        batch_size: Unused.
        seq_len: Unused.
        num_heads: Retained attention heads per layer; defaults to 12 layers
            of 12 heads.
        num_neurons: Retained feed-forward neurons per layer; defaults to 12
            layers of 3072 neurons.
        hidden_size: Model hidden dimension.
        intermediate_size: Unused.
        attn_head_size: Dimension of one attention head.
        output_hidden_states: Unused.
        output_attention: Unused.
        dtype: Bits per element.

    Returns:
        The estimated size in MiB.

    Raises:
        AssertionError: If ``num_heads`` and ``num_neurons`` differ in length.
    """
    # Build defaults per call instead of sharing one mutable list object
    # across every invocation (the classic mutable-default-argument hazard).
    if num_heads is None:
        num_heads = [12] * 12
    if num_neurons is None:
        num_neurons = [3072] * 12
    assert len(num_heads) == len(num_neurons), (
        "num_heads and num_neurons must describe the same number of layers"
    )
    total_heads = sum(num_heads)
    total_neurons = sum(num_neurons)
    mha_size = total_heads * ((hidden_size * attn_head_size) + 1) * 4
    ffn_size = total_neurons * hidden_size * 2 + total_neurons
    total = mha_size + ffn_size
    return total * dtype / 8 / MB
| #!/bin/bash 2 | #SBATCH -p gpu-a100 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=32G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | # Load the conda environment 12 | cd ../../ # cd to the root directory of the project 13 | 14 | # Execute the run.sh script 15 | bash scripts/adaptpruning/roberta_base_sst2_selfmomentum_noffnstart.sh -------------------------------------------------------------------------------- /scripts/post_training_prune.sh: -------------------------------------------------------------------------------- 1 | model_path=$1 2 | task_name=$2 3 | mac_constraint=$3 4 | num_batches=$4 5 | if [ "$#" -eq 5 ]; then 6 | lora_alpha=$5 7 | else 8 | lora_alpha=16 9 | fi 10 | output_dir="${model_path}/new_pruned/constraint_${mac_constraint}/batches_${num_batches}" 11 | 12 | python post_training_prune.py \ 13 | --output_dir ${output_dir}\ 14 | --model_name_or_path ${model_path} \ 15 | --task_name ${task_name} \ 16 | --do_train \ 17 | --do_eval \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size 32 \ 20 | --per_device_eval_batch_size 32 \ 21 | --pruning_batch_size 32 \ 22 | --pruning_batches ${num_batches} \ 23 | --mac_constraint $3 \ 24 | --lora_alpha ${lora_alpha} \ -------------------------------------------------------------------------------- /scripts/post_training_squad_prune.sh: -------------------------------------------------------------------------------- 1 | model_path=$1 2 | task_name=$2 3 | mac_constraint=$3 4 | num_batches=$4 5 | if [ "$#" -eq 5 ]; then 6 | lora_alpha=$5 7 | else 8 | lora_alpha=16 9 | fi 10 | output_dir="${model_path}/pruned/constraint_${mac_constraint}/batches_${num_batches}" 11 | 12 | python post_training_squad_prune.py \ 13 | --output_dir 
${output_dir}\ 14 | --model_name_or_path ${model_path} \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_length 384 \ 18 | --doc_stride 128 \ 19 | --version_2_with_negative \ 20 | --per_device_train_batch_size 32 \ 21 | --per_device_eval_batch_size 32 \ 22 | --pruning_batch_size 32 \ 23 | --pruning_batches ${num_batches} \ 24 | --mac_constraint $3 \ 25 | --lora_alpha ${lora_alpha} \ -------------------------------------------------------------------------------- /scripts/main_results/bert_glue_big_momentum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-a100 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=32G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=200:00:00 # Walltime (hh:mm:ss) 10 | 11 | constraint=$1 12 | task_names=(qnli qqp mnli sst2) 13 | 14 | for task in ${task_names[@]}; do 15 | bash scripts/adaptpruning/bert_base_${task}_momentum.sh $constraint 8 -1 cubic_gradual running_fisher running_fisher self_momentum dynamic_block_teacher_dynamic_student 16 | done -------------------------------------------------------------------------------- /scripts/post_training_cnndm_prune.sh: -------------------------------------------------------------------------------- 1 | model_path=$1 2 | mac_constraint=$2 3 | num_batches=$3 4 | if [ "$#" -eq 5 ]; then 5 | lora_alpha=$5 6 | else 7 | lora_alpha=16 8 | fi 9 | output_dir="${model_path}/pruned/constraint_${mac_constraint}/batches_${num_batches}" 10 | 11 | python post_training_seq2seq_prune.py \ 12 | --output_dir ${output_dir}\ 13 | --model_name_or_path ${model_path} \ 14 | --do_train \ 15 | --do_eval \ 16 | --task_name cnndm \ 17 | --max_input_length 512 \ 18 | --max_target_length 128 \ 19 | --per_device_train_batch_size 32 \ 
import torch


@torch.no_grad()
def closed_form_solver(A, B):
    """Solve ``A @ X = B`` directly.

    When ``B`` has a single row the system is treated as scalar and solved by
    dividing by ``A[0, 0]``; otherwise ``A`` is inverted explicitly.

    Args:
        A: Square coefficient tensor.
        B: Right-hand-side tensor whose first dimension matches ``A``.

    Returns:
        The solution tensor ``X``.
    """
    if B.shape[0] == 1:
        X = B / A[0, 0]
    else:
        # NOTE: for safety, compute matrix inverse on CPU
        X = torch.inverse(A.cpu()).to(A.device) @ B
    return X


@torch.no_grad()
def lsmr_cupy_solver(A, B):
    """Solve ``A @ X = B`` with LSMR, expressed as an offset from all-ones.

    The right-hand side is shifted by ``A.sum(dim=1)`` (i.e. ``A @ 1``), the
    shifted system is solved for the offset, and 1 is added back to the
    result.  Systems with more than one row go through cupy's ``lsmr`` with
    ``damp=1``; a single-row system is solved by dividing by ``A[0, 0]``.

    Args:
        A: Coefficient tensor.
        B: Right-hand-side tensor whose first dimension matches ``A``.

    Returns:
        A tuple ``(X, success)``.  On the LSMR path ``success`` is whether the
        solver's stop code (its second return value) is below 3; on the
        single-row path it is whether the division produced finite values.
    """
    B = B - A.sum(dim=1)
    if B.shape[0] == 1:
        X = B / A[0, 0]
        # This branch never ran LSMR, so there is no stop code to inspect.
        # The original fell through to `solution[1]` and raised NameError.
        success = bool(torch.isfinite(X).all())
    else:
        # Imported lazily so the single-row path and closed_form_solver stay
        # usable on machines without cupy installed.
        import cupy
        from cupyx.scipy.sparse.linalg import lsmr

        CU_A = cupy.asarray(A.cpu().numpy())
        CU_B = cupy.asarray(B.cpu().numpy())
        solution = lsmr(CU_A, CU_B, damp=1)
        X = cupy.asnumpy(solution[0])
        X = torch.from_numpy(X).to(A.device)
        success = solution[1] < 3
    X = X + 1
    return X, success
/scripts/test_random_prune.sh: -------------------------------------------------------------------------------- 1 | if [ "$#" -eq 0 ]; then 2 | model_name_or_path='roberta-base' 3 | test_mode='stability' 4 | mask_mode='all' 5 | elif [ "$#" -eq 1 ]; then 6 | model_name_or_path=$1 7 | test_mode='stability' 8 | mask_mode='all' 9 | elif [ "$#" -eq 3 ]; then 10 | model_name_or_path=$1 11 | test_mode=$2 12 | mask_mode=$3 13 | fi 14 | 15 | output_dir='./output/test_prune/' 16 | mkdir -p $output_dir 17 | 18 | python run_pruning.py \ 19 | --output_dir ${output_dir}\ 20 | --task_name mnli \ 21 | --model_name_or_path ${model_name_or_path} \ 22 | --max_seq_length 128 \ 23 | --per_device_train_batch_size 32 \ 24 | --per_device_eval_batch_size 32 \ 25 | --report_to none\ 26 | --do_eval\ 27 | --test_mode ${test_mode}\ 28 | --mask_mode ${mask_mode}\ 29 | --ratio_bound 1.\ 30 | --ratio_step 0.01\ 31 | --apply_lora\ -------------------------------------------------------------------------------- /scripts/ft/bert_base_squad.sh: -------------------------------------------------------------------------------- 1 | model_name="bert-base-uncased" 2 | task_name="squad" 3 | num_epochs=10 4 | learning_rate=1e-5 5 | batch_size=48 6 | output_dir="output/${model_name}_${task_name}_full/epoch${num_epochs}/bz${batch_size}" 7 | 8 | 9 | echo $output_dir 10 | mkdir -p $output_dir 11 | 12 | 13 | python run_minus_squad_training.py \ 14 | --output_dir ${output_dir}\ 15 | --model_name_or_path ${model_name} \ 16 | --do_train \ 17 | --do_eval \ 18 | --save_strategy no \ 19 | --logging_strategy epoch \ 20 | --evaluation_strategy epoch \ 21 | --max_seq_length 384 \ 22 | --doc_stride 128 \ 23 | --num_train_epochs ${num_epochs} \ 24 | --per_device_train_batch_size ${batch_size} \ 25 | --per_device_eval_batch_size ${batch_size} \ 26 | --warmup_ratio 0.06\ 27 | --learning_rate ${learning_rate}\ 28 | --weight_decay 0.1\ 29 | --report_to none 
-------------------------------------------------------------------------------- /scripts/hyperparameter_searching/test_cutoff_prune_step.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=3 2 | 3 | for pruning_frequency in 0.1 0.5 1.0 1.5 4 | do 5 | output_dir="output/roberta_lora_minus_mnli_cutoff/freq${pruning_frequency}/batchuse64/mac0.6/" 6 | mkdir -p $output_dir 7 | 8 | python run_minus_training.py \ 9 | --output_dir ${output_dir}\ 10 | --task_name mnli \ 11 | --model_name_or_path roberta-base \ 12 | --do_train \ 13 | --do_eval \ 14 | --max_seq_length 128 \ 15 | --per_device_train_batch_size 32 \ 16 | --per_device_eval_batch_size 32 \ 17 | --apply_lora \ 18 | --lora_alpha 16 \ 19 | --lora_r 8 \ 20 | --report_to none\ 21 | --pruning_batches 64 \ 22 | --mac_constraint 0.6 \ 23 | --pruning_frequency ${pruning_frequency}\ 24 | --pruning_scheduler cutoff 25 | done -------------------------------------------------------------------------------- /scripts/prepare_data.sh: -------------------------------------------------------------------------------- 1 | mkdir -p data/sft 2 | mkdir -p data/eval 3 | 4 | # Download alpaca data for sft 5 | wget -O data/sft/alpaca_data.json https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json 6 | 7 | # Download MMLU eval data 8 | 9 | # MMLU dataset 10 | wget -O data/eval/mmlu_data.tar https://people.eecs.berkeley.edu/~hendrycks/data.tar 11 | mkdir -p data/eval/mmlu_data 12 | tar -xvf data/eval/mmlu_data.tar -C data/eval/mmlu_data 13 | mv data/eval/mmlu_data/data data/eval/mmlu && rm -r data/eval/mmlu_data data/eval/mmlu_data.tar 14 | 15 | # TruthfulQA dataset, open-ended and multiple-choice versions 16 | mkdir -p data/eval/truthfulqa 17 | wget -O data/eval/truthfulqa/truthfulqa.csv https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/data/v0/TruthfulQA.csv 18 | wget -O data/eval/truthfulqa/truthfulqa_mc.json 
https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/data/mc_task.json -------------------------------------------------------------------------------- /scripts/eval_lora_roberta_mnli.sh: -------------------------------------------------------------------------------- 1 | if [ "$#" -eq 0 ]; then 2 | pruning_frequency=0.1 3 | pruning_batches=64 4 | mac_constraint=0.6 5 | elif [ "$#" -eq 3 ]; then 6 | pruning_frequency=$1 7 | pruning_batches=$2 8 | mac_constraint=$3 9 | fi 10 | 11 | model_dir="output/roberta_lora_minus_mnli/freq${pruning_frequency}/batchuse${pruning_batches}/mac${mac_constraint}/" 12 | 13 | python run_minus_training.py \ 14 | --output_dir ${model_dir}\ 15 | --task_name mnli \ 16 | --model_name_or_path "./${model_dir}" \ 17 | --do_eval \ 18 | --max_seq_length 128 \ 19 | --per_device_train_batch_size 32 \ 20 | --per_device_eval_batch_size 32 \ 21 | --apply_lora \ 22 | --lora_alpha 16 \ 23 | --lora_r 8 \ 24 | --report_to none\ 25 | --pruning_frequency ${pruning_frequency}\ 26 | --pruning_batches ${pruning_batches} \ 27 | --mac_constraint ${mac_constraint} -------------------------------------------------------------------------------- /scripts/post_training_wmt_prune.sh: -------------------------------------------------------------------------------- 1 | model_path=$1 2 | mac_constraint=$2 3 | num_batches=$3 4 | 5 | source_lang=en 6 | target_lang=ro 7 | task_name=wmt16 8 | 9 | lora_alpha=16 10 | output_dir="${model_path}/pruned/constraint_${mac_constraint}/batches_${num_batches}" 11 | 12 | python post_training_seq2seq_prune.py \ 13 | --output_dir ${output_dir}\ 14 | --model_name_or_path ${model_path} \ 15 | --do_train \ 16 | --do_eval \ 17 | --task_name ${task_name} \ 18 | --max_input_length 256 \ 19 | --max_target_length 256 \ 20 | --lang_pair ${target_lang}-${source_lang} \ 21 | --source_lang ${source_lang} \ 22 | --target_lang ${target_lang} \ 23 | --per_device_train_batch_size 8 \ 24 | --per_device_eval_batch_size 8 \ 25 | --tf32 True \ 26 | 
--pruning_batch_size 32 \ 27 | --pruning_batches ${num_batches} \ 28 | --mac_constraint ${mac_constraint} \ 29 | --lora_alpha ${lora_alpha} \ -------------------------------------------------------------------------------- /utils/fisher_utils/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | 5 | 6 | class CPUTimer: 7 | def __init__(self, timelogs): 8 | self.timelogs = timelogs 9 | 10 | def __enter__(self): 11 | self.start = time.time() 12 | 13 | def __exit__(self, type, value, traceback): 14 | end = time.time() 15 | self.timelogs.append((end - self.start) * 1000) # ms 16 | 17 | 18 | class GPUTimer: 19 | def __init__(self, timelogs): 20 | self.timelogs = timelogs 21 | 22 | def __enter__(self): 23 | self.start_event = torch.cuda.Event(enable_timing=True) 24 | self.end_event = torch.cuda.Event(enable_timing=True) 25 | self.start_event.record() 26 | 27 | def __exit__(self, type, value, traceback): 28 | self.end_event.record() 29 | self.end_event.synchronize() 30 | elapsed_time = self.start_event.elapsed_time(self.end_event) 31 | self.timelogs.append(elapsed_time) 32 | -------------------------------------------------------------------------------- /check_param_num.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import torch 4 | from tqdm import tqdm 5 | 6 | if __name__ == '__main__': 7 | root = sys.argv[1] 8 | weights = [os.path.join(root, v) for v in os.listdir(root) if v.endswith('.bin') and 'arg' not in v] 9 | total_param_nums = 0 10 | param_nums = 0 11 | for weight in tqdm(weights): 12 | state_dict = torch.load(weight, map_location='cpu') 13 | for k, v in state_dict.items(): 14 | if 'lora' in k or 'transform' in k: 15 | continue 16 | total_param_nums += v.numel() 17 | if 'lm_head' in k or 'embed' in k or 'shared' in k or 'classifier' in k or 'pooler' in k or 'qa_output' in k: 18 | print("Excluding %s with number of 
params %d" % (k, v.numel())) 19 | continue 20 | param_nums += v.numel() 21 | 22 | print("Total param nums: {}".format(total_param_nums)) 23 | print("Encoder/decoder param nums: {}".format(param_nums)) -------------------------------------------------------------------------------- /scripts/ablation/roberta_base_sst2_distillation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=300:00:00 # Walltime (hh:mm:ss) 10 | 11 | 12 | distill_mapping_strategies=(static_teacher_dynamic_cofi_student static_teacher_dynamic_student static_teacher_static_student dynamic_block_teacher_dynamic_cofi_student dynamic_block_teacher_static_student) 13 | 14 | for distill_mapping_strategy in ${distill_mapping_strategies[@]}; do 15 | echo "distill_mapping_strategy: $distill_mapping_strategy" 16 | bash scripts/adaptpruning/roberta_base_sst2_momentum.sh 0.4 8 -1 cubic_gradual running_fisher running_fisher self_momentum $distill_mapping_strategy 17 | done -------------------------------------------------------------------------------- /scripts/hyperparameter_searching/test_throughout_prune.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=2 2 | 3 | for mac_constraint in 0.4 0.5 0.6 4 | do 5 | for pruning_frequency in 0.1 0.5 1.5 6 | do 7 | output_dir="output/roberta_lora_minus_mnli/freq${pruning_frequency}/batchuse64/mac${mac_constraint}/" 8 | mkdir -p $output_dir 9 | 10 | python run_minus_training.py \ 11 | --output_dir ${output_dir}\ 12 | --task_name mnli \ 13 | --model_name_or_path roberta-base \ 14 | --do_train \ 15 | --do_eval \ 
16 | --max_seq_length 128 \ 17 | --per_device_train_batch_size 32 \ 18 | --per_device_eval_batch_size 32 \ 19 | --apply_lora \ 20 | --lora_alpha 16 \ 21 | --lora_r 8 \ 22 | --report_to none\ 23 | --pruning_frequency ${pruning_frequency}\ 24 | --pruning_batches 64 \ 25 | --mac_constraint ${mac_constraint} \ 26 | --pruning_scheduler none 27 | done 28 | done -------------------------------------------------------------------------------- /scripts/merge_llama_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p ckpt 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gpus=a100:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | model_path=$1 12 | output_dir=$2 13 | lora_r=$3 14 | lora_alpha=$4 15 | 16 | python merge_llama_model_lora.py \ 17 | --output_dir ${output_dir}\ 18 | --model_name_or_path ${model_path} \ 19 | --task_name alpaca_gpt4 \ 20 | --do_train \ 21 | --do_eval \ 22 | --bf16 True \ 23 | --data_path 'data/sft/alpaca_data_gpt4.json' \ 24 | --model_max_length 512 \ 25 | --per_device_train_batch_size 4 \ 26 | --per_device_eval_batch_size 4 \ 27 | --tf32 True \ 28 | --apply_lora \ 29 | --lora_r ${lora_r} \ 30 | --lora_alpha ${lora_alpha} -------------------------------------------------------------------------------- /scripts/ablation/roberta_base_mnli_distillation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | 
#SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=300:00:00 # Walltime (hh:mm:ss) 10 | 11 | 12 | distill_mapping_strategies=(dynamic_block_teacher_dynamic_student dynamic_block_teacher_dynamic_cofi_student static_teacher_dynamic_student none static_teacher_dynamic_cofi_student static_teacher_static_student dynamic_block_teacher_static_student) 13 | 14 | for distill_mapping_strategy in ${distill_mapping_strategies[@]}; do 15 | echo "distill_mapping_strategy: $distill_mapping_strategy" 16 | bash scripts/adaptpruning/roberta_base_mnli_momentum.sh 0.4 8 -1 cubic_gradual running_fisher running_fisher self_momentum $distill_mapping_strategy 17 | done -------------------------------------------------------------------------------- /scripts/ft/t5_base_lm_adapt_sst2.sh: -------------------------------------------------------------------------------- 1 | model_name="google/t5-base-lm-adapt" 2 | task_name="sst2" 3 | 4 | if [ "$#" -eq 0 ]; then 5 | epoch=10 6 | batch_size=32 7 | learning_rate=1e-4 8 | seed=128 9 | elif [ "$#" -eq 4 ]; then 10 | epoch=$1 11 | batch_size=$2 12 | learning_rate=$3 13 | seed=$4 14 | fi 15 | 16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/" 17 | 18 | 19 | echo $output_dir 20 | mkdir -p $output_dir 21 | 22 | 23 | python run_minus_training.py \ 24 | --output_dir ${output_dir}\ 25 | --task_name ${task_name} \ 26 | --model_name_or_path ${model_name} \ 27 | --do_train \ 28 | --do_eval \ 29 | --save_strategy epoch \ 30 | --evaluation_strategy epoch \ 31 | --logging_strategy epoch \ 32 | --max_seq_length 128 \ 33 | --num_train_epochs ${epoch} \ 34 | --per_device_train_batch_size ${batch_size} \ 35 | --per_device_eval_batch_size ${batch_size} \ 36 | --warmup_ratio 0.06\ 37 | --learning_rate ${learning_rate}\ 38 | --weight_decay 0.1\ 39 | --seed ${seed} \ 40 | --report_to none -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Bowen Zhao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 |
--------------------------------------------------------------------------------
/scripts/ft/t5_base_xsum.sh:
--------------------------------------------------------------------------------
1 | # Fine-tune t5-base on xsum with run_minus_seq2seq_training.py.
2 | # Usage: bash scripts/ft/t5_base_xsum.sh [num_epochs batch_size learning_rate seed]
3 | model_name="t5-base"
4 | task_name="xsum"
5 |
6 | if [ "$#" -eq 0 ]; then
7 |     num_epochs=10
8 |     batch_size=16
9 |     learning_rate=1e-4
10 |     seed=128
11 | elif [ "$#" -eq 4 ]; then
12 |     num_epochs=$1
13 |     batch_size=$2
14 |     learning_rate=$3
15 |     seed=$4
16 | else
17 |     # Any other argument count would leave the hyperparameters unset.
18 |     echo "Usage: $0 [num_epochs batch_size learning_rate seed]" >&2
19 |     exit 1
20 | fi
21 |
22 | # The epoch count lives in num_epochs; the path used to interpolate the
23 | # never-assigned ${epoch}, so every run was written to ".../epoch/...".
24 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${num_epochs}/lr${learning_rate}/seed${seed}/"
25 |
26 |
27 | echo $output_dir
28 | mkdir -p $output_dir
29 |
30 |
31 | # --seed is forwarded so the run matches the seed encoded in output_dir
32 | # (NOTE(review): assumes the training script accepts --seed like the other
33 | # ft launchers' entry points do -- confirm).
34 | python run_minus_seq2seq_training.py \
35 |     --output_dir ${output_dir}\
36 |     --task_name ${task_name} \
37 |     --model_name_or_path ${model_name} \
38 |     --do_train \
39 |     --do_eval \
40 |     --save_strategy epoch \
41 |     --evaluation_strategy epoch \
42 |     --logging_strategy epoch \
43 |     --max_input_length 512 \
44 |     --max_target_length 128 \
45 |     --num_train_epochs ${num_epochs} \
46 |     --per_device_train_batch_size ${batch_size} \
47 |     --per_device_eval_batch_size ${batch_size} \
48 |     --warmup_ratio 0.06\
49 |     --learning_rate ${learning_rate}\
50 |     --weight_decay 0.1\
51 |     --seed ${seed} \
52 |     --report_to none \
53 |     | tee ${output_dir}/log.txt
--------------------------------------------------------------------------------
/scripts/ft/t5_base_lm_adapt_cnndm.sh:
--------------------------------------------------------------------------------
1 | # Fine-tune google/t5-base-lm-adapt on cnndm with run_minus_seq2seq_training.py.
2 | # Usage: bash scripts/ft/t5_base_lm_adapt_cnndm.sh [num_epochs batch_size learning_rate seed]
3 | model_name="google/t5-base-lm-adapt"
4 | task_name="cnndm"
5 |
6 | if [ "$#" -eq 0 ]; then
7 |     num_epochs=10
8 |     batch_size=16
9 |     learning_rate=1e-4
10 |     seed=128
11 | elif [ "$#" -eq 4 ]; then
12 |     num_epochs=$1
13 |     batch_size=$2
14 |     learning_rate=$3
15 |     seed=$4
16 | else
17 |     # Any other argument count would leave the hyperparameters unset.
18 |     echo "Usage: $0 [num_epochs batch_size learning_rate seed]" >&2
19 |     exit 1
20 | fi
21 |
22 | # The epoch count lives in num_epochs; the path used to interpolate the
23 | # never-assigned ${epoch}, so every run was written to ".../epoch/...".
24 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${num_epochs}/lr${learning_rate}/seed${seed}/"
25 |
26 |
27 | echo $output_dir
28 | mkdir -p $output_dir
29 |
30 |
31 | # --seed is forwarded so the run matches the seed encoded in output_dir
32 | # (NOTE(review): assumes the training script accepts --seed like the other
33 | # ft launchers' entry points do -- confirm).
34 | python run_minus_seq2seq_training.py \
35 |     --output_dir ${output_dir}\
36 |     --task_name ${task_name} \
37 |     --model_name_or_path ${model_name} \
38 |     --do_train \
39 |     --do_eval \
40 |     --save_strategy epoch \
41 |     --evaluation_strategy epoch \
42 |     --logging_strategy epoch \
43 |     --max_input_length 512 \
44 |     --max_target_length 128 \
45 |     --num_train_epochs ${num_epochs} \
46 |     --per_device_train_batch_size ${batch_size} \
47 |     --per_device_eval_batch_size ${batch_size} \
48 |     --warmup_ratio 0.06\
49 |     --learning_rate ${learning_rate}\
50 |     --weight_decay 0.1\
51 |     --seed ${seed} \
52 |     --report_to none \
53 |     | tee ${output_dir}/log.txt
--------------------------------------------------------------------------------
/scripts/ft/bert_base_mnli.sh:
--------------------------------------------------------------------------------
1 | # Fine-tune bert-base-uncased on mnli with run_minus_training.py.
2 | # Usage: bash scripts/ft/bert_base_mnli.sh [epoch batch_size learning_rate seed]
3 | model_name="bert-base-uncased"
4 | task_name="mnli"
5 |
6 | if [ "$#" -eq 0 ]; then
7 |     epoch=10
8 |     batch_size=32
9 |     learning_rate=2e-5
10 |     seed=128
11 | elif [ "$#" -eq 4 ]; then
12 |     epoch=$1
13 |     batch_size=$2
14 |     learning_rate=$3
15 |     seed=$4
16 | else
17 |     # Any other argument count would leave the hyperparameters unset.
18 |     echo "Usage: $0 [epoch batch_size learning_rate seed]" >&2
19 |     exit 1
20 | fi
21 |
22 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
23 |
24 |
25 | echo $output_dir
26 | mkdir -p $output_dir
27 |
28 |
29 | # --seed is forwarded (as scripts/ft/roberta_base_mnli.sh already does) so the
30 | # run actually uses the seed that names its output directory.
31 | python run_minus_training.py \
32 |     --output_dir ${output_dir}\
33 |     --task_name ${task_name} \
34 |     --model_name_or_path ${model_name} \
35 |     --do_train \
36 |     --do_eval \
37 |     --save_strategy epoch \
38 |     --logging_strategy steps \
39 |     --evaluation_strategy steps \
40 |     --log_level info \
41 |     --log_level_replica info \
42 |     --logging_steps 1000 \
43 |     --eval_steps 5000 \
44 |     --max_seq_length 128 \
45 |     --num_train_epochs ${epoch} \
46 |     --per_device_train_batch_size ${batch_size} \
47 |     --per_device_eval_batch_size ${batch_size} \
48 |     --warmup_ratio 0.06\
49 |     --learning_rate ${learning_rate}\
50 |     --weight_decay 0.1\
51 |     --seed ${seed} \
52 |     --report_to none
--------------------------------------------------------------------------------
/scripts/eval/alpaca_eval.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p ckpt 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gpus=a100:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | model_name_or_path=$1 12 | if [ -d "$model_name_or_path" ]; then 13 | output_dir="${model_name_or_path}/alpaca_eval" 14 | else 15 | output_dir="output/${model_name_or_path}/alpaca_eval" 16 | fi 17 | echo $output_dir 18 | mkdir -p $output_dir 19 | 20 | training_batch_size=4 21 | 22 | python run_alpaca_eval.py \ 23 | --output_dir ${output_dir}\ 24 | --task_name alpaca_eval \ 25 | --model_name_or_path ${model_name_or_path} \ 26 | --bf16 True \ 27 | --data_path 'data/eval/alpaca/alpaca_eval.json' \ 28 | --do_train \ 29 | --do_eval \ 30 | --model_max_length 512 \ 31 | --per_device_train_batch_size ${training_batch_size} \ 32 | --per_device_eval_batch_size ${training_batch_size} \ 33 | --tf32 True -------------------------------------------------------------------------------- /scripts/ft/roberta_base_squad.sh: -------------------------------------------------------------------------------- 1 | model_name="roberta-base" 2 | task_name="squad" 3 | 4 | if [ "$#" -eq 0 ]; then 5 | epoch=10 6 | batch_size=32 7 | learning_rate=2e-5 8 | seed=128 9 | elif [ "$#" -eq 4 ]; then 10 | epoch=$1 11 | batch_size=$2 12 | learning_rate=$3 13 | seed=$4 14 | fi 15 | 16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/" 17 | 18 | echo $output_dir 19 | mkdir -p $output_dir 20 | 21 | 22 | python run_minus_squad_training.py \ 23 | --output_dir ${output_dir}\ 24 | --model_name_or_path ${model_name} \ 25 | --do_train \ 26 | --do_eval \ 27 | --save_strategy 
epoch \ 28 | --logging_strategy steps \ 29 | --evaluation_strategy steps \ 30 | --log_level info \ 31 | --log_level_replica info \ 32 | --logging_steps 100 \ 33 | --eval_steps 500 \ 34 | --max_seq_length 384 \ 35 | --doc_stride 128 \ 36 | --num_train_epochs ${epoch} \ 37 | --per_device_train_batch_size ${batch_size} \ 38 | --per_device_eval_batch_size ${batch_size} \ 39 | --warmup_ratio 0.06\ 40 | --learning_rate ${learning_rate}\ 41 | --weight_decay 0.1\ 42 | --seed ${seed} \ 43 | --report_to none -------------------------------------------------------------------------------- /scripts/ft/roberta_base_sst2.sh: -------------------------------------------------------------------------------- 1 | model_name="roberta-base" 2 | task_name="sst2" 3 | 4 | if [ "$#" -eq 0 ]; then 5 | epoch=10 6 | batch_size=32 7 | learning_rate=2e-5 8 | seed=128 9 | elif [ "$#" -eq 4 ]; then 10 | epoch=$1 11 | batch_size=$2 12 | learning_rate=$3 13 | seed=$4 14 | fi 15 | 16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/" 17 | 18 | 19 | echo $output_dir 20 | mkdir -p $output_dir 21 | 22 | 23 | python run_minus_training.py \ 24 | --output_dir ${output_dir}\ 25 | --task_name ${task_name} \ 26 | --model_name_or_path ${model_name} \ 27 | --do_train \ 28 | --do_eval \ 29 | --save_strategy epoch \ 30 | --logging_strategy steps \ 31 | --evaluation_strategy steps \ 32 | --log_level info \ 33 | --log_level_replica info \ 34 | --logging_steps 100 \ 35 | --eval_steps 500 \ 36 | --max_seq_length 128 \ 37 | --num_train_epochs ${epoch} \ 38 | --per_device_train_batch_size ${batch_size} \ 39 | --per_device_eval_batch_size ${batch_size} \ 40 | --warmup_ratio 0.06\ 41 | --learning_rate ${learning_rate}\ 42 | --weight_decay 0.1\ 43 | --seed ${seed} \ 44 | --report_to none -------------------------------------------------------------------------------- /scripts/hyperparameter_searching/test_once_rescaled.sh: 
-------------------------------------------------------------------------------- 1 | for mac_constraint in 0.3 0.8 2 | do 3 | for steppoint in 0.25 0.5 0.75 1.0 4 | do 5 | output_dir="output/roberta_lora_minus_mnli_once_rescaled/step${steppoint}/batchuse${pruning_batches}/mac${mac_constraint}/" 6 | mkdir -p $output_dir 7 | 8 | python run_minus_training.py \ 9 | --output_dir ${output_dir}\ 10 | --task_name mnli \ 11 | --model_name_or_path roberta-base \ 12 | --do_train \ 13 | --do_eval \ 14 | --minus_scheduler \ 15 | --save_strategy no \ 16 | --max_seq_length 128 \ 17 | --per_device_train_batch_size 32 \ 18 | --per_device_eval_batch_size 32 \ 19 | --lr_scheduler_type linear\ 20 | --warmup_ratio 0.06\ 21 | --learning_rate 5e-4\ 22 | --weight_decay 0.1\ 23 | --apply_lora \ 24 | --lora_alpha 16 \ 25 | --lora_r 8 \ 26 | --report_to none \ 27 | --pruning_batches 64 \ 28 | --mac_constraint ${mac_constraint} \ 29 | --pruning_scheduler once \ 30 | --pruning_start ${steppoint} 31 | done 32 | done -------------------------------------------------------------------------------- /scripts/ft/bert_base_sst2.sh: -------------------------------------------------------------------------------- 1 | model_name="bert-base-uncased" 2 | task_name="sst2" 3 | 4 | if [ "$#" -eq 0 ]; then 5 | epoch=10 6 | batch_size=32 7 | learning_rate=2e-5 8 | seed=128 9 | elif [ "$#" -eq 4 ]; then 10 | epoch=$1 11 | batch_size=$2 12 | learning_rate=$3 13 | seed=$4 14 | fi 15 | 16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/" 17 | 18 | 19 | echo $output_dir 20 | mkdir -p $output_dir 21 | 22 | 23 | python run_minus_training.py \ 24 | --output_dir ${output_dir}\ 25 | --task_name ${task_name} \ 26 | --model_name_or_path ${model_name} \ 27 | --do_train \ 28 | --do_eval \ 29 | --save_strategy epoch \ 30 | --logging_strategy steps \ 31 | --evaluation_strategy steps \ 32 | --log_level info \ 33 | --log_level_replica info \ 34 | --logging_steps 
100 \ 35 | --eval_steps 500 \ 36 | --max_seq_length 128 \ 37 | --num_train_epochs ${epoch} \ 38 | --per_device_train_batch_size ${batch_size} \ 39 | --per_device_eval_batch_size ${batch_size} \ 40 | --warmup_ratio 0.06\ 41 | --learning_rate ${learning_rate}\ 42 | --weight_decay 0.1\ 43 | --seed ${seed} \ 44 | --report_to none -------------------------------------------------------------------------------- /scripts/ft/roberta_base_mnli.sh: -------------------------------------------------------------------------------- 1 | model_name="roberta-base" 2 | task_name="mnli" 3 | 4 | if [ "$#" -eq 0 ]; then 5 | epoch=10 6 | batch_size=32 7 | learning_rate=2e-5 8 | seed=128 9 | elif [ "$#" -eq 4 ]; then 10 | epoch=$1 11 | batch_size=$2 12 | learning_rate=$3 13 | seed=$4 14 | fi 15 | 16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/" 17 | 18 | 19 | echo $output_dir 20 | mkdir -p $output_dir 21 | 22 | 23 | python run_minus_training.py \ 24 | --output_dir ${output_dir}\ 25 | --task_name ${task_name} \ 26 | --model_name_or_path ${model_name} \ 27 | --do_train \ 28 | --do_eval \ 29 | --save_strategy epoch \ 30 | --logging_strategy steps \ 31 | --evaluation_strategy steps \ 32 | --log_level info \ 33 | --log_level_replica info \ 34 | --logging_steps 1000 \ 35 | --eval_steps 5000 \ 36 | --max_seq_length 128 \ 37 | --num_train_epochs ${epoch} \ 38 | --per_device_train_batch_size ${batch_size} \ 39 | --per_device_eval_batch_size ${batch_size} \ 40 | --warmup_ratio 0.06\ 41 | --learning_rate ${learning_rate}\ 42 | --weight_decay 0.1\ 43 | --seed ${seed} \ 44 | --report_to none -------------------------------------------------------------------------------- /scripts/ft/t5_xl_lm_adapt_sst2.sh: -------------------------------------------------------------------------------- 1 | model_name=google/t5-xl-lm-adapt 2 | task_name=sst2 3 | 4 | if [ "$#" -eq 0 ]; then 5 | epoch=10 6 | batch_size=32 7 | learning_rate=1e-3 8 | 
seed=128 9 | elif [ "$#" -eq 4 ]; then 10 | epoch=$1 11 | batch_size=$2 12 | learning_rate=$3 13 | seed=$4 14 | fi 15 | 16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/" 17 | 18 | echo $output_dir 19 | mkdir -p $output_dir 20 | 21 | 22 | python run_minus_training.py \ 23 | --output_dir ${output_dir}\ 24 | --task_name ${task_name} \ 25 | --model_name_or_path ${model_name} \ 26 | --do_train \ 27 | --do_eval \ 28 | --save_strategy epoch \ 29 | --logging_strategy steps \ 30 | --evaluation_strategy steps \ 31 | --log_level info \ 32 | --log_level_replica info \ 33 | --logging_steps 100 \ 34 | --eval_steps 500 \ 35 | --max_seq_length 128 \ 36 | --num_train_epochs ${epoch} \ 37 | --per_device_train_batch_size ${batch_size} \ 38 | --per_device_eval_batch_size ${batch_size} \ 39 | --warmup_ratio 0.06\ 40 | --learning_rate ${learning_rate}\ 41 | --weight_decay 0.1\ 42 | --seed ${seed} \ 43 | --report_to none -------------------------------------------------------------------------------- /scripts/ft/roberta_base_squadv2.sh: -------------------------------------------------------------------------------- 1 | model_name="roberta-base" 2 | task_name="squad_v2" 3 | 4 | if [ "$#" -eq 0 ]; then 5 | epoch=10 6 | batch_size=32 7 | learning_rate=2e-5 8 | seed=128 9 | elif [ "$#" -eq 4 ]; then 10 | epoch=$1 11 | batch_size=$2 12 | learning_rate=$3 13 | seed=$4 14 | fi 15 | 16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/" 17 | 18 | echo $output_dir 19 | mkdir -p $output_dir 20 | 21 | 22 | python run_minus_squad_training.py \ 23 | --output_dir ${output_dir}\ 24 | --model_name_or_path ${model_name} \ 25 | --do_train \ 26 | --do_eval \ 27 | --save_strategy epoch \ 28 | --logging_strategy steps \ 29 | --evaluation_strategy steps \ 30 | --log_level info \ 31 | --log_level_replica info \ 32 | --logging_steps 100 \ 33 | --eval_steps 500 \ 34 | --max_seq_length 
384 \ 35 | --doc_stride 128 \ 36 | --version_2_with_negative \ 37 | --num_train_epochs ${epoch} \ 38 | --per_device_train_batch_size ${batch_size} \ 39 | --per_device_eval_batch_size ${batch_size} \ 40 | --warmup_ratio 0.06\ 41 | --learning_rate ${learning_rate}\ 42 | --weight_decay 0.1\ 43 | --seed ${seed} \ 44 | --report_to none -------------------------------------------------------------------------------- /scripts/lora/bert_base_squad.sh: -------------------------------------------------------------------------------- 1 | model_name='bert-base-uncased' 2 | task_name=squad 3 | adapter_type=lora 4 | learning_rate=2e-4 5 | num_epochs=30 6 | batch_size=32 7 | 8 | if [ "$#" -eq 0 ]; then 9 | lora_r=8 10 | lora_alpha=16 11 | elif [ "$#" -eq 2 ]; then 12 | lora_r=$1 13 | lora_alpha=$2 14 | fi 15 | 16 | teacher_param_tuning_config=q:0-11,v:0-11 17 | output_dir="output/${model_name}_${adapter_type}_${task_name}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}" 18 | echo $output_dir 19 | mkdir -p $output_dir 20 | 21 | python run_minus_squad_training.py \ 22 | --output_dir ${output_dir}\ 23 | --model_name_or_path ${model_name} \ 24 | --do_train \ 25 | --do_eval \ 26 | --save_strategy no \ 27 | --evaluation_strategy epoch \ 28 | --logging_strategy epoch \ 29 | --max_seq_length 384 \ 30 | --doc_stride 128 \ 31 | --num_train_epochs ${num_epochs} \ 32 | --per_device_train_batch_size ${batch_size} \ 33 | --per_device_eval_batch_size ${batch_size} \ 34 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 35 | --warmup_ratio 0.06\ 36 | --learning_rate ${learning_rate}\ 37 | --weight_decay 0.1\ 38 | --apply_lora \ 39 | --lora_alpha ${lora_alpha} \ 40 | --lora_r ${lora_r} \ 41 | --report_to none \ -------------------------------------------------------------------------------- /scripts/efficiency_testing.sh: -------------------------------------------------------------------------------- 1 | if [ "$#" -eq 0 ]; then 2 | id=default 3 | 
backbone_name='roberta-base'
4 | model_name='roberta-base'
5 | task_name=mnli
6 | lora_r=8
7 | lora_alpha=16
8 | batch_size=128
9 | elif [ "$#" -eq 3 ]; then
10 | id=$1
11 | backbone_name='roberta-base'
12 | model_name=$2
13 | task_name=$3
14 | lora_r=8
15 | lora_alpha=16
16 | batch_size=128
17 | elif [ "$#" -eq 7 ]; then
18 | id=$1
19 | backbone_name=$2
20 | model_name=$3
21 | task_name=$4
22 | lora_r=$5
23 | lora_alpha=$6
24 | batch_size=$7
25 | fi
26 |
27 | output_dir="output/efficiency_testing/${backbone_name}/${task_name}/${id}/bz${batch_size}/"
28 |
29 | echo $output_dir
30 | mkdir -p $output_dir
31 |
32 | python efficiency_test.py \
33 | --output_dir ${output_dir}\
34 | --task_name ${task_name} \
35 | --model_name_or_path ${model_name} \
36 | --do_eval \
37 | --save_strategy no \
38 | --evaluation_strategy steps \
39 | --logging_strategy steps \
40 | --logging_steps 100 \
41 | --eval_steps 500 \
42 | --max_seq_length 128 \
43 | --per_device_train_batch_size ${batch_size} \
44 | --per_device_eval_batch_size ${batch_size} \
45 | --apply_lora \
46 | --lora_alpha ${lora_alpha} \
47 | --lora_r ${lora_r} \
48 | --report_to none | tee ${output_dir}/log.txt
--------------------------------------------------------------------------------
/scripts/hyperparameter_searching/test_once_prune_step.sh:
--------------------------------------------------------------------------------
1 | # Sweep MAC constraints and one-shot pruning start points for roberta-base
2 | # with LoRA on MNLI (run_minus_training.py, --pruning_scheduler once).
3 | export CUDA_VISIBLE_DEVICES=2
4 |
5 | # Defined once so the directory name and the --pruning_batches flag agree.
6 | # The path used to read an undefined ${pruning_batches} and came out as
7 | # ".../batchuse/..." while the flag was hard-coded to 64. (The former
8 | # "steppoint=0.5" assignment was dropped: the inner loop overwrote it
9 | # before any use.)
10 | pruning_batches=64
11 |
12 | for mac_constraint in 0.4 0.5 0.6 0.7
13 | do
14 |     for steppoint in 0.25 0.5 0.75 1.0
15 |     do
16 |         output_dir="output/roberta_lora_minus_mnli_once_const_warmup_scheduler/step${steppoint}/batchuse${pruning_batches}/mac${mac_constraint}/"
17 |         mkdir -p $output_dir
18 |
19 |         python run_minus_training.py \
20 |             --output_dir ${output_dir}\
21 |             --task_name mnli \
22 |             --model_name_or_path roberta-base \
23 |             --do_train \
24 |             --do_eval \
25 |             --minus_scheduler \
26 |             --save_strategy no \
27 |             --evaluation_strategy steps \
28 |             --max_seq_length 128 \
29 |             --per_device_train_batch_size 32 \
30 |             --per_device_eval_batch_size 32 \
31 |             --lr_scheduler_type linear\
32 |             --warmup_ratio 0.06\
33 |             --learning_rate 5e-4\
34 |             --weight_decay 0.1\
35 |             --apply_lora \
36 |             --lora_alpha 16 \
37 |             --lora_r 8 \
38 |             --report_to none \
39 |             --pruning_batches ${pruning_batches} \
40 |             --mac_constraint ${mac_constraint} \
41 |             --pruning_scheduler once \
42 |             --pruning_start ${steppoint}
43 |     done
44 | done
--------------------------------------------------------------------------------
/scripts/lora/t5_base_lm_adapt_sst2.sh:
--------------------------------------------------------------------------------
1 | model_name=google/t5-base-lm-adapt
2 | task_name=sst2
3 | adapter_type=lora
4 |
5 | if [ "$#" -eq 0 ]; then
6 | num_epochs=60
7 | batch_size=32
8 | lora_r=8
9 | lora_alpha=16
10 | learning_rate=1e-3
11 | seed=42
12 | elif [ "$#" -eq 6 ]; then
13 | num_epochs=$1
14 | batch_size=$2
15 | lora_r=$3
16 | lora_alpha=$4
17 | learning_rate=$5
18 | seed=$6
19 | fi
20 |
21 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
22 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
23 | echo $output_dir
24 | mkdir -p $output_dir
25 |
26 | python run_minus_training.py \
27 | --output_dir ${output_dir}\
28 | --task_name ${task_name} \
29 | --model_name_or_path ${model_name} \
30 | --do_train \
31 | --do_eval \
32 | --save_strategy epoch \
33 | --evaluation_strategy epoch \
34 | --logging_strategy epoch \
35 | --max_seq_length 128 \
36 | --num_train_epochs ${num_epochs} \
37 | --per_device_train_batch_size ${batch_size} \
38 | --per_device_eval_batch_size ${batch_size} \
39 | --warmup_ratio 0.06\
40 | --learning_rate ${learning_rate}\
41 | --weight_decay 0.1\
42 | --apply_lora \
43 | --lora_alpha ${lora_alpha} \
44 | --lora_r ${lora_r} \
45 | --teacher_param_tuning_config ${teacher_param_tuning_config} \
46 |
# LoRA fine-tuning of google/t5-base-lm-adapt on MNLI via run_minus_training.py.
#
# Usage:
#   t5_base_lm_adapt_mnli.sh
#   t5_base_lm_adapt_mnli.sh NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED
model_name=google/t5-base-lm-adapt
task_name=mnli
adapter_type=lora

if [ "$#" -eq 0 ]; then
    num_epochs=60
    batch_size=32
    lora_r=8
    lora_alpha=16
    learning_rate=1e-3
    seed=42
elif [ "$#" -eq 6 ]; then
    num_epochs=$1
    batch_size=$2
    lora_r=$3
    lora_alpha=$4
    learning_rate=$5
    seed=$6
else
    # Any other argument count used to fall through with every hyperparameter
    # unset, producing a malformed output path and command line.
    echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
    exit 1
fi

teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
echo "$output_dir"
mkdir -p "$output_dir"

# --seed is forwarded so the run actually uses the seed encoded in output_dir.
python run_minus_training.py \
    --output_dir "${output_dir}" \
    --task_name "${task_name}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy no \
    --evaluation_strategy epoch \
    --logging_strategy epoch \
    --tf32 True \
    --max_seq_length 128 \
    --num_train_epochs "${num_epochs}" \
    --per_device_train_batch_size "${batch_size}" \
    --per_device_eval_batch_size "${batch_size}" \
    --warmup_ratio 0.06 \
    --learning_rate "${learning_rate}" \
    --weight_decay 0.1 \
    --seed "${seed}" \
    --apply_lora \
    --lora_alpha "${lora_alpha}" \
    --lora_r "${lora_r}" \
    --teacher_param_tuning_config "${teacher_param_tuning_config}" \
    --report_to none
"""Inspect a saved salience checkpoint.

Computes uncertainty-weighted salience scores for the intermediate (FFN)
mask and for the per-layer tuning masks, then prints summary statistics.

Usage:
    python test_salience.py [path/to/first_salience.pt]
"""
import sys

import torch

# Checkpoint inspected when no path is given on the command line.
SALIENCE_PATH = 'output/bert-base-uncased_lora_minus_rte_cubic_gradual_running_fisher_alloc_running_fisher_momentum_mapping_static_teacher_dynamic_cofi_student_distill_tophalf_limited_resizing_nonormalize_correctweight_clippedmoving_correctuncertain_bothsquare_freeteacher/mac0.4/epoch120/bz32/numprune5/paramq:0-11,v:0-11,i:0-11/lora_r8/pruning_start-1/distill_epoch96/first_salience.pt'
NUM_LAYERS = 12


def weighted_score(entry):
    """Return ``entry['s'] * entry['u']`` for one mask entry."""
    return entry['s'] * entry['u']


def analyze(salience, num_layers=NUM_LAYERS):
    """Compute the scores derived from a loaded salience dictionary.

    Args:
        salience: Mapping with the keys ``mask_salience``,
            ``mask_uncertainty`` and ``grafting_mask_salience`` as read below.
        num_layers: Number of entries of
            ``salience['grafting_mask_salience']['modules']`` to visit.

    Returns:
        A dict holding the vanilla, neuron-tuning and combined scores, the
        ascending ``(values, indices)`` sort of each, the mean neuron-tuning
        score, and the per-layer mean of the value bottleneck-mask score.
    """
    modules = salience['grafting_mask_salience']['modules']

    vanilla_score = (
        salience['mask_salience']['intermediate_mask']
        * salience['mask_uncertainty']['intermediate_mask']
    )
    neuron_tuning_score = torch.cat([
        weighted_score(modules[i]['intermediate']['output_mask'])
        for i in range(num_layers)
    ])
    # Element-wise product: both scores must have the same number of entries.
    combined_score = vanilla_score * neuron_tuning_score

    return {
        'vanilla_score': vanilla_score,
        'neuron_tuning_score': neuron_tuning_score,
        'combined_score': combined_score,
        'sorted_vanilla': vanilla_score.sort(descending=False),
        'sorted_neuron_tuning': neuron_tuning_score.sort(descending=False),
        'sorted_combined': combined_score.sort(descending=False),
        'neuron_tuning_mean': neuron_tuning_score.mean(),
        'bottleneck_means': [
            weighted_score(modules[i]['value']['bottleneck_mask']).mean()
            for i in range(num_layers)
        ],
    }


def main(path=SALIENCE_PATH):
    """Load the checkpoint at *path* and print the summary statistics."""
    salience = torch.load(path, map_location='cpu')
    result = analyze(salience)
    # This mean was previously computed and silently discarded.
    print('neuron tuning score mean:', result['neuron_tuning_mean'])
    for layer_mean in result['bottleneck_means']:
        print(layer_mean)
    return result


if __name__ == '__main__':
    main(sys.argv[1] if len(sys.argv) > 1 else SALIENCE_PATH)
output_dir="llama_output/${model_path}/${task_name}/mt_pruned/constraint_${mac_constraint}/batches_${num_batches}" 26 | fi 27 | 28 | echo $output_dir 29 | mkdir -p $output_dir 30 | 31 | python post_training_sft_prune.py \ 32 | --output_dir ${output_dir}\ 33 | --model_name_or_path ${model_path} \ 34 | --task_name alpaca_gpt4 \ 35 | --data_path 'data/sft/alpaca_data_gpt4.json' \ 36 | --do_train \ 37 | --do_eval \ 38 | --model_max_length 512 \ 39 | --per_device_train_batch_size 1 \ 40 | --per_device_eval_batch_size 1 \ 41 | --pruning_batch_size 1 \ 42 | --pruning_batches ${num_batches} \ 43 | --mac_constraint ${mac_constraint} \ 44 | --lora_alpha ${lora_alpha} \ -------------------------------------------------------------------------------- /scripts/lora/bert_base_mnli.sh: -------------------------------------------------------------------------------- 1 | model_name='bert-base-uncased' 2 | task_name=mnli 3 | adapter_type=lora 4 | 5 | if [ "$#" -eq 0 ]; then 6 | num_epochs=60 7 | batch_size=32 8 | lora_r=8 9 | lora_alpha=16 10 | learning_rate=2e-4 11 | seed=128 12 | elif [ "$#" -eq 6 ]; then 13 | num_epochs=$1 14 | batch_size=$2 15 | lora_r=$3 16 | lora_alpha=$4 17 | learning_rate=$5 18 | seed=$6 19 | fi 20 | 21 | teacher_param_tuning_config=q:0-11,v:0-11 22 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}" 23 | echo $output_dir 24 | mkdir -p $output_dir 25 | 26 | python run_minus_training.py \ 27 | --output_dir ${output_dir}\ 28 | --task_name ${task_name} \ 29 | --model_name_or_path ${model_name} \ 30 | --do_train \ 31 | --do_eval \ 32 | --save_strategy epoch \ 33 | --evaluation_strategy steps \ 34 | --logging_strategy steps \ 35 | --logging_steps 1000 \ 36 | --log_level info \ 37 | --log_level_replica info \ 38 | --eval_steps 5000 \ 39 | --max_seq_length 128 \ 40 | --num_train_epochs ${num_epochs} \ 41 | --per_device_train_batch_size 
# LoRA fine-tuning of bert-base-uncased on SST-2 via run_minus_training.py.
#
# Usage:
#   bert_base_sst2.sh
#   bert_base_sst2.sh NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED
model_name='bert-base-uncased'
task_name=sst2
adapter_type=lora

if [ "$#" -eq 0 ]; then
    num_epochs=60
    batch_size=32
    lora_r=8
    lora_alpha=16
    learning_rate=2e-4
    seed=42
elif [ "$#" -eq 6 ]; then
    num_epochs=$1
    batch_size=$2
    lora_r=$3
    lora_alpha=$4
    learning_rate=$5
    seed=$6
else
    # Any other argument count used to fall through with every hyperparameter
    # unset, producing a malformed output path and command line.
    echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
    exit 1
fi

teacher_param_tuning_config=q:0-11,v:0-11
output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
echo "$output_dir"
mkdir -p "$output_dir"

# --seed is forwarded so the run actually uses the seed encoded in output_dir.
python run_minus_training.py \
    --output_dir "${output_dir}" \
    --task_name "${task_name}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy epoch \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 100 \
    --log_level info \
    --log_level_replica info \
    --eval_steps 500 \
    --max_seq_length 128 \
    --num_train_epochs "${num_epochs}" \
    --per_device_train_batch_size "${batch_size}" \
    --per_device_eval_batch_size "${batch_size}" \
    --warmup_ratio 0.06 \
    --learning_rate "${learning_rate}" \
    --weight_decay 0.1 \
    --seed "${seed}" \
    --apply_lora \
    --lora_alpha "${lora_alpha}" \
    --lora_r "${lora_r}" \
    --teacher_param_tuning_config "${teacher_param_tuning_config}" \
    --report_to none
-------------------------------------------------------------------------------- /scripts/lora/t5_xl_lm_adapt_cnndm.sh: -------------------------------------------------------------------------------- 1 | model_name=google/t5-xl-lm-adapt 2 | task_name=cnndm 3 | adapter_type=lora 4 | 5 | if [ "$#" -eq 0 ]; then 6 | num_epochs=10 7 | batch_size=4 8 | lora_r=8 9 | lora_alpha=16 10 | learning_rate=1e-3 11 | seed=42 12 | elif [ "$#" -eq 6 ]; then 13 | num_epochs=$1 14 | batch_size=$2 15 | lora_r=$3 16 | lora_alpha=$4 17 | learning_rate=$5 18 | seed=$6 19 | fi 20 | 21 | teacher_param_tuning_config=eq:0-23,ev:0-23,dq:0-23,dv:0-23,cq:0-23,cv:0-23 22 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}" 23 | echo $output_dir 24 | mkdir -p $output_dir 25 | 26 | python run_minus_seq2seq_training.py \ 27 | --output_dir ${output_dir}\ 28 | --task_name ${task_name} \ 29 | --model_name_or_path ${model_name} \ 30 | --do_train \ 31 | --do_eval \ 32 | --save_strategy no \ 33 | --evaluation_strategy epoch \ 34 | --logging_strategy epoch \ 35 | --max_input_length 512 \ 36 | --max_target_length 128 \ 37 | --num_train_epochs ${num_epochs} \ 38 | --per_device_train_batch_size ${batch_size} \ 39 | --per_device_eval_batch_size ${batch_size} \ 40 | --tf32 True \ 41 | --bf16 True \ 42 | --warmup_ratio 0.06\ 43 | --learning_rate ${learning_rate}\ 44 | --weight_decay 0.1\ 45 | --apply_lora \ 46 | --lora_alpha ${lora_alpha} \ 47 | --lora_r ${lora_r} \ 48 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 49 | --report_to none \ -------------------------------------------------------------------------------- /scripts/lora/roberta_base_cola.sh: -------------------------------------------------------------------------------- 1 | model_name='roberta-base' 2 | task_name=cola 3 | adapter_type=lora 4 | 5 | if [ "$#" -eq 0 ]; then 6 | num_epochs=80 7 | batch_size=32 8 | 
lora_r=8 9 | lora_alpha=16 10 | learning_rate=2e-4 11 | seed=128 12 | elif [ "$#" -eq 6 ]; then 13 | num_epochs=$1 14 | batch_size=$2 15 | lora_r=$3 16 | lora_alpha=$4 17 | learning_rate=$5 18 | seed=$6 19 | fi 20 | 21 | teacher_param_tuning_config=q:0-11,v:0-11 22 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}" 23 | echo $output_dir 24 | mkdir -p $output_dir 25 | 26 | python run_minus_training.py \ 27 | --output_dir ${output_dir}\ 28 | --task_name ${task_name} \ 29 | --model_name_or_path ${model_name} \ 30 | --do_train \ 31 | --do_eval \ 32 | --save_strategy no \ 33 | --evaluation_strategy steps \ 34 | --logging_strategy steps \ 35 | --logging_steps 100 \ 36 | --log_level info \ 37 | --log_level_replica info \ 38 | --eval_steps 500 \ 39 | --max_seq_length 128 \ 40 | --num_train_epochs ${num_epochs} \ 41 | --per_device_train_batch_size ${batch_size} \ 42 | --per_device_eval_batch_size ${batch_size} \ 43 | --warmup_ratio 0.06\ 44 | --learning_rate ${learning_rate}\ 45 | --weight_decay 0.1\ 46 | --seed ${seed} \ 47 | --apply_lora \ 48 | --lora_alpha ${lora_alpha} \ 49 | --lora_r ${lora_r} \ 50 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 51 | --report_to none \ -------------------------------------------------------------------------------- /scripts/lora/roberta_base_mrpc.sh: -------------------------------------------------------------------------------- 1 | model_name='roberta-base' 2 | task_name=mrpc 3 | adapter_type=lora 4 | 5 | if [ "$#" -eq 0 ]; then 6 | num_epochs=80 7 | batch_size=32 8 | lora_r=8 9 | lora_alpha=16 10 | learning_rate=2e-4 11 | seed=128 12 | elif [ "$#" -eq 6 ]; then 13 | num_epochs=$1 14 | batch_size=$2 15 | lora_r=$3 16 | lora_alpha=$4 17 | learning_rate=$5 18 | seed=$6 19 | fi 20 | 21 | teacher_param_tuning_config=q:0-11,v:0-11 22 | 
output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}" 23 | echo $output_dir 24 | mkdir -p $output_dir 25 | 26 | python run_minus_training.py \ 27 | --output_dir ${output_dir}\ 28 | --task_name ${task_name} \ 29 | --model_name_or_path ${model_name} \ 30 | --do_train \ 31 | --do_eval \ 32 | --save_strategy no \ 33 | --evaluation_strategy steps \ 34 | --logging_strategy steps \ 35 | --logging_steps 100 \ 36 | --log_level info \ 37 | --log_level_replica info \ 38 | --eval_steps 500 \ 39 | --max_seq_length 128 \ 40 | --num_train_epochs ${num_epochs} \ 41 | --per_device_train_batch_size ${batch_size} \ 42 | --per_device_eval_batch_size ${batch_size} \ 43 | --warmup_ratio 0.06\ 44 | --learning_rate ${learning_rate}\ 45 | --weight_decay 0.1\ 46 | --seed ${seed} \ 47 | --apply_lora \ 48 | --lora_alpha ${lora_alpha} \ 49 | --lora_r ${lora_r} \ 50 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 51 | --report_to none \ -------------------------------------------------------------------------------- /scripts/lora/roberta_base_rte.sh: -------------------------------------------------------------------------------- 1 | model_name='roberta-base' 2 | task_name=rte 3 | adapter_type=lora 4 | 5 | if [ "$#" -eq 0 ]; then 6 | num_epochs=80 7 | batch_size=32 8 | lora_r=8 9 | lora_alpha=16 10 | learning_rate=2e-4 11 | seed=128 12 | elif [ "$#" -eq 6 ]; then 13 | num_epochs=$1 14 | batch_size=$2 15 | lora_r=$3 16 | lora_alpha=$4 17 | learning_rate=$5 18 | seed=$6 19 | fi 20 | 21 | teacher_param_tuning_config=q:0-11,v:0-11 22 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}" 23 | echo $output_dir 24 | mkdir -p $output_dir 25 | 26 | python 
run_minus_training.py \ 27 | --output_dir ${output_dir}\ 28 | --task_name ${task_name} \ 29 | --model_name_or_path ${model_name} \ 30 | --do_train \ 31 | --do_eval \ 32 | --save_strategy no \ 33 | --evaluation_strategy steps \ 34 | --logging_strategy steps \ 35 | --logging_steps 100 \ 36 | --log_level info \ 37 | --log_level_replica info \ 38 | --eval_steps 500 \ 39 | --max_seq_length 128 \ 40 | --num_train_epochs ${num_epochs} \ 41 | --per_device_train_batch_size ${batch_size} \ 42 | --per_device_eval_batch_size ${batch_size} \ 43 | --warmup_ratio 0.06\ 44 | --learning_rate ${learning_rate}\ 45 | --weight_decay 0.1\ 46 | --seed ${seed} \ 47 | --apply_lora \ 48 | --lora_alpha ${lora_alpha} \ 49 | --lora_r ${lora_r} \ 50 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 51 | --report_to none \ -------------------------------------------------------------------------------- /scripts/lora/roberta_base_stsb.sh: -------------------------------------------------------------------------------- 1 | model_name='roberta-base' 2 | task_name=stsb 3 | adapter_type=lora 4 | 5 | if [ "$#" -eq 0 ]; then 6 | num_epochs=80 7 | batch_size=32 8 | lora_r=8 9 | lora_alpha=16 10 | learning_rate=2e-4 11 | seed=128 12 | elif [ "$#" -eq 6 ]; then 13 | num_epochs=$1 14 | batch_size=$2 15 | lora_r=$3 16 | lora_alpha=$4 17 | learning_rate=$5 18 | seed=$6 19 | fi 20 | 21 | teacher_param_tuning_config=q:0-11,v:0-11 22 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}" 23 | echo $output_dir 24 | mkdir -p $output_dir 25 | 26 | python run_minus_training.py \ 27 | --output_dir ${output_dir}\ 28 | --task_name ${task_name} \ 29 | --model_name_or_path ${model_name} \ 30 | --do_train \ 31 | --do_eval \ 32 | --save_strategy no \ 33 | --evaluation_strategy steps \ 34 | --logging_strategy steps \ 35 | --logging_steps 100 \ 36 | 
--log_level info \ 37 | --log_level_replica info \ 38 | --eval_steps 500 \ 39 | --max_seq_length 128 \ 40 | --num_train_epochs ${num_epochs} \ 41 | --per_device_train_batch_size ${batch_size} \ 42 | --per_device_eval_batch_size ${batch_size} \ 43 | --warmup_ratio 0.06\ 44 | --learning_rate ${learning_rate}\ 45 | --weight_decay 0.1\ 46 | --seed ${seed} \ 47 | --apply_lora \ 48 | --lora_alpha ${lora_alpha} \ 49 | --lora_r ${lora_r} \ 50 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 51 | --report_to none \ -------------------------------------------------------------------------------- /scripts/lora/roberta_base_mnli.sh: -------------------------------------------------------------------------------- 1 | model_name='roberta-base' 2 | task_name=mnli 3 | adapter_type=lora 4 | 5 | if [ "$#" -eq 0 ]; then 6 | num_epochs=60 7 | batch_size=32 8 | lora_r=8 9 | lora_alpha=16 10 | learning_rate=2e-4 11 | seed=128 12 | elif [ "$#" -eq 6 ]; then 13 | num_epochs=$1 14 | batch_size=$2 15 | lora_r=$3 16 | lora_alpha=$4 17 | learning_rate=$5 18 | seed=$6 19 | fi 20 | 21 | teacher_param_tuning_config=q:0-11,v:0-11 22 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}" 23 | echo $output_dir 24 | mkdir -p $output_dir 25 | 26 | python run_minus_training.py \ 27 | --output_dir ${output_dir}\ 28 | --task_name ${task_name} \ 29 | --model_name_or_path ${model_name} \ 30 | --do_train \ 31 | --do_eval \ 32 | --save_strategy epoch \ 33 | --evaluation_strategy steps \ 34 | --logging_strategy steps \ 35 | --logging_steps 1000 \ 36 | --log_level info \ 37 | --log_level_replica info \ 38 | --eval_steps 5000 \ 39 | --max_seq_length 128 \ 40 | --num_train_epochs ${num_epochs} \ 41 | --per_device_train_batch_size ${batch_size} \ 42 | --per_device_eval_batch_size ${batch_size} \ 43 | --warmup_ratio 0.06\ 44 | 
# LoRA fine-tuning of roberta-base on SQuAD via run_minus_squad_training.py.
#
# Usage:
#   roberta_base_squad.sh
#   roberta_base_squad.sh NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED
model_name='roberta-base'
# task_name only names the output directory; it is not passed to the trainer.
task_name=squad
adapter_type=lora

if [ "$#" -eq 0 ]; then
    num_epochs=60
    batch_size=32
    lora_r=8
    lora_alpha=16
    learning_rate=2e-4
    seed=128
elif [ "$#" -eq 6 ]; then
    num_epochs=$1
    batch_size=$2
    lora_r=$3
    lora_alpha=$4
    learning_rate=$5
    seed=$6
else
    # Any other argument count used to fall through with every hyperparameter
    # unset, producing a malformed output path and command line.
    echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
    exit 1
fi

teacher_param_tuning_config=q:0-11,v:0-11
output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
echo "$output_dir"
mkdir -p "$output_dir"

python run_minus_squad_training.py \
    --output_dir "${output_dir}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy epoch \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 100 \
    --log_level info \
    --log_level_replica info \
    --eval_steps 500 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --num_train_epochs "${num_epochs}" \
    --per_device_train_batch_size "${batch_size}" \
    --per_device_eval_batch_size "${batch_size}" \
    --teacher_param_tuning_config "${teacher_param_tuning_config}" \
    --warmup_ratio 0.06 \
    --learning_rate "${learning_rate}" \
    --weight_decay 0.1 \
    --seed "${seed}" \
    --apply_lora \
    --lora_alpha "${lora_alpha}" \
    --lora_r "${lora_r}" \
    --report_to none
# LoRA fine-tuning of roberta-base on SST-2 via run_minus_training.py.
#
# Usage:
#   roberta_base_sst2.sh
#   roberta_base_sst2.sh NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED
model_name='roberta-base'
task_name=sst2
adapter_type=lora

if [ "$#" -eq 0 ]; then
    num_epochs=60
    batch_size=32
    lora_r=8
    lora_alpha=16
    learning_rate=2e-4
    seed=128
elif [ "$#" -eq 6 ]; then
    num_epochs=$1
    batch_size=$2
    lora_r=$3
    lora_alpha=$4
    learning_rate=$5
    seed=$6
else
    # Any other argument count used to fall through with every hyperparameter
    # unset, producing a malformed output path and command line.
    echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
    exit 1
fi

teacher_param_tuning_config=q:0-11,v:0-11
output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
echo "$output_dir"
mkdir -p "$output_dir"

python run_minus_training.py \
    --output_dir "${output_dir}" \
    --task_name "${task_name}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy epoch \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 100 \
    --log_level info \
    --log_level_replica info \
    --eval_steps 500 \
    --max_seq_length 128 \
    --num_train_epochs "${num_epochs}" \
    --per_device_train_batch_size "${batch_size}" \
    --per_device_eval_batch_size "${batch_size}" \
    --warmup_ratio 0.06 \
    --learning_rate "${learning_rate}" \
    --weight_decay 0.1 \
    --seed "${seed}" \
    --apply_lora \
    --lora_alpha "${lora_alpha}" \
    --lora_r "${lora_r}" \
    --teacher_param_tuning_config "${teacher_param_tuning_config}" \
    --report_to none
| num_epochs=15 7 | batch_size=16 8 | lora_r=102 9 | lora_alpha=408 10 | learning_rate=1e-4 11 | seed=42 12 | elif [ "$#" -eq 6 ]; then 13 | num_epochs=$1 14 | batch_size=$2 15 | lora_r=$3 16 | lora_alpha=$4 17 | learning_rate=$5 18 | seed=$6 19 | fi 20 | 21 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11 22 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}" 23 | echo $output_dir 24 | mkdir -p $output_dir 25 | 26 | python run_minus_seq2seq_training.py \ 27 | --output_dir ${output_dir}\ 28 | --task_name ${task_name} \ 29 | --model_name_or_path ${model_name} \ 30 | --do_train \ 31 | --do_eval \ 32 | --save_strategy epoch \ 33 | --evaluation_strategy steps \ 34 | --logging_strategy steps \ 35 | --logging_steps 500 \ 36 | --eval_steps 2000 \ 37 | --max_input_length 512 \ 38 | --max_target_length 128 \ 39 | --num_train_epochs ${num_epochs} \ 40 | --per_device_train_batch_size ${batch_size} \ 41 | --per_device_eval_batch_size ${batch_size} \ 42 | --tf32 True \ 43 | --fp16 True \ 44 | --warmup_ratio 0.06\ 45 | --learning_rate ${learning_rate}\ 46 | --weight_decay 0.01 \ 47 | --apply_lora \ 48 | --lora_alpha ${lora_alpha} \ 49 | --lora_r ${lora_r} \ 50 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 51 | --report_to none | tee ${output_dir}/log.txt \ -------------------------------------------------------------------------------- /scripts/lora/t5_base_lm_adapt_cnndm.sh: -------------------------------------------------------------------------------- 1 | model_name=google/t5-base-lm-adapt 2 | task_name=cnndm 3 | adapter_type=lora 4 | 5 | if [ "$#" -eq 0 ]; then 6 | num_epochs=6 7 | batch_size=16 8 | lora_r=102 9 | lora_alpha=408 10 | learning_rate=5e-5 11 | seed=42 12 | elif [ "$#" -eq 6 ]; then 13 | num_epochs=$1 14 | batch_size=$2 15 | lora_r=$3 16 | lora_alpha=$4 17 | learning_rate=$5 18 | seed=$6 19 | fi 
20 | 21 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11 22 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}" 23 | echo $output_dir 24 | mkdir -p $output_dir 25 | 26 | python run_minus_seq2seq_training.py \ 27 | --output_dir ${output_dir}\ 28 | --task_name ${task_name} \ 29 | --model_name_or_path ${model_name} \ 30 | --do_train \ 31 | --do_eval \ 32 | --save_strategy epoch \ 33 | --evaluation_strategy steps \ 34 | --logging_strategy steps \ 35 | --logging_steps 500 \ 36 | --eval_steps 2000 \ 37 | --max_input_length 512 \ 38 | --max_target_length 128 \ 39 | --num_train_epochs ${num_epochs} \ 40 | --per_device_train_batch_size ${batch_size} \ 41 | --per_device_eval_batch_size ${batch_size} \ 42 | --tf32 True \ 43 | --fp16 True \ 44 | --warmup_ratio 0.06\ 45 | --learning_rate ${learning_rate}\ 46 | --weight_decay 0.01\ 47 | --apply_lora \ 48 | --lora_alpha ${lora_alpha} \ 49 | --lora_r ${lora_r} \ 50 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 51 | --report_to none | tee ${output_dir}/log.txt \ -------------------------------------------------------------------------------- /scripts/lora/roberta_base_squadv2.sh: -------------------------------------------------------------------------------- 1 | model_name='roberta-base' 2 | task_name=squad_v2 3 | adapter_type=lora 4 | 5 | if [ "$#" -eq 0 ]; then 6 | num_epochs=60 7 | batch_size=32 8 | lora_r=8 9 | lora_alpha=16 10 | learning_rate=2e-4 11 | seed=128 12 | elif [ "$#" -eq 6 ]; then 13 | num_epochs=$1 14 | batch_size=$2 15 | lora_r=$3 16 | lora_alpha=$4 17 | learning_rate=$5 18 | seed=$6 19 | fi 20 | 21 | teacher_param_tuning_config=q:0-11,v:0-11 22 | 
"""Plot a linear learning-rate schedule that re-warms up at fixed reset steps.

Each reset step starts a linear warmup from 0 to 1 lasting
``num_warmup_steps``; the multiplier then decays linearly back to 0 until the
next reset (or the end of training). Running the module writes the curve to
``lr_test.png``.
"""
from bisect import bisect_right

num_epochs = 20
epoch_size = 3068
reset_epochs = [2, 5, 8, 11, 14]
reset_steps = [epoch * epoch_size for epoch in reset_epochs]
num_training_steps = num_epochs * epoch_size
num_warmup_steps = 0.06 * num_training_steps


def build_steppoints(reset_steps, num_warmup_steps, num_training_steps):
    """Return the schedule breakpoints and the set of warmup start steps.

    Args:
        reset_steps: Ascending steps at which warmup restarts. Step 0 is
            prepended when missing (including when the list is empty).
        num_warmup_steps: Length of every warmup segment.
        num_training_steps: Final step, closing the last decay segment.

    Returns:
        ``(steppoints, warmup_starts)``: ``steppoints`` alternates warmup
        start / warmup end and ends with ``num_training_steps``;
        ``warmup_starts`` is the set of warmup start steps.

    Raises:
        ValueError: If the breakpoints are not in non-decreasing order, e.g.
            a warmup segment runs past the next reset.
    """
    starts = list(reset_steps)
    if not starts or starts[0] != 0:
        starts = [0] + starts

    steppoints = []
    for start in starts:
        steppoints.append(start)
        steppoints.append(start + num_warmup_steps)
    steppoints.append(num_training_steps)

    if any(left > right for left, right in zip(steppoints, steppoints[1:])):
        raise ValueError('warmup segments overlap the following reset step')
    return steppoints, set(starts)


def find_range(n, steppoints, warmup_starts):
    """Locate the segment ``[start, end)`` containing step *n* by binary search.

    Returns:
        ``(start, end, is_warmup)``.

    Raises:
        ValueError: If *n* lies outside ``[steppoints[0], steppoints[-1])``.
    """
    idx = bisect_right(steppoints, n) - 1
    if idx < 0 or idx + 1 >= len(steppoints):
        raise ValueError(f'step {n} is outside the schedule')
    start, end = steppoints[idx], steppoints[idx + 1]
    return start, end, start in warmup_starts


def make_lr_lambda(steppoints, warmup_starts):
    """Build the step -> learning-rate multiplier function for the schedule."""

    def lr_lambda(current_step: int):
        range_start, range_end, is_warmup = find_range(current_step, steppoints, warmup_starts)
        span = float(max(1, range_end - range_start))
        if is_warmup:
            return float(current_step - range_start) / span
        return max(0.0, float(range_end - current_step) / span)

    return lr_lambda


def main():
    """Evaluate the schedule at every training step and save the plot."""
    # Imported here so the schedule helpers stay importable without the
    # plotting stack installed.
    import seaborn as sns
    from matplotlib import pyplot as plt

    steppoints, warmup_starts = build_steppoints(reset_steps, num_warmup_steps, num_training_steps)
    lr_lambda = make_lr_lambda(steppoints, warmup_starts)

    steps = list(range(num_training_steps))
    lrs = [lr_lambda(step) for step in steps]
    sns.lineplot(x=steps, y=lrs)
    plt.savefig('lr_test.png')


if __name__ == '__main__':
    main()
"""Latency estimation from per-layer lookup tables (LUTs)."""
import numpy as np
import torch


@torch.no_grad()
def lookup_latency(lut, mask):
    """Return the latency entry matching the number of non-zero mask entries.

    Args:
        lut: Indexable table; ``lut[k - 1]`` is read when ``k`` entries of
            *mask* are non-zero.
        mask: Tensor whose non-zero entries are counted.

    Returns:
        ``0`` when every mask entry is zero, otherwise ``lut[n - 1]``.
    """
    n = int(torch.sum(mask != 0))
    if n == 0:
        return 0
    return lut[n - 1]


def estimate_latency(mha_lut, ffn_lut, head_mask, neuron_mask):
    """Sum the looked-up attention and FFN latencies over all layers.

    The number of layers is ``head_mask.shape[0]``; row ``i`` of each mask is
    looked up in its table.
    """
    num_hidden_layers = head_mask.shape[0]
    total = 0
    for i in range(num_hidden_layers):
        total += lookup_latency(mha_lut, head_mask[i])
        total += lookup_latency(ffn_lut, neuron_mask[i])
    return total


class PiecewiseLinearLatency:
    """Parameters of a constant-then-linear latency model.

    As fitted by ``fit_latency_fn``: the first ``threshold`` table entries are
    approximated by ``c``, and the entry ``i`` positions past the threshold by
    ``slope * i + c``.
    """

    def __init__(self, threshold=None, c=None, slope=None):
        self.threshold = threshold
        self.c = c
        self.slope = slope


def fit_latency_fn(lut):
    """Fit a ``PiecewiseLinearLatency`` to *lut* by least squares.

    Every threshold from 1 to ``len(lut)`` is tried: ``c`` is the mean of the
    first ``threshold`` entries, and ``slope`` is the least-squares slope
    through the remaining entries (offset by ``c``), clamped to be
    non-negative. The first threshold with the smallest squared error wins.

    Args:
        lut: Sequence of latency values.

    Returns:
        The fitted model. For an empty *lut* no threshold is tried and all
        fields stay ``None``.
    """
    lut = np.asarray(lut)
    latency_fn = PiecewiseLinearLatency()

    # Start from infinity: a finite sentinel left the model unset whenever
    # every candidate's squared error exceeded it.
    min_error = float('inf')
    for threshold in range(1, len(lut) + 1):
        c = lut[:threshold].sum() / threshold
        y = lut[threshold:] - c
        x = np.arange(1, len(y) + 1)

        if threshold == len(lut):
            slope = 0
        else:
            slope = (x * y).sum() / (x * x).sum()
            slope = 0 if slope < 0 else slope

        # Flat part followed by the linear part, as one array.
        approximated = np.concatenate([np.full(threshold, c), slope * x + c])

        residual = lut - approximated
        squared_error = (residual * residual).sum()
        if squared_error < min_error:
            min_error = squared_error
            latency_fn.threshold = threshold
            latency_fn.c = c
            latency_fn.slope = slope

    return latency_fn
# LoRA fine-tuning launcher for huggyllama/llama-7b on data/sft/alpaca_data_cleaned.json.
#
# Usage:
#   llama_7b_alpaca_cleaned.sh
#   llama_7b_alpaca_cleaned.sh NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED
model_name='huggyllama/llama-7b'
# NOTE(review): the task label is alpaca_gpt4 although the data file below is the
# "cleaned" Alpaca set -- confirm this is the label run_llama_sft.py expects.
task_name=alpaca_gpt4
adapter_type=lora

if [ "$#" -eq 0 ]; then
    num_epochs=2
    batch_size=4
    lora_r=8
    lora_alpha=16
    learning_rate=1e-4
    seed=42
elif [ "$#" -eq 6 ]; then
    num_epochs=$1
    batch_size=$2
    lora_r=$3
    lora_alpha=$4
    learning_rate=$5
    seed=$6
else
    # Any other argument count used to fall through with every variable unset.
    echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
    exit 1
fi

teacher_param_tuning_config=dq:0-31,dv:0-31,di0:0-31
output_dir="llama_output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/teacher_${teacher_param_tuning_config}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
echo "$output_dir"
mkdir -p "$output_dir"

# --seed is forwarded so that the seed encoded in output_dir is the seed the run
# actually uses; previously it only affected the directory name.
python run_llama_sft.py \
    --output_dir "${output_dir}" \
    --task_name ${task_name} \
    --model_name_or_path ${model_name} \
    --bf16 True \
    --data_path 'data/sft/alpaca_data_cleaned.json' \
    --do_train \
    --do_eval \
    --save_strategy steps \
    --save_steps 2000 \
    --save_total_limit 1 \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 100 \
    --eval_steps 500 \
    --model_max_length 512 \
    --num_train_epochs ${num_epochs} \
    --per_device_train_batch_size ${batch_size} \
    --per_device_eval_batch_size ${batch_size} \
    --gradient_accumulation_steps 8 \
    --warmup_ratio 0.03 \
    --learning_rate ${learning_rate} \
    --weight_decay 0. \
    --lr_scheduler_type cosine \
    --tf32 True \
    --seed ${seed} \
    --apply_lora \
    --lora_alpha ${lora_alpha} \
    --lora_r ${lora_r} \
    --teacher_param_tuning_config ${teacher_param_tuning_config} \
    --report_to none | tee "${output_dir}/log.txt"
| --tf32 True \ 51 | --apply_lora \ 52 | --lora_alpha ${lora_alpha} \ 53 | --lora_r ${lora_r} \ 54 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 55 | --report_to none | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /scripts/efficiency_testing_llama.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p ckpt 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gpus=a100:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | if [ "$#" -eq 0 ]; then 12 | id=default 13 | backbone_name='roberta-base' 14 | model_name='roberta-base' 15 | lora_r=8 16 | lora_alpha=16 17 | batch_size=4 18 | elif [ "$#" -eq 2 ]; then 19 | id=$1 20 | backbone_name='roberta-base' 21 | model_name=$2 22 | lora_r=8 23 | lora_alpha=16 24 | batch_size=4 25 | elif [ "$#" -eq 6 ]; then 26 | id=$1 27 | backbone_name=$2 28 | model_name=$3 29 | lora_r=$4 30 | lora_alpha=$5 31 | batch_size=$6 32 | fi 33 | 34 | task_name=alpaca_gpt4 35 | output_dir="output/efficiency_testing/${backbone_name}/${task_name}/${id}/bz${batch_size}/" 36 | 37 | echo $output_dir 38 | mkdir -p $output_dir 39 | 40 | python efficiency_test_llama.py \ 41 | --output_dir ${output_dir}\ 42 | --task_name alpaca_gpt4 \ 43 | --model_name_or_path ${model_name} \ 44 | --bf16 True \ 45 | --tf32 True \ 46 | --data_path 'data/sft/alpaca_data_gpt4.json' \ 47 | --do_eval \ 48 | --save_strategy no \ 49 | --evaluation_strategy steps \ 50 | --logging_strategy steps \ 51 | --logging_steps 100 \ 52 | --eval_steps 500 \ 53 | --model_max_length 512 \ 54 | --per_device_train_batch_size ${batch_size} \ 55 | --per_device_eval_batch_size ${batch_size} \ 56 | --apply_lora \ 57 
# NOTE(review): throughout this module the ``masked_*`` fields presumably carry
# the results of a forward pass with pruning masks applied -- confirm against
# the model implementations that populate them.


@dataclass
class NewQuestionAnsweringModelOutput(QuestionAnsweringModelOutput):
    """``QuestionAnsweringModelOutput`` extended with ``masked_*`` fields."""

    masked_loss: Optional[torch.FloatTensor] = None
    masked_start_logits: Optional[torch.FloatTensor] = None
    masked_end_logits: Optional[torch.FloatTensor] = None
    masked_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class NewBaseModelOutputWithPooling(ModelOutput):
    """Base model output with a pooled output plus attention-layer and masked fields."""

    last_hidden_state: torch.FloatTensor = None
    pooler_output: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    attention_layers: Optional[Tuple[torch.FloatTensor]] = None
    masked_states: Optional[Tuple[torch.FloatTensor]] = None
    masked_pooler_output: Optional[torch.FloatTensor] = None


@dataclass
class NewBaseModelOutput(ModelOutput):
    """Base model output (no pooler) plus attention-layer and masked fields."""

    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    attention_layers: Optional[Tuple[torch.FloatTensor]] = None
    masked_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class NewSequenceClassifierOutput(SequenceClassifierOutput):
    """``SequenceClassifierOutput`` extended with ``masked_*`` fields."""

    masked_states: Optional[Tuple[torch.FloatTensor]] = None
    masked_logits: Optional[torch.FloatTensor] = None
    masked_loss: Optional[torch.FloatTensor] = None


# The decorator was missing here: without it ``masked_hidden_states`` is only a
# class-level attribute, not a dataclass field, so it cannot be passed to the
# constructor and is ignored by the field-based ``ModelOutput`` machinery.
@dataclass
class AdaPBaseModelOutputWithPastAndCrossAttentions(BaseModelOutputWithPastAndCrossAttentions):
    """``BaseModelOutputWithPastAndCrossAttentions`` plus ``masked_hidden_states``."""

    masked_hidden_states: Optional[torch.FloatTensor] = None
def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None:
    """Freeze every parameter whose name lacks ``'lora_'``, then handle biases.

    Args:
        model: Module whose parameters are updated in place.
        bias: ``'none'`` leaves all biases frozen, ``'all'`` re-enables every
            parameter with ``'bias'`` in its name, ``'lora_only'`` re-enables
            the ``bias`` attribute of ``LoRALayer`` modules only.

    Raises:
        NotImplementedError: For any other ``bias`` value (raised after the
            freezing step has already been applied).
    """
    for name, param in model.named_parameters():
        if 'lora_' not in name:
            param.requires_grad = False

    if bias == 'none':
        return

    if bias == 'all':
        for name, param in model.named_parameters():
            if 'bias' in name:
                param.requires_grad = True
        return

    if bias == 'lora_only':
        for module in model.modules():
            if not isinstance(module, LoRALayer):
                continue
            if getattr(module, 'bias', None) is not None:
                module.bias.requires_grad = True
        return

    raise NotImplementedError


def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]:
    """Return the subset of ``model.state_dict()`` that belongs to LoRA.

    Args:
        model: Module to read the state dict from.
        bias: ``'none'`` keeps only keys containing ``'lora_'``; ``'all'`` also
            keeps every key containing ``'bias'``; ``'lora_only'`` keeps the
            ``'lora_'`` keys plus, for each of them, the sibling ``...bias``
            entry when it exists in the state dict.

    Raises:
        NotImplementedError: For any other ``bias`` value.
    """
    state = model.state_dict()

    if bias == 'none':
        return {key: value for key, value in state.items() if 'lora_' in key}

    if bias == 'all':
        return {
            key: value
            for key, value in state.items()
            if 'lora_' in key or 'bias' in key
        }

    if bias == 'lora_only':
        selected = {}
        for key, value in state.items():
            if 'lora_' not in key:
                continue
            selected[key] = value
            # The bias shares the prefix that precedes the 'lora_' marker.
            bias_key = key.split('lora_')[0] + 'bias'
            if bias_key in state:
                selected[bias_key] = state[bias_key]
        return selected

    raise NotImplementedError
#!/bin/bash
#SBATCH -p gpu-rtx6k
#SBATCH -A h2lab
#SBATCH --nodes=1 # Number of nodes
#SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case)
#SBATCH --cpus-per-task=8 # Number of CPU cores per task
#SBATCH --mem=64G # Memory per node (total memory)
#SBATCH --gres=gpu:1 # Number of GPUs requested
#SBATCH --time=48:00:00 # Walltime (hh:mm:ss)

# Launches run_minus_seq2seq_training.py without --apply_lora.
#
# Usage:
#   train_ft_seq2seq.sh
#   train_ft_seq2seq.sh MODEL_NAME TASK_NAME NUM_EPOCHS LEARNING_RATE BATCH_SIZE
if [ "$#" -eq 0 ]; then
    # NOTE(review): the default model is roberta-base although this is the
    # seq2seq launcher -- confirm the intended default.
    model_name='roberta-base'
    task_name=sst2
    num_epochs=30
    learning_rate=2e-5
    batch_size=32
elif [ "$#" -eq 5 ]; then
    model_name=$1
    task_name=$2
    num_epochs=$3
    learning_rate=$4
    batch_size=$5
else
    # Any other argument count used to fall through with every variable unset.
    echo "Usage: $0 [model_name task_name num_epochs learning_rate batch_size]" >&2
    exit 1
fi

lora_alpha=16
lora_r=8
suffix=''
# adapter_type was referenced in output_dir but never set, which produced paths
# such as "output/<model>__<task>". An exported value is still honoured.
adapter_type="${adapter_type:-ft}"

if [ -d "$model_name" ]; then
    # model_name is a local checkpoint directory: store results beneath it.
    output_dir="${model_name}/finetuned/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
else
    output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
fi

echo "$output_dir"
mkdir -p "$output_dir"

python run_minus_seq2seq_training.py \
    --output_dir "${output_dir}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy no \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 1000 \
    --eval_steps 5000 \
    --task_name ${task_name} \
    --max_input_length 512 \
    --max_target_length 128 \
    --num_train_epochs ${num_epochs} \
    --per_device_train_batch_size ${batch_size} \
    --per_device_eval_batch_size ${batch_size} \
    --tf32 True \
    --warmup_ratio 0.06 \
    --learning_rate ${learning_rate} \
    --weight_decay 0.1 \
    --lora_alpha ${lora_alpha} \
    --lora_r ${lora_r} \
    --report_to none | tee "${output_dir}/log.txt"
#!/bin/bash
#SBATCH -p gpu-rtx6k
#SBATCH -A h2lab
#SBATCH --nodes=1 # Number of nodes
#SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case)
#SBATCH --cpus-per-task=8 # Number of CPU cores per task
#SBATCH --mem=64G # Memory per node (total memory)
#SBATCH --gres=gpu:1 # Number of GPUs requested
#SBATCH --time=48:00:00 # Walltime (hh:mm:ss)

# Launches run_minus_training.py without --apply_lora.
#
# Usage:
#   train_ft.sh
#   train_ft.sh MODEL_NAME TASK_NAME NUM_EPOCHS LEARNING_RATE BATCH_SIZE
if [ "$#" -eq 0 ]; then
    model_name='roberta-base'
    task_name=sst2
    num_epochs=30
    learning_rate=2e-5
    batch_size=32
elif [ "$#" -eq 5 ]; then
    model_name=$1
    task_name=$2
    num_epochs=$3
    learning_rate=$4
    batch_size=$5
else
    # Any other argument count used to fall through with every variable unset.
    echo "Usage: $0 [model_name task_name num_epochs learning_rate batch_size]" >&2
    exit 1
fi

lora_alpha=16
lora_r=8
student_param_tuning_config=q:0-11,v:0-11,i:0-11
suffix=''
# adapter_type was referenced in output_dir but never set, which produced paths
# such as "output/<model>__<task>". An exported value is still honoured.
adapter_type="${adapter_type:-ft}"

if [ -d "$model_name" ]; then
    # model_name is a local checkpoint directory: store results beneath it.
    output_dir="${model_name}/finetuned/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
else
    output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
fi

echo "$output_dir"
mkdir -p "$output_dir"

python run_minus_training.py \
    --output_dir "${output_dir}" \
    --task_name ${task_name} \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy no \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 1000 \
    --eval_steps 5000 \
    --max_seq_length 128 \
    --num_train_epochs ${num_epochs} \
    --per_device_train_batch_size ${batch_size} \
    --per_device_eval_batch_size ${batch_size} \
    --student_param_tuning_config ${student_param_tuning_config} \
    --warmup_ratio 0.06 \
    --learning_rate ${learning_rate} \
    --weight_decay 0.1 \
    --lora_alpha ${lora_alpha} \
    --lora_r ${lora_r} \
    --report_to none | tee "${output_dir}/log.txt"
h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gpus=a100:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | model_name='roberta-base' 12 | task_name=qnli 13 | adapter_type=lora 14 | 15 | if [ "$#" -eq 0 ]; then 16 | num_epochs=25 17 | batch_size=32 18 | lora_r=8 19 | lora_alpha=16 20 | learning_rate=4e-4 21 | seed=128 22 | elif [ "$#" -eq 6 ]; then 23 | num_epochs=$1 24 | batch_size=$2 25 | lora_r=$3 26 | lora_alpha=$4 27 | learning_rate=$5 28 | seed=$6 29 | fi 30 | 31 | teacher_param_tuning_config=q:0-11,v:0-11 32 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}" 33 | echo $output_dir 34 | mkdir -p $output_dir 35 | 36 | python run_minus_training.py \ 37 | --output_dir ${output_dir}\ 38 | --task_name ${task_name} \ 39 | --model_name_or_path ${model_name} \ 40 | --do_train \ 41 | --do_eval \ 42 | --save_strategy epoch \ 43 | --evaluation_strategy steps \ 44 | --logging_strategy steps \ 45 | --logging_steps 1000 \ 46 | --log_level info \ 47 | --log_level_replica info \ 48 | --eval_steps 5000 \ 49 | --max_seq_length 128 \ 50 | --num_train_epochs ${num_epochs} \ 51 | --per_device_train_batch_size ${batch_size} \ 52 | --per_device_eval_batch_size ${batch_size} \ 53 | --warmup_ratio 0.06\ 54 | --learning_rate ${learning_rate}\ 55 | --weight_decay 0.1\ 56 | --seed ${seed} \ 57 | --apply_lora \ 58 | --lora_alpha ${lora_alpha} \ 59 | --lora_r ${lora_r} \ 60 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 61 | --report_to none \ -------------------------------------------------------------------------------- /scripts/lora/roberta_base_qqp.sh: 
#!/bin/bash
#SBATCH -p ckpt
#SBATCH -A h2lab
#SBATCH --nodes=1 # Number of nodes
#SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case)
#SBATCH --cpus-per-task=8 # Number of CPU cores per task
#SBATCH --mem=64G # Memory per node (total memory)
#SBATCH --gpus=a100:1 # Number of GPUs requested
#SBATCH --time=24:00:00 # Walltime (hh:mm:ss)

# LoRA training launcher for roberta-base on QQP.
#
# Usage:
#   roberta_base_qqp.sh
#   roberta_base_qqp.sh NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED
model_name='roberta-base'
task_name=qqp
adapter_type=lora

if [ "$#" -eq 0 ]; then
    num_epochs=25
    batch_size=32
    lora_r=8
    lora_alpha=16
    learning_rate=5e-4
    seed=128
elif [ "$#" -eq 6 ]; then
    num_epochs=$1
    batch_size=$2
    lora_r=$3
    lora_alpha=$4
    learning_rate=$5
    seed=$6
else
    # Any other argument count used to fall through with every variable unset.
    echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
    exit 1
fi

teacher_param_tuning_config=q:0-11,v:0-11
output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
echo "$output_dir"
mkdir -p "$output_dir"

# The final flag no longer ends in a line-continuation backslash: at end of
# file it either swallowed whatever was appended next or, without a trailing
# newline, was handed to the program as a stray "\" argument.
python run_minus_training.py \
    --output_dir "${output_dir}" \
    --task_name ${task_name} \
    --model_name_or_path ${model_name} \
    --do_train \
    --do_eval \
    --save_strategy epoch \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 1000 \
    --log_level info \
    --log_level_replica info \
    --eval_steps 5000 \
    --max_seq_length 128 \
    --num_train_epochs ${num_epochs} \
    --per_device_train_batch_size ${batch_size} \
    --per_device_eval_batch_size ${batch_size} \
    --warmup_ratio 0.06 \
    --learning_rate ${learning_rate} \
    --weight_decay 0.1 \
    --seed ${seed} \
    --apply_lora \
    --lora_alpha ${lora_alpha} \
    --lora_r ${lora_r} \
    --teacher_param_tuning_config ${teacher_param_tuning_config} \
    --report_to none
# Upper bound on the number of GPUs to benchmark; fewer are used when the
# machine exposes fewer CUDA devices.
NUM_GPUS = 8


def main():
    """Benchmark one model on each visible GPU and report the mean latency.

    The command line is replaced by a fixed argument list, so the run does not
    depend on how the script was invoked.

    Returns:
        Dict mapping GPU index to ``bench_latency(...)['t_mean'] * 1000``
        (presumably milliseconds -- confirm the unit of ``t_mean``).

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    sys.argv = [
        'neuron_importance.py',
        '--output_dir',
        './output/neuron_importance/',
        '--model_name_or_path',
        'roberta-base',
        '--task_name',
        'mnli',
        '--do_train',
        '--do_eval',
        '--max_seq_length',
        '128',
        '--per_device_train_batch_size',
        '32',
        '--per_device_eval_batch_size',
        '32',
        '--apply_lora',
        '--do_distill',
        '--lora_r',
        '64',
    ]
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
    # sys.argv was just overwritten with the fixed list above, so the former
    # "single .json argument" branch could never run and has been dropped.
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # training_args.disable_tqdm = False
    t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)
    config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets)

    # Do not assume an 8-GPU node: only touch devices that actually exist.
    num_devices = min(NUM_GPUS, torch.cuda.device_count())
    if num_devices == 0:
        raise RuntimeError('test_gpu_base_speed requires at least one CUDA device')

    results = {}
    for i in range(num_devices):
        model.cuda(i)
        results[i] = bench_latency(model, 128, 128, tokenizer)['t_mean'] * 1000
        # Previously the measurements were computed and silently discarded.
        print(f'GPU {i}: {results[i]}')
    return results


if __name__ == '__main__':
    main()
#!/bin/bash
#SBATCH -p gpu-rtx6k
#SBATCH -A h2lab
#SBATCH --nodes=1 # Number of nodes
#SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case)
#SBATCH --cpus-per-task=8 # Number of CPU cores per task
#SBATCH --mem=64G # Memory per node (total memory)
#SBATCH --gres=gpu:1 # Number of GPUs requested
#SBATCH --time=48:00:00 # Walltime (hh:mm:ss)

# LoRA training launcher for run_minus_squad_training.py with
# --version_2_with_negative.
#
# Usage:
#   train_lora_squadv2.sh
#   train_lora_squadv2.sh MODEL_NAME LORA_R LORA_ALPHA LEARNING_RATE TEACHER_PARAM_TUNING_CONFIG
if [ "$#" -eq 0 ]; then
    model_name='bert-base-uncased'
    lora_r=8
    lora_alpha=16
    learning_rate=2e-4
    teacher_param_tuning_config=q:0-11,v:0-11
elif [ "$#" -eq 5 ]; then
    model_name=$1
    lora_r=$2
    lora_alpha=$3
    learning_rate=$4
    teacher_param_tuning_config=$5
else
    # Any other argument count used to fall through with every variable unset.
    echo "Usage: $0 [model_name lora_r lora_alpha learning_rate teacher_param_tuning_config]" >&2
    exit 1
fi

adapter_type=lora
num_epochs=30
batch_size=32
suffix=''
# task_name was referenced in output_dir but never set, which produced paths
# such as "output/<model>_lora_/...". An exported value is still honoured.
task_name="${task_name:-squadv2}"

if [ -d "$model_name" ]; then
    # model_name is a local checkpoint directory: store results beneath it.
    output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
else
    output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
fi

echo "$output_dir"
mkdir -p "$output_dir"

python run_minus_squad_training.py \
    --output_dir "${output_dir}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy no \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 1000 \
    --eval_steps 5000 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --version_2_with_negative \
    --num_train_epochs ${num_epochs} \
    --per_device_train_batch_size ${batch_size} \
    --per_device_eval_batch_size ${batch_size} \
    --teacher_param_tuning_config ${teacher_param_tuning_config} \
    --warmup_ratio 0.06 \
    --learning_rate ${learning_rate} \
    --weight_decay 0.1 \
    --apply_lora \
    --lora_alpha ${lora_alpha} \
    --lora_r ${lora_r} \
    --report_to none | tee "${output_dir}/log.txt"
| num_epochs=30 29 | batch_size=32 30 | suffix='' 31 | 32 | if [ -d $model_name ] 33 | then 34 | output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}" 35 | else 36 | output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}" 37 | fi 38 | 39 | echo $output_dir 40 | mkdir -p $output_dir 41 | 42 | python run_minus_training.py \ 43 | --output_dir ${output_dir}\ 44 | --task_name ${task_name} \ 45 | --model_name_or_path ${model_name} \ 46 | --do_train \ 47 | --do_eval \ 48 | --save_strategy no \ 49 | --evaluation_strategy steps \ 50 | --logging_strategy steps \ 51 | --logging_steps 100 \ 52 | --eval_steps 500 \ 53 | --max_seq_length 128 \ 54 | --num_train_epochs ${num_epochs} \ 55 | --per_device_train_batch_size ${batch_size} \ 56 | --per_device_eval_batch_size ${batch_size} \ 57 | --tf32 True \ 58 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 59 | --warmup_ratio 0.06\ 60 | --learning_rate ${learning_rate}\ 61 | --weight_decay 0.1\ 62 | --apply_lora \ 63 | --lora_alpha ${lora_alpha} \ 64 | --lora_r ${lora_r} \ 65 | --report_to none | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /scripts/train_lora_seq2seq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=48:00:00 # Walltime (hh:mm:ss) 10 | 11 | if [ "$#" -eq 0 ]; then 12 | model_name='bert-base-uncased' 13 | lora_r=8 14 | lora_alpha=16 15 | learning_rate=1e-4 16 | 
teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11 17 | elif [ "$#" -eq 5 ]; then 18 | model_name=$1 19 | lora_r=$2 20 | lora_alpha=$3 21 | learning_rate=$4 22 | teacher_param_tuning_config=$5 23 | fi 24 | 25 | adapter_type=lora 26 | task_name=cnndm 27 | num_epochs=10 28 | batch_size=16 29 | suffix='' 30 | 31 | if [ -d $model_name ] 32 | then 33 | output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}" 34 | else 35 | output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}" 36 | fi 37 | 38 | echo $output_dir 39 | mkdir -p $output_dir 40 | 41 | python run_minus_seq2seq_training.py \ 42 | --output_dir ${output_dir}\ 43 | --model_name_or_path ${model_name} \ 44 | --do_train \ 45 | --do_eval \ 46 | --save_strategy no \ 47 | --evaluation_strategy steps \ 48 | --logging_strategy steps \ 49 | --logging_steps 1000 \ 50 | --eval_steps 5000 \ 51 | --task_name ${task_name} \ 52 | --max_input_length 512 \ 53 | --max_target_length 128 \ 54 | --num_train_epochs ${num_epochs} \ 55 | --per_device_train_batch_size ${batch_size} \ 56 | --per_device_eval_batch_size ${batch_size} \ 57 | --tf32 True \ 58 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 59 | --warmup_ratio 0.06\ 60 | --learning_rate ${learning_rate}\ 61 | --weight_decay 0.1\ 62 | --apply_lora \ 63 | --lora_alpha ${lora_alpha} \ 64 | --lora_r ${lora_r} \ 65 | --report_to none | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /scripts/train_lora_sft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p ckpt 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node 
(total memory) 8 | #SBATCH --gpus=a100:1 # Number of GPUs requested 9 | #SBATCH --time=24:00:00 # Walltime (hh:mm:ss) 10 | 11 | if [ "$#" -eq 0 ]; then 12 | model_name='bert-base-uncased' 13 | lora_r=8 14 | lora_alpha=16 15 | learning_rate=1e-4 16 | teacher_param_tuning_config=dq:0-31,dv:0-31 17 | elif [ "$#" -eq 5 ]; then 18 | model_name=$1 19 | lora_r=$2 20 | lora_alpha=$3 21 | learning_rate=$4 22 | teacher_param_tuning_config=$5 23 | fi 24 | 25 | adapter_type=lora 26 | task_name=alpaca_gpt4 27 | num_epochs=2 28 | batch_size=4 29 | suffix='' 30 | 31 | if [ -d $model_name ] 32 | then 33 | output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}" 34 | else 35 | output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}" 36 | fi 37 | 38 | echo $output_dir 39 | mkdir -p $output_dir 40 | 41 | python run_llama_sft.py \ 42 | --output_dir ${output_dir}\ 43 | --task_name ${task_name} \ 44 | --model_name_or_path ${model_name} \ 45 | --bf16 True \ 46 | --data_path 'data/sft/alpaca_data_cleaned.json' \ 47 | --do_train \ 48 | --do_eval \ 49 | --save_strategy no \ 50 | --evaluation_strategy steps \ 51 | --logging_strategy steps \ 52 | --logging_steps 100 \ 53 | --eval_steps 500 \ 54 | --model_max_length 512 \ 55 | --num_train_epochs ${num_epochs} \ 56 | --per_device_train_batch_size ${batch_size} \ 57 | --per_device_eval_batch_size ${batch_size} \ 58 | --gradient_accumulation_steps 8 \ 59 | --warmup_ratio 0.03\ 60 | --learning_rate ${learning_rate}\ 61 | --weight_decay 0.\ 62 | --lr_scheduler_type cosine \ 63 | --tf32 True \ 64 | --apply_lora \ 65 | --lora_alpha ${lora_alpha} \ 66 | --lora_r ${lora_r} \ 67 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 68 | --report_to none | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /scripts/lora/mt5_base_wmt_enro.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=144:00:00 # Walltime (hh:mm:ss) 10 | 11 | model_name=google/mt5-base 12 | task_name=wmt16 13 | adapter_type=lora 14 | source_lang=en 15 | target_lang=ro 16 | 17 | if [ "$#" -eq 0 ]; then 18 | num_epochs=2 19 | batch_size=16 20 | lora_r=102 21 | lora_alpha=408 22 | learning_rate=5e-5 23 | seed=42 24 | elif [ "$#" -eq 6 ]; then 25 | num_epochs=$1 26 | batch_size=$2 27 | lora_r=$3 28 | lora_alpha=$4 29 | learning_rate=$5 30 | seed=$6 31 | fi 32 | 33 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11 34 | output_dir="output/${model_name}/${task_name}_${source_lang}-${target_lang}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}" 35 | echo $output_dir 36 | mkdir -p $output_dir 37 | 38 | python run_minus_seq2seq_training.py \ 39 | --output_dir ${output_dir}\ 40 | --task_name ${task_name} \ 41 | --model_name_or_path ${model_name} \ 42 | --do_train \ 43 | --do_eval \ 44 | --save_strategy epoch \ 45 | --evaluation_strategy steps \ 46 | --logging_strategy steps \ 47 | --logging_steps 500 \ 48 | --eval_steps 2000 \ 49 | --max_input_length 150 \ 50 | --max_target_length 150 \ 51 | --lang_pair ${target_lang}-${source_lang} \ 52 | --source_lang ${source_lang} \ 53 | --target_lang ${target_lang} \ 54 | --num_train_epochs ${num_epochs} \ 55 | --per_device_train_batch_size ${batch_size} \ 56 | --per_device_eval_batch_size ${batch_size} \ 57 | --warmup_ratio 0.06\ 58 | --learning_rate ${learning_rate}\ 59 | --weight_decay 0.01\ 60 | --apply_lora \ 
61 | --lora_alpha ${lora_alpha} \ 62 | --lora_r ${lora_r} \ 63 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 64 | --tf32 True \ 65 | --fp16 True \ 66 | --seed ${seed} \ 67 | --report_to none | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /scripts/lora/mt5_base_wmt_roen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=144:00:00 # Walltime (hh:mm:ss) 10 | 11 | model_name=google/mt5-base 12 | task_name=wmt16 13 | adapter_type=lora 14 | source_lang=ro 15 | target_lang=en 16 | 17 | if [ "$#" -eq 0 ]; then 18 | num_epochs=5 19 | batch_size=16 20 | lora_r=8 21 | lora_alpha=16 22 | learning_rate=1e-4 23 | seed=42 24 | elif [ "$#" -eq 6 ]; then 25 | num_epochs=$1 26 | batch_size=$2 27 | lora_r=$3 28 | lora_alpha=$4 29 | learning_rate=$5 30 | seed=$6 31 | fi 32 | 33 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11 34 | output_dir="output/${model_name}/${task_name}_${source_lang}-${target_lang}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}" 35 | echo $output_dir 36 | mkdir -p $output_dir 37 | 38 | python run_minus_seq2seq_training.py \ 39 | --output_dir ${output_dir}\ 40 | --task_name ${task_name} \ 41 | --model_name_or_path ${model_name} \ 42 | --do_train \ 43 | --do_eval \ 44 | --save_strategy epoch \ 45 | --evaluation_strategy steps \ 46 | --logging_strategy steps \ 47 | --logging_steps 500 \ 48 | --eval_steps 2000 \ 49 | --max_input_length 150 \ 50 | --max_target_length 150 \ 51 | --lang_pair
${target_lang}-${source_lang} \ 52 | --source_lang ${source_lang} \ 53 | --target_lang ${target_lang} \ 54 | --num_train_epochs ${num_epochs} \ 55 | --per_device_train_batch_size ${batch_size} \ 56 | --per_device_eval_batch_size ${batch_size} \ 57 | --warmup_ratio 0.06\ 58 | --learning_rate ${learning_rate}\ 59 | --weight_decay 0.01\ 60 | --label_smoothing 0.1 \ 61 | --apply_lora \ 62 | --lora_alpha ${lora_alpha} \ 63 | --lora_r ${lora_r} \ 64 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 65 | --tf32 True \ 66 | --fp16 True \ 67 | --seed ${seed} \ 68 | --report_to none | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /models/model_args.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from dataclasses import dataclass, field 3 | 4 | @dataclass 5 | class ModelArguments: 6 | """ 7 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
8 | """ 9 | 10 | model_name_or_path: str = field( 11 | default=None, 12 | metadata={ 13 | "help": "Path to pretrained model or model identifier from huggingface.co/models"} 14 | ) 15 | config_name: Optional[str] = field( 16 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 17 | ) 18 | tokenizer_name: Optional[str] = field( 19 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 20 | ) 21 | cache_dir: Optional[str] = field( 22 | default=None, 23 | metadata={ 24 | "help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 25 | ) 26 | use_fast_tokenizer: bool = field( 27 | default=True, 28 | metadata={ 29 | "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 30 | ) 31 | model_revision: str = field( 32 | default="main", 33 | metadata={ 34 | "help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 35 | ) 36 | use_auth_token: bool = field( 37 | default=False, 38 | metadata={ 39 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 40 | "with private models)." 41 | }, 42 | ) 43 | apply_lora: bool = field( 44 | default=False, 45 | metadata={ 46 | "help": "Whether to apply LORA to the model or not." 47 | }, 48 | ) 49 | lora_alpha: int = field( 50 | default=16, 51 | metadata={ 52 | "help": "The alpha value for LoRA." 53 | }, 54 | ) 55 | lora_r: int = field( 56 | default=8, 57 | metadata={ 58 | "help": "The r value for LoRA." 59 | }, 60 | ) 61 | do_auto_pruning: bool = field( 62 | default=False, 63 | metadata={ 64 | "help": "Whether to apply auto pruning to the model when loading or not." 
65 | } 66 | ) -------------------------------------------------------------------------------- /scripts/hyperparameter_searching/test_training_hypers.sh: -------------------------------------------------------------------------------- 1 | for teacher_loss_alpha in 0.2 0.3 0.4 0.5 0.6 0.7 2 | do 3 | for distill_loss_alpha in 0.4 4 | do 5 | distill_ce_loss_alpha=$(echo "1 - ${distill_loss_alpha}" | bc -l) 6 | mac_constraint=0.5 7 | model_name='roberta-base' 8 | adapter_type=lora 9 | pruning_scheduler=once 10 | pruner_type=global 11 | task_name=mnli 12 | lora_alpha=16 13 | lora_r=64 14 | pruning_batches=256 15 | pruning_batch_size=4 16 | steppoint=1.0 17 | 18 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_distill_full_hyperexp/mac${mac_constraint}/teacher${teacher_loss_alpha}/ce${distill_ce_loss_alpha}" 19 | echo $output_dir 20 | mkdir -p $output_dir 21 | 22 | python run_minus_training.py \ 23 | --output_dir ${output_dir}\ 24 | --task_name ${task_name} \ 25 | --model_name_or_path ${model_name} \ 26 | --do_train \ 27 | --do_eval \ 28 | --save_strategy no \ 29 | --evaluation_strategy steps \ 30 | --minus_scheduler \ 31 | --max_seq_length 128 \ 32 | --num_train_epochs 10 \ 33 | --per_device_train_batch_size 32 \ 34 | --per_device_eval_batch_size 32 \ 35 | --lr_scheduler_type linear\ 36 | --warmup_ratio 0.06\ 37 | --learning_rate 5e-4\ 38 | --weight_decay 0.1\ 39 | --apply_lora \ 40 | --lora_alpha ${lora_alpha} \ 41 | --lora_r ${lora_r} \ 42 | --report_to none \ 43 | --pruning_batches ${pruning_batches} \ 44 | --pruning_batch_size ${pruning_batch_size} \ 45 | --mac_constraint ${mac_constraint} \ 46 | --pruning_scheduler ${pruning_scheduler} \ 47 | --pruning_start ${steppoint} \ 48 | --head_scorer_type gradient_l2 \ 49 | --intermediate_scorer_type gradient_l2 \ 50 | --pruner_type ${pruner_type} \ 51 | --do_distill \ 52 | --teacher_loss_alpha ${teacher_loss_alpha} \ 53 | --distill_loss_alpha ${distill_loss_alpha} \ 
54 | --distill_ce_loss_alpha ${distill_ce_loss_alpha} \ 55 | --distill_epoch 8 56 | done 57 | done -------------------------------------------------------------------------------- /scripts/train_lora_wmt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=48:00:00 # Walltime (hh:mm:ss) 10 | 11 | if [ "$#" -eq 0 ]; then 12 | model_name='bert-base-uncased' 13 | lora_r=8 14 | lora_alpha=16 15 | learning_rate=1e-4 16 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11 17 | elif [ "$#" -eq 5 ]; then 18 | model_name=$1 19 | lora_r=$2 20 | lora_alpha=$3 21 | learning_rate=$4 22 | teacher_param_tuning_config=$5 23 | fi 24 | 25 | adapter_type=lora 26 | num_epochs=30 27 | batch_size=8 28 | suffix='' 29 | task_name=wmt16 30 | source_lang=en 31 | target_lang=ro 32 | 33 | if [ -d $model_name ] 34 | then 35 | output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}" 36 | else 37 | output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}" 38 | fi 39 | 40 | echo $output_dir 41 | mkdir -p $output_dir 42 | 43 | python run_minus_seq2seq_training.py \ 44 | --output_dir ${output_dir}\ 45 | --model_name_or_path ${model_name} \ 46 | --do_train \ 47 | --do_eval \ 48 | --save_strategy no \ 49 | --evaluation_strategy steps \ 50 | --logging_strategy steps \ 51 | --logging_steps 1000 \ 52 | --eval_steps 5000 \ 53 | --task_name ${task_name} \ 54 | --max_input_length 256 \ 55 | --max_target_length 256 \ 56 | --lang_pair ${target_lang}-${source_lang} \ 57 | 
--source_lang ${source_lang} \ 58 | --target_lang ${target_lang} \ 59 | --max_input_length 512 \ 60 | --max_target_length 128 \ 61 | --num_train_epochs ${num_epochs} \ 62 | --per_device_train_batch_size ${batch_size} \ 63 | --per_device_eval_batch_size ${batch_size} \ 64 | --tf32 True \ 65 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 66 | --warmup_ratio 0.06\ 67 | --learning_rate ${learning_rate}\ 68 | --weight_decay 0.1\ 69 | --apply_lora \ 70 | --lora_alpha ${lora_alpha} \ 71 | --lora_r ${lora_r} \ 72 | --report_to none | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /scripts/eval/wmt_enro.sh: -------------------------------------------------------------------------------- 1 | model_name=$1 2 | 3 | mac_constraint=0.4 4 | lora_r=8 5 | pruning_start=-1 6 | pruning_scheduler=cubic_gradual 7 | pruner_type=none 8 | param_allocation_strategy=running_fisher 9 | distillation_type=self_momentum 10 | distill_mapping_strategy=dynamic_block_teacher_dynamic_student 11 | 12 | 13 | task_name=wmt16 14 | adapter_type=lora 15 | source_lang=en 16 | target_lang=ro 17 | param_resizing_strategy=tophalf_limited 18 | pruning_start=-1 19 | pruning_stop=3 20 | distill_start=-1 # about 60%, between 3.4 and 3.8, but after 3.6 where the teacher is updated 21 | distill_epoch=5 22 | pruning_batches=64 23 | num_prunings=10 24 | pruning_batch_size=4 25 | # pre_pruning_tuning_epochs=1 26 | pre_pruning_tuning_steps=200 27 | sparsity_warmup_epochs=1 28 | 29 | learning_rate=1e-3 30 | training_batch_size=16 31 | num_train_epochs=10 32 | warmup_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11 33 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11 34 | student_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11 35 | 36 | 37 | output_dir="${model_name}/eval" 38 | echo $output_dir 39 | mkdir -p $output_dir 40 | 
41 | python run_minus_seq2seq_training.py \ 42 | --output_dir ${output_dir}\ 43 | --model_name_or_path ${model_name} \ 44 | --do_eval \ 45 | --save_strategy no \ 46 | --evaluation_strategy steps \ 47 | --logging_strategy steps \ 48 | --eval_steps 5000 \ 49 | --logging_steps 1000 \ 50 | --log_level info \ 51 | --log_level_replica info \ 52 | --minus_scheduler \ 53 | --task_name ${task_name} \ 54 | --max_input_length 256 \ 55 | --max_target_length 256 \ 56 | --lang_pair ${target_lang}-${source_lang} \ 57 | --source_lang ${source_lang} \ 58 | --target_lang ${target_lang} \ 59 | --num_train_epochs ${num_train_epochs} \ 60 | --per_device_train_batch_size ${training_batch_size} \ 61 | --per_device_eval_batch_size ${training_batch_size} \ 62 | --tf32 True \ 63 | --lr_scheduler_type linear\ 64 | --distillation_type ${distillation_type} \ 65 | --distill_mapping_strategy ${distill_mapping_strategy} \ 66 | --warmup_ratio 0.06\ 67 | --learning_rate ${learning_rate}\ 68 | --weight_decay 0.1\ 69 | --seed 128 \ 70 | --apply_lora \ 71 | --lora_alpha 16 \ 72 | --lora_r ${lora_r} \ 73 | --report_to none \ 74 | --pruner_type none \ 75 | | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /test/test_t5_efficiency.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["WANDB_DISABLED"] = "true" 3 | import sys 4 | 5 | from transformers import HfArgumentParser 6 | from args import Seq2SeqDataTrainingArguments 7 | from models import build_model 8 | from models.model_args import ModelArguments 9 | from utils.utils import * 10 | from args import MinusTrainingArguments 11 | from utils.minus_utils import efficiency_testing, input_constructor 12 | 13 | def main(): 14 | sys.argv = ['test_t5.py', 15 | '--output_dir', 16 | './output/test_t5_grafting/', 17 | '--model_name_or_path', 18 | 
'output/t5-large_lora_minus_cnndm_once_global_free_inout_nodistill/mac0.4/epoch5/bz16/numprune5/parameq:0-23,ev:0-23,dq:0-23,dv:0-23,cq:0-23,cv:0-23,ei:0-23,di:0-23/lora_r8/lora_alpha16/best_model', 19 | '--task_name', 20 | 'cnndm', 21 | '--do_train', 22 | '--do_eval', 23 | '--max_input_length', 24 | '512', 25 | '--max_target_length', 26 | '128', 27 | '--per_device_train_batch_size', 28 | '32', 29 | '--per_device_eval_batch_size', 30 | '32', 31 | '--eval_accumulation_steps', 32 | '1', 33 | '--lora_r', 34 | '8', 35 | '--lora_alpha', 36 | '16', 37 | '--apply_lora', 38 | '--pruner_type', 39 | 'global', 40 | '--head_scorer_type', 41 | 'gradient_l2', 42 | '--intermediate_scorer_type', 43 | 'gradient_l2', 44 | '--pruning_batch_size', 45 | '4', 46 | '--pruning_batches', 47 | '64', 48 | '--pruning_scheduler', 49 | 'once', 50 | ] 51 | parser = HfArgumentParser( 52 | (ModelArguments, Seq2SeqDataTrainingArguments, MinusTrainingArguments)) 53 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 54 | # If we pass only one argument to the script and it's the path to a json file, 55 | # let's parse it to get our arguments. 
56 | model_args, data_args, training_args = parser.parse_json_file( 57 | json_file=os.path.abspath(sys.argv[1])) 58 | else: 59 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 60 | 61 | config, tokenizer, model = build_model(model_args, data_args, training_args) 62 | 63 | efficiency_results = efficiency_testing(model, tokenizer, training_args.device) -------------------------------------------------------------------------------- /test/test_t5_prune_consistency.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["WANDB_DISABLED"] = "true" 3 | import sys 4 | 5 | from transformers import HfArgumentParser 6 | from args import Seq2SeqDataTrainingArguments 7 | from models import build_model 8 | from models.model_args import ModelArguments 9 | from utils.utils import * 10 | from args import MinusTrainingArguments 11 | from utils.minus_utils import efficiency_testing, input_constructor 12 | 13 | def main(): 14 | sys.argv = ['test_t5.py', 15 | '--output_dir', 16 | './output/test_t5_grafting/', 17 | '--model_name_or_path', 18 | 'output/t5-large_lora_minus_xsum_once_global_free_inout_nodistill/mac0.05/epoch3/bz4/numprune3/parameq:0-23,ev:0-23,dq:0-23,dv:0-23,cq:0-23,cv:0-23,ei:0-23,di:0-23/lora_r8/prunestart0.01/pre_pruning_model', 19 | '--task_name', 20 | 'xsum', 21 | '--do_train', 22 | '--do_eval', 23 | '--max_input_length', 24 | '936', 25 | '--max_target_length', 26 | '38', 27 | '--per_device_train_batch_size', 28 | '32', 29 | '--per_device_eval_batch_size', 30 | '32', 31 | '--eval_accumulation_steps', 32 | '1', 33 | '--lora_r', 34 | '8', 35 | '--lora_alpha', 36 | '16', 37 | '--apply_lora', 38 | '--pruner_type', 39 | 'global', 40 | '--head_scorer_type', 41 | 'gradient_l2', 42 | '--intermediate_scorer_type', 43 | 'gradient_l2', 44 | '--pruning_batch_size', 45 | '4', 46 | '--pruning_batches', 47 | '64', 48 | '--pruning_scheduler', 49 | 'once', 50 | ] 51 | parser = HfArgumentParser( 52 
| (ModelArguments, Seq2SeqDataTrainingArguments, MinusTrainingArguments)) 53 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 54 | # If we pass only one argument to the script and it's the path to a json file, 55 | # let's parse it to get our arguments. 56 | model_args, data_args, training_args = parser.parse_json_file( 57 | json_file=os.path.abspath(sys.argv[1])) 58 | else: 59 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 60 | 61 | config, tokenizer, model = build_model(model_args, data_args, training_args) 62 | 63 | efficiency_results = efficiency_testing(model, tokenizer, training_args.device) -------------------------------------------------------------------------------- /eval/mmlu/categories.py: -------------------------------------------------------------------------------- 1 | subcategories = { 2 | "abstract_algebra": ["math"], 3 | "anatomy": ["health"], 4 | "astronomy": ["physics"], 5 | "business_ethics": ["business"], 6 | "clinical_knowledge": ["health"], 7 | "college_biology": ["biology"], 8 | "college_chemistry": ["chemistry"], 9 | "college_computer_science": ["computer science"], 10 | "college_mathematics": ["math"], 11 | "college_medicine": ["health"], 12 | "college_physics": ["physics"], 13 | "computer_security": ["computer science"], 14 | "conceptual_physics": ["physics"], 15 | "econometrics": ["economics"], 16 | "electrical_engineering": ["engineering"], 17 | "elementary_mathematics": ["math"], 18 | "formal_logic": ["philosophy"], 19 | "global_facts": ["other"], 20 | "high_school_biology": ["biology"], 21 | "high_school_chemistry": ["chemistry"], 22 | "high_school_computer_science": ["computer science"], 23 | "high_school_european_history": ["history"], 24 | "high_school_geography": ["geography"], 25 | "high_school_government_and_politics": ["politics"], 26 | "high_school_macroeconomics": ["economics"], 27 | "high_school_mathematics": ["math"], 28 | "high_school_microeconomics": ["economics"], 29 | 
"high_school_physics": ["physics"], 30 | "high_school_psychology": ["psychology"], 31 | "high_school_statistics": ["math"], 32 | "high_school_us_history": ["history"], 33 | "high_school_world_history": ["history"], 34 | "human_aging": ["health"], 35 | "human_sexuality": ["culture"], 36 | "international_law": ["law"], 37 | "jurisprudence": ["law"], 38 | "logical_fallacies": ["philosophy"], 39 | "machine_learning": ["computer science"], 40 | "management": ["business"], 41 | "marketing": ["business"], 42 | "medical_genetics": ["health"], 43 | "miscellaneous": ["other"], 44 | "moral_disputes": ["philosophy"], 45 | "moral_scenarios": ["philosophy"], 46 | "nutrition": ["health"], 47 | "philosophy": ["philosophy"], 48 | "prehistory": ["history"], 49 | "professional_accounting": ["other"], 50 | "professional_law": ["law"], 51 | "professional_medicine": ["health"], 52 | "professional_psychology": ["psychology"], 53 | "public_relations": ["politics"], 54 | "security_studies": ["politics"], 55 | "sociology": ["culture"], 56 | "us_foreign_policy": ["politics"], 57 | "virology": ["health"], 58 | "world_religions": ["philosophy"], 59 | } 60 | 61 | categories = { 62 | "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"], 63 | "humanities": ["history", "philosophy", "law"], 64 | "social sciences": ["politics", "culture", "economics", "geography", "psychology"], 65 | "other (business, health, misc.)": ["other", "business", "health"], 66 | } 67 | -------------------------------------------------------------------------------- /scripts/adaptpruning_nodistill/t5_base_lm_adapt_mnli.sh: -------------------------------------------------------------------------------- 1 | if [ "$#" -eq 0 ]; then 2 | mac_constraint=0.4 3 | lora_r=8 4 | pruning_start=1 5 | pruning_scheduler=once 6 | pruner_type=global 7 | param_allocation_strategy=none 8 | elif [ "$#" -eq 6 ]; then 9 | mac_constraint=$1 10 | lora_r=$2 11 | pruning_start=$3 12 | pruning_scheduler=$4 13 | 
pruner_type=$5 14 | param_allocation_strategy=$6 15 | elif [ "$#" -eq 7 ]; then 16 | mac_constraint=$1 17 | lora_r=$2 18 | pruning_start=$3 19 | pruning_scheduler=$4 20 | pruner_type=$5 21 | param_allocation_strategy=$6 22 | gpu_id=$7 23 | export CUDA_VISIBLE_DEVICES=$gpu_id 24 | fi 25 | 26 | model_name=google/t5-base-lm-adapt 27 | task_name=mnli 28 | adapter_type=lora 29 | continuous_alloc_interval=1 30 | pruning_batches=64 31 | num_prunings=5 32 | pruning_batch_size=4 33 | 34 | learning_rate=1e-3 35 | training_batch_size=32 36 | num_train_epochs=40 37 | pruning_stop=30 38 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11 39 | 40 | 41 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}" 42 | echo $output_dir 43 | mkdir -p $output_dir 44 | 45 | python run_minus_training.py \ 46 | --output_dir ${output_dir}\ 47 | --model_name_or_path ${model_name} \ 48 | --do_train \ 49 | --do_eval \ 50 | --save_strategy no \ 51 | --evaluation_strategy steps \ 52 | --logging_strategy steps \ 53 | --eval_steps 5000 \ 54 | --logging_steps 5000 \ 55 | --minus_scheduler \ 56 | --task_name ${task_name} \ 57 | --max_seq_length 128 \ 58 | --num_train_epochs ${num_train_epochs} \ 59 | --per_device_train_batch_size ${training_batch_size} \ 60 | --per_device_eval_batch_size ${training_batch_size} \ 61 | --lr_scheduler_type linear\ 62 | --warmup_ratio 0.06\ 63 | --learning_rate ${learning_rate}\ 64 | --weight_decay 0.1\ 65 | --apply_lora \ 66 | --lora_alpha 16 \ 67 | --lora_r ${lora_r} \ 68 | --report_to none \ 69 | --pruning_batches ${pruning_batches} \ 70 | --pruning_batch_size ${pruning_batch_size} \ 71 | --mac_constraint ${mac_constraint} \ 72 | --pruning_scheduler ${pruning_scheduler} 
# Pruning-with-LoRA training run without distillation (output tagged
# "_nodistill"): google/t5-base-lm-adapt on SST-2.
#
# Usage:
#   t5_base_lm_adapt_sst2.sh
#   t5_base_lm_adapt_sst2.sh MAC_CONSTRAINT LORA_R PRUNING_START PRUNING_SCHEDULER \
#                            PRUNER_TYPE PARAM_ALLOCATION_STRATEGY [GPU_ID]
if [ "$#" -eq 0 ]; then
    mac_constraint=0.4
    lora_r=8
    pruning_start=1
    pruning_scheduler=once
    pruner_type=global
    param_allocation_strategy=none
elif [ "$#" -eq 6 ] || [ "$#" -eq 7 ]; then
    mac_constraint=$1
    lora_r=$2
    pruning_start=$3
    pruning_scheduler=$4
    pruner_type=$5
    param_allocation_strategy=$6
    if [ "$#" -eq 7 ]; then
        # Optional 7th argument restricts the CUDA devices visible to the run.
        gpu_id=$7
        export CUDA_VISIBLE_DEVICES=$gpu_id
    fi
else
    # Any other argument count previously fell through with every setting
    # unset, creating a malformed output directory before the trainer failed.
    echo "Usage: $0 [MAC_CONSTRAINT LORA_R PRUNING_START PRUNING_SCHEDULER PRUNER_TYPE PARAM_ALLOCATION_STRATEGY [GPU_ID]]" >&2
    exit 1
fi

model_name=google/t5-base-lm-adapt
task_name=sst2
adapter_type=lora
# lora_alpha used to be unset here: the output path ended in a bare
# "lora_alpha" while the trainer received a hard-coded 16. One variable now
# feeds both the path and the flag.
lora_alpha=16
continuous_alloc_interval=1  # currently unused: not forwarded to the trainer below
pruning_batches=64
num_prunings=5
pruning_batch_size=4

learning_rate=1e-3
training_batch_size=32
num_train_epochs=40
pruning_stop=30
teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11


output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
echo "$output_dir"
mkdir -p "$output_dir"

python run_minus_training.py \
    --output_dir "${output_dir}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy no \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --eval_steps 5000 \
    --logging_steps 5000 \
    --minus_scheduler \
    --task_name "${task_name}" \
    --max_seq_length 128 \
    --num_train_epochs "${num_train_epochs}" \
    --per_device_train_batch_size "${training_batch_size}" \
    --per_device_eval_batch_size "${training_batch_size}" \
    --lr_scheduler_type linear \
    --warmup_ratio 0.06 \
    --learning_rate "${learning_rate}" \
    --weight_decay 0.1 \
    --apply_lora \
    --lora_alpha "${lora_alpha}" \
    --lora_r "${lora_r}" \
    --report_to none \
    --pruning_batches "${pruning_batches}" \
    --pruning_batch_size "${pruning_batch_size}" \
    --mac_constraint "${mac_constraint}" \
    --pruning_scheduler "${pruning_scheduler}" \
    --param_allocation_strategy "${param_allocation_strategy}" \
    --teacher_param_tuning_config "${teacher_param_tuning_config}" \
    --head_scorer_type gradient_l2 \
    --intermediate_scorer_type gradient_l2 \
    --pruner_type "${pruner_type}" \
    --pruning_start "${pruning_start}" \
    --pruning_stop "${pruning_stop}" \
    --num_prunings "${num_prunings}" \
    --pruning_scheduler_strategy saliency
13 | teacher_path=textattack/roberta-base-SST-2 14 | task_name=sst2 15 | learning_rate=2e-5 16 | training_batch_size=32 17 | num_train_epochs=20 18 | distill_mapping_strategy=static_teacher_static_student 19 | elif [ "$#" -eq 3 ]; then 20 | model_name=$1 21 | teacher_path=$2 22 | task_name=$3 23 | learning_rate=2e-5 24 | training_batch_size=32 25 | num_train_epochs=20 26 | distill_mapping_strategy=static_teacher_static_student 27 | elif [ "$#" -eq 7 ]; then 28 | model_name=$1 29 | teacher_path=$2 30 | task_name=$3 31 | learning_rate=$4 32 | training_batch_size=$5 33 | num_train_epochs=$6 34 | distill_mapping_strategy=$7 35 | fi 36 | 37 | lora_r=8 38 | lora_alpha=16 39 | 40 | if [ -d $model_name ] 41 | then 42 | output_dir="${model_name}/finetune_distilled/epoch${num_train_epochs}/bz${training_batch_size}/lr${learning_rate}/${distill_mapping_strategy}" 43 | else 44 | output_dir="output/${model_name}_minus_${task_name}_mapping_${distill_mapping_strategy}_distill_fixedteacher/epoch${num_train_epochs}/bz${training_batch_size}/" 45 | fi 46 | 47 | 48 | echo $output_dir 49 | mkdir -p $output_dir 50 | 51 | python run_minus_seq2seq_training.py \ 52 | --output_dir ${output_dir}\ 53 | --task_name ${task_name} \ 54 | --model_name_or_path ${model_name} \ 55 | --do_train \ 56 | --do_eval \ 57 | --save_strategy no \ 58 | --log_level info \ 59 | --log_level_replica info \ 60 | --evaluation_strategy steps \ 61 | --logging_strategy steps \ 62 | --logging_steps 1000 \ 63 | --eval_steps 5000 \ 64 | --task_name ${task_name} \ 65 | --max_input_length 512 \ 66 | --max_target_length 128 \ 67 | --num_train_epochs ${num_train_epochs} \ 68 | --per_device_train_batch_size ${training_batch_size} \ 69 | --per_device_eval_batch_size ${training_batch_size} \ 70 | --tf32 True \ 71 | --distillation_type self_student \ 72 | --distill_mapping_strategy ${distill_mapping_strategy} \ 73 | --warmup_ratio 0.06\ 74 | --learning_rate ${learning_rate}\ 75 | --weight_decay 0.1\ 76 | --lora_alpha ${lora_alpha} 
# Pruning-with-LoRA training run without distillation (output tagged
# "_nodistill_evensmallerlr"): bert-base-uncased on MNLI.
#
# Usage:
#   bert_base_mnli.sh
#   bert_base_mnli.sh MAC_CONSTRAINT LORA_R LORA_ALPHA PRUNING_SCHEDULER \
#                     PRUNER_TYPE PARAM_ALLOCATION_STRATEGY [GPU_ID]
if [ "$#" -eq 0 ]; then
    mac_constraint=0.4
    lora_r=8
    lora_alpha=16
    pruning_scheduler=once
    pruner_type=global
    param_allocation_strategy=free_inout
elif [ "$#" -eq 6 ] || [ "$#" -eq 7 ]; then
    mac_constraint=$1
    lora_r=$2
    lora_alpha=$3
    pruning_scheduler=$4
    pruner_type=$5
    param_allocation_strategy=$6
    if [ "$#" -eq 7 ]; then
        # Optional 7th argument restricts the CUDA devices visible to the run.
        gpu_id=$7
        export CUDA_VISIBLE_DEVICES=$gpu_id
    fi
else
    # Any other argument count previously fell through with every setting
    # unset, creating a malformed output directory before the trainer failed.
    echo "Usage: $0 [MAC_CONSTRAINT LORA_R LORA_ALPHA PRUNING_SCHEDULER PRUNER_TYPE PARAM_ALLOCATION_STRATEGY [GPU_ID]]" >&2
    exit 1
fi

model_name=bert-base-uncased
task_name=mnli
adapter_type=lora
pruning_start=1
continuous_alloc_interval=1
pruning_batches=256
num_prunings=5
pruning_batch_size=4

learning_rate=2e-5
training_batch_size=32
num_train_epochs=30
pruning_stop=20
teacher_param_tuning_config=q:0-11,v:0-11,i:0-11


output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_nodistill_evensmallerlr/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
echo "$output_dir"
mkdir -p "$output_dir"

python run_minus_training.py \
    --output_dir "${output_dir}" \
    --task_name "${task_name}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy no \
    --evaluation_strategy epoch \
    --logging_strategy epoch \
    --minus_scheduler \
    --max_seq_length 128 \
    --num_train_epochs "${num_train_epochs}" \
    --per_device_train_batch_size "${training_batch_size}" \
    --per_device_eval_batch_size "${training_batch_size}" \
    --lr_scheduler_type linear \
    --warmup_ratio 0.06 \
    --learning_rate "${learning_rate}" \
    --weight_decay 0.1 \
    --apply_lora \
    --lora_alpha "${lora_alpha}" \
    --lora_r "${lora_r}" \
    --report_to none \
    --pruning_batches "${pruning_batches}" \
    --mac_constraint "${mac_constraint}" \
    --pruning_scheduler "${pruning_scheduler}" \
    --param_allocation_strategy "${param_allocation_strategy}" \
    --teacher_param_tuning_config "${teacher_param_tuning_config}" \
    --continuous_allocation \
    --continuous_alloc_interval "${continuous_alloc_interval}" \
    --pruning_start "${pruning_start}" \
    --pruning_stop "${pruning_stop}" \
    --head_scorer_type gradient_l2 \
    --intermediate_scorer_type gradient_l2 \
    --pruner_type "${pruner_type}" \
    --num_prunings "${num_prunings}" \
    --pruning_batch_size "${pruning_batch_size}" \
    --pruning_scheduler_strategy saliency
task_name=squad 28 | adapter_type=lora 29 | pruning_start=1 30 | continuous_alloc_interval=1 31 | pruning_batches=64 32 | num_prunings=5 33 | pruning_batch_size=4 34 | 35 | learning_rate=1e-4 36 | training_batch_size=48 37 | num_train_epochs=10 38 | pruning_stop=8 39 | teacher_param_tuning_config=q:0-11,v:0-11,i:0-11 40 | 41 | 42 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}" 43 | echo $output_dir 44 | mkdir -p $output_dir 45 | 46 | python run_minus_squad_training.py \ 47 | --output_dir ${output_dir}\ 48 | --model_name_or_path ${model_name} \ 49 | --do_train \ 50 | --do_eval \ 51 | --save_strategy no \ 52 | --evaluation_strategy steps \ 53 | --logging_strategy steps \ 54 | --eval_steps 1000 \ 55 | --logging_steps 1000 \ 56 | --minus_scheduler \ 57 | --max_seq_length 384 \ 58 | --doc_stride 128 \ 59 | --num_train_epochs ${num_train_epochs} \ 60 | --per_device_train_batch_size ${training_batch_size} \ 61 | --per_device_eval_batch_size ${training_batch_size} \ 62 | --lr_scheduler_type linear\ 63 | --warmup_ratio 0.06\ 64 | --learning_rate ${learning_rate}\ 65 | --weight_decay 0.1\ 66 | --apply_lora \ 67 | --lora_alpha ${lora_alpha} \ 68 | --lora_r ${lora_r} \ 69 | --report_to none \ 70 | --pruning_batches ${pruning_batches} \ 71 | --mac_constraint ${mac_constraint} \ 72 | --pruning_scheduler ${pruning_scheduler} \ 73 | --param_allocation_strategy ${param_allocation_strategy} \ 74 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 75 | --continuous_allocation \ 76 | --continuous_alloc_interval ${continuous_alloc_interval} \ 77 | --pruning_start ${pruning_start} \ 78 | --pruning_stop ${pruning_stop} \ 79 | --head_scorer_type gradient_l2 \ 80 | --intermediate_scorer_type gradient_l2 \ 81 | 
import os
os.environ["WANDB_DISABLED"] = "true"
import sys
import torch
from transformers import HfArgumentParser
from args import DataTrainingArguments
from models import build_model
from models.model_args import ModelArguments
from utils.utils import *
from args import MinusTrainingArguments
from utils import build_trainer

def main():
    """Report which parameters differ between a checkpoint and its ``best_model``.

    Loads the model stored at ``output/debug_output`` and the one stored in its
    ``best_model`` sub-directory, compares every named parameter with
    ``torch.allclose`` and prints how many tensors, and how many individual
    elements, differ between the two.

    The command line is hard-coded below, so arguments passed to the script
    are ignored.
    """
    sys.argv = ['neuron_importance.py',
            '--output_dir',
            './output/neuron_importance/',
            '--model_name_or_path',
            'output/debug_output',
            '--task_name',
            'mnli',
            '--do_eval',
            '--max_seq_length',
            '128',
            '--per_device_train_batch_size',
            '32',
            '--per_device_eval_batch_size',
            '32',
            '--lora_r',
            '64',
            '--apply_lora',
            '--report_to',
            'none',
            ]
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # Unreachable while sys.argv is overwritten above (it never has
        # exactly two entries); kept for parity with the other entry points.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)

    config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets)
    train_dataset, eval_dataset, _, is_regression = build_data(model_args, data_args, training_args, model, tokenizer, config, raw_datasets)
    # NOTE(review): the trainers are never used afterwards; they are still
    # built in case build_trainer has side effects on the model -- confirm.
    trainer = build_trainer(data_args, training_args, model, tokenizer, train_dataset, eval_dataset, param_controller=None)

    # Second load: same arguments, but pointing at the "best_model" sub-directory.
    model_args.model_name_or_path = os.path.join(model_args.model_name_or_path, 'best_model')
    config, tokenizer, best_model = build_model(model_args, data_args, training_args, t_name, raw_datasets)
    best_trainer = build_trainer(data_args, training_args, best_model, tokenizer, train_dataset, eval_dataset, param_controller=None)

    final_model_params = dict(model.named_parameters())
    best_model_params = dict(best_model.named_parameters())
    tuned_params = [
        k for k in final_model_params if not torch.allclose(final_model_params[k], best_model_params[k])
    ]
    sum_tuned_params = sum(torch.numel(final_model_params[k]) for k in tuned_params)
    # int() handles both the 0-d tensor produced by summing tensors and the
    # plain 0 returned when no parameter differs.
    sum_changed_params = int(sum((final_model_params[k] != best_model_params[k]).sum() for k in tuned_params))

    # These statistics were previously computed and then discarded.
    print("Tuned parameter tensors ({}): {}".format(len(tuned_params), tuned_params))
    print("Elements in tuned tensors: {}".format(sum_tuned_params))
    print("Elements whose value changed: {}".format(sum_changed_params))

if __name__ == '__main__':
    main()
lora_r=$2 11 | lora_alpha=$3 12 | pruning_scheduler=$4 13 | pruner_type=$5 14 | param_allocation_strategy=$6 15 | elif [ "$#" -eq 7 ]; then 16 | mac_constraint=$1 17 | lora_r=$2 18 | lora_alpha=$3 19 | pruning_scheduler=$4 20 | pruner_type=$5 21 | param_allocation_strategy=$6 22 | gpu_id=$7 23 | export CUDA_VISIBLE_DEVICES=$gpu_id 24 | fi 25 | 26 | model_name=roberta-base 27 | task_name=squad 28 | adapter_type=lora 29 | pruning_start=0.1 30 | continuous_alloc_interval=1 31 | pruning_batches=64 32 | num_prunings=5 33 | pruning_batch_size=4 34 | 35 | learning_rate=1e-4 36 | training_batch_size=48 37 | num_train_epochs=10 38 | pruning_stop=8 39 | teacher_param_tuning_config=q:0-11,v:0-11,i:0-11 40 | 41 | 42 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/prunestart${pruning_start}" 43 | echo $output_dir 44 | mkdir -p $output_dir 45 | 46 | python run_minus_squad_training.py \ 47 | --output_dir ${output_dir}\ 48 | --model_name_or_path ${model_name} \ 49 | --do_train \ 50 | --do_eval \ 51 | --save_strategy no \ 52 | --evaluation_strategy steps \ 53 | --logging_strategy steps \ 54 | --eval_steps 1000 \ 55 | --logging_steps 1000 \ 56 | --minus_scheduler \ 57 | --max_seq_length 384 \ 58 | --doc_stride 128 \ 59 | --num_train_epochs ${num_train_epochs} \ 60 | --per_device_train_batch_size ${training_batch_size} \ 61 | --per_device_eval_batch_size ${training_batch_size} \ 62 | --lr_scheduler_type linear\ 63 | --warmup_ratio 0.06\ 64 | --learning_rate ${learning_rate}\ 65 | --weight_decay 0.1\ 66 | --apply_lora \ 67 | --adapter_type ${adapter_type} \ 68 | --lora_alpha ${lora_alpha} \ 69 | --lora_r ${lora_r} \ 70 | --report_to none \ 71 | --pruning_batches ${pruning_batches} \ 72 | --mac_constraint ${mac_constraint} \ 73 | 
#!/bin/bash
#SBATCH -p gpu-rtx6k
#SBATCH -A h2lab
#SBATCH --nodes=1                # Number of nodes
#SBATCH --ntasks-per-node=1      # Number of tasks per node (1 in this case)
#SBATCH --cpus-per-task=8        # Number of CPU cores per task
#SBATCH --mem=64G                # Memory per node (total memory)
#SBATCH --gres=gpu:1             # Number of GPUs requested
#SBATCH --time=48:00:00          # Walltime (hh:mm:ss)

# Distillation training via run_minus_training.py with a fixed teacher checkpoint.
#
# Usage:
#   train_ft_distill.sh
#   train_ft_distill.sh MODEL TEACHER_PATH TASK
#   train_ft_distill.sh MODEL TEACHER_PATH TASK LEARNING_RATE BATCH_SIZE EPOCHS MAPPING_STRATEGY

# Without pipefail the pipeline's status is tee's, so a crashed training run
# would still make the job exit 0.
set -o pipefail

# Defaults shared by the 0- and 3-argument forms.
learning_rate=2e-5
training_batch_size=32
num_train_epochs=20
distill_mapping_strategy=static_teacher_static_student

case "$#" in
    0)
        model_name=roberta-base
        teacher_path=textattack/roberta-base-SST-2
        task_name=sst2
        ;;
    3)
        model_name=$1
        teacher_path=$2
        task_name=$3
        ;;
    7)
        model_name=$1
        teacher_path=$2
        task_name=$3
        learning_rate=$4
        training_batch_size=$5
        num_train_epochs=$6
        distill_mapping_strategy=$7
        ;;
    *)
        # Any other count previously fell through with model/teacher/task unset.
        echo "Usage: $0 [MODEL TEACHER_PATH TASK [LEARNING_RATE BATCH_SIZE EPOCHS MAPPING_STRATEGY]]" >&2
        exit 1
        ;;
esac

lora_r=8
lora_alpha=16
teacher_param_tuning_config=q:0-11,v:0-11
student_param_tuning_config=q:0-11,v:0-11

# A local checkpoint directory keeps its results next to the checkpoint;
# any other model name writes under output/.
if [ -d "$model_name" ]
then
    output_dir="${model_name}/finetune_distilled/epoch${num_train_epochs}/bz${training_batch_size}/lr${learning_rate}/${distill_mapping_strategy}"
else
    output_dir="output/${model_name}_minus_${task_name}_mapping_${distill_mapping_strategy}_distill_fixedteacher/epoch${num_train_epochs}/bz${training_batch_size}/"
fi


echo "$output_dir"
mkdir -p "$output_dir"

python run_minus_training.py \
    --output_dir "${output_dir}" \
    --task_name "${task_name}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy no \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 1000 \
    --log_level info \
    --log_level_replica info \
    --eval_steps 5000 \
    --max_seq_length 128 \
    --num_train_epochs "${num_train_epochs}" \
    --per_device_train_batch_size "${training_batch_size}" \
    --per_device_eval_batch_size "${training_batch_size}" \
    --distillation_type self_student \
    --distill_mapping_strategy "${distill_mapping_strategy}" \
    --warmup_ratio 0.06 \
    --learning_rate "${learning_rate}" \
    --weight_decay 0.1 \
    --lora_alpha "${lora_alpha}" \
    --lora_r "${lora_r}" \
    --report_to none \
    --teacher_param_tuning_config "${teacher_param_tuning_config}" \
    --student_param_tuning_config "${student_param_tuning_config}" \
    --do_distill \
    --distill_start 0 \
    --distill_epoch "${num_train_epochs}" \
    --teacher_path "${teacher_path}" | tee "${output_dir}/log.txt"
def main():
    """Prune a saved pre-pruning checkpoint with its stored masks and build a trainer.

    The command line is hard-coded below. The head and intermediate masks are
    loaded from the checkpoint directory, their last four rows are reset to 1
    (nothing is pruned there), and the model is structurally pruned with
    ``prune_model_with_z`` before a trainer is built on top of it.
    """
    sys.argv = ['test_pruned_teacher_training.py',
            '--output_dir',
            './output/test_pruned_teacher_training/',
            '--model_name_or_path',
            'output/roberta-base_lora_minus_mnli_once_global_co_learning_loratransform_distill/mac0.4/epoch25/bz128/numprune5/lora_r64/lora_alpha16/pre_pruning_model',
            '--task_name',
            'mnli',
            '--evaluation_strategy',
            'steps',
            '--save_strategy',
            'no',
            '--do_train',
            '--do_eval',
            '--max_seq_length',
            '128',
            '--per_device_train_batch_size',
            '32',
            '--per_device_eval_batch_size',
            '32',
            '--apply_lora',
            '--lora_r',
            '8',
            '--lora_alpha',
            '16',
            '--num_train_epochs',
            '30',
            '--learning_rate',
            '5e-4',
            '--warmup_ratio',
            '0.06',
            '--weight_decay',
            '0.1',
            ]
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # Unreachable while sys.argv is overwritten above; kept for parity
        # with the other entry points.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    os.makedirs(training_args.output_dir, exist_ok=True)
    t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)
    config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets)
    train_dataset, eval_dataset, _, is_regression = build_data(model_args, data_args, training_args, model, tokenizer, config, raw_datasets)
    training_args.disable_tqdm = True

    checkpoint_dir = model_args.model_name_or_path
    head_mask = torch.load(os.path.join(checkpoint_dir, 'head_mask.pt'))
    intermediate_mask = torch.load(os.path.join(checkpoint_dir, 'intermediate_mask.pt'))
    # Keep the last four rows of both masks fully unpruned.
    # NOTE(review): rows presumably index layers -- confirm against the mask layout.
    head_mask[-4:, :] = 1
    intermediate_mask[-4:, :] = 1
    zs = {
        'head_z': [v.to('cpu') for v in head_mask],
        'intermediate_z': [v.to('cpu') for v in intermediate_mask],
    }
    prune_model_with_z(zs, model)

    # The pruning is now structural, so the runtime masks are cleared.
    model.head_mask, model.intermediate_mask = None, None
    trainer = build_trainer(data_args, training_args, model, tokenizer, train_dataset, eval_dataset)
    # NOTE(review): the trainer is built but never run; given '--do_train' a
    # trainer.train() call may be intended here -- confirm before adding.


# The file previously defined main() without ever calling it, so running the
# script did nothing.
if __name__ == '__main__':
    main()
def main():
    """Merge every LoRA layer of a checkpoint into a plain linear layer and save it.

    Arguments come from the command line, or from a single ``.json`` file when
    that is the only argument. The head, intermediate and hidden masks stored
    in the checkpoint directory are attached to the model, the model is
    evaluated, each ``lora.Linear`` child module is replaced by the result of
    ``lora_to_linear``, the model is evaluated again, and the merged model is
    written through ``trainer.save_model()``.
    """
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    os.makedirs(training_args.output_dir, exist_ok=True)
    t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)
    config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets, force_model_shape_deduction=True)
    train_dataset, eval_dataset, _, is_regression = build_data(model_args, data_args, training_args, model, tokenizer, config, raw_datasets)

    checkpoint_dir = model_args.model_name_or_path
    model.head_mask = torch.load(os.path.join(checkpoint_dir, 'head_mask.pt')).to(training_args.device)
    model.intermediate_mask = torch.load(os.path.join(checkpoint_dir, 'intermediate_mask.pt')).to(training_args.device)
    model.hidden_mask = torch.load(os.path.join(checkpoint_dir, 'hidden_mask.pt')).to(training_args.device)

    trainer = build_trainer(data_args, training_args, model, tokenizer, train_dataset, eval_dataset, param_controller=None)

    # Force every LoRA layer to the alpha / r scaling given on the command
    # line before evaluating and merging.
    fixed_scaling = True
    if fixed_scaling:
        for m in model.modules():
            if isinstance(m, lora.Linear):
                m.scaling = model_args.lora_alpha / model_args.lora_r

    model_param_num = sum(p.numel() for p in model.parameters())
    print("Unmerged model's performance: ", trainer.evaluate())

    # Both iterations run over dict snapshots because children are replaced
    # while the module tree is being walked.
    for n, m in dict(model.named_modules()).items():
        for child_name, child in dict(m.named_children()).items():
            if isinstance(child, lora.Linear):
                print("Merging layer {}".format(n + '.' + child_name))
                delattr(m, child_name)
                merged_layer = lora_to_linear(child)
                setattr(m, child_name, merged_layer)

    model_param_num_merged = sum(p.numel() for p in model.parameters())
    print("Merged model's performance: ", trainer.evaluate())
    print("Parameter number reduced from {} to {}, with {} parameters removed".format(model_param_num, model_param_num_merged, model_param_num - model_param_num_merged))

    trainer.save_model()

if __name__ == '__main__':
    main()
async def dispatch_openai_prompt_requesets(
    prompt_list: List[str],
    model: str,
    **completion_kwargs: Any,
) -> List[str]:
    """Send one text-completion request per prompt and await them all together.

    Args:
        prompt_list: Prompts to submit, one request each.
        model: Name of the OpenAI model to query.
        completion_kwargs: Extra keyword arguments forwarded unchanged to
            ``openai.Completion.acreate``. See
            https://platform.openai.com/docs/api-reference/completions for details.
    Returns:
        The awaited responses, in the same order as ``prompt_list``.

    NOTE(review): the annotation says ``List[str]``, but the demo code in this
    file indexes each element as ``x['choices'][0]['text']``, so the elements
    look like response mappings rather than strings -- confirm.
    """
    pending = []
    for prompt in prompt_list:
        # acreate returns an awaitable; nothing is awaited until gather below.
        pending.append(
            openai.Completion.acreate(
                model=model,
                prompt=prompt,
                **completion_kwargs,
            )
        )
    responses = await asyncio.gather(*pending)
    return responses
#!/bin/bash
#SBATCH -p gpu-rtx6k
#SBATCH -A h2lab
#SBATCH --nodes=1                # Number of nodes
#SBATCH --ntasks-per-node=1      # Number of tasks per node (1 in this case)
#SBATCH --cpus-per-task=8        # Number of CPU cores per task
#SBATCH --mem=64G                # Memory per node (total memory)
#SBATCH --gres=gpu:1             # Number of GPUs requested
#SBATCH --time=48:00:00          # Walltime (hh:mm:ss)

# LoRA distillation training on SQuAD v2 (--version_2_with_negative) via
# run_minus_squad_training.py with a fixed teacher checkpoint.
#
# Usage:
#   train_lora_distill_squadv2.sh
#   train_lora_distill_squadv2.sh MODEL TEACHER_PATH
#   train_lora_distill_squadv2.sh MODEL TEACHER_PATH LORA_R LORA_ALPHA LEARNING_RATE \
#                                 BATCH_SIZE EPOCHS MAPPING_STRATEGY PARAM_CONFIG

# Without pipefail the pipeline's status is tee's, so a crashed training run
# would still make the job exit 0.
set -o pipefail

# Defaults shared by the 0- and 2-argument forms.
lora_r=8
lora_alpha=16
learning_rate=2e-4
training_batch_size=32
num_train_epochs=20
distill_mapping_strategy=static_teacher_static_student
para_config=q:0-11,v:0-11

case "$#" in
    0)
        model_name=roberta-base
        # NOTE(review): the default teacher is an SST-2 classifier although this
        # script trains on SQuAD v2 -- looks like a copy-paste leftover; confirm.
        teacher_path=textattack/roberta-base-SST-2
        ;;
    2)
        model_name=$1
        teacher_path=$2
        ;;
    9)
        model_name=$1
        teacher_path=$2
        lora_r=$3
        lora_alpha=$4
        learning_rate=$5
        training_batch_size=$6
        num_train_epochs=$7
        distill_mapping_strategy=$8
        para_config=$9
        ;;
    *)
        # Any other count previously fell through with model/teacher unset.
        echo "Usage: $0 [MODEL TEACHER_PATH [LORA_R LORA_ALPHA LEARNING_RATE BATCH_SIZE EPOCHS MAPPING_STRATEGY PARAM_CONFIG]]" >&2
        exit 1
        ;;
esac

adapter_type=lora
# task_name was never assigned, leaving an empty segment ("_minus__mapping_")
# in the output path. It is only used to label that path; the trainer is not
# given a --task_name.
task_name=squadv2
teacher_param_tuning_config=${para_config}
student_param_tuning_config=${para_config}

# A local checkpoint directory keeps its results next to the checkpoint;
# any other model name writes under output/.
if [ -d "$model_name" ]
then
    output_dir="${model_name}/lora_distilled/epoch${num_train_epochs}/bz${training_batch_size}/lr${learning_rate}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}/${distill_mapping_strategy}"
else
    output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_mapping_${distill_mapping_strategy}_distill_fixedteacher/epoch${num_train_epochs}/bz${training_batch_size}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
fi


echo "$output_dir"
mkdir -p "$output_dir"

python run_minus_squad_training.py \
    --output_dir "${output_dir}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy no \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 1000 \
    --log_level info \
    --log_level_replica info \
    --eval_steps 5000 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --version_2_with_negative \
    --num_train_epochs "${num_train_epochs}" \
    --per_device_train_batch_size "${training_batch_size}" \
    --per_device_eval_batch_size "${training_batch_size}" \
    --distillation_type self_student \
    --distill_mapping_strategy "${distill_mapping_strategy}" \
    --warmup_ratio 0.06 \
    --learning_rate "${learning_rate}" \
    --weight_decay 0.1 \
    --apply_lora \
    --lora_alpha "${lora_alpha}" \
    --lora_r "${lora_r}" \
    --report_to none \
    --teacher_param_tuning_config "${teacher_param_tuning_config}" \
    --student_param_tuning_config "${student_param_tuning_config}" \
    --do_distill \
    --distill_start 0 \
    --distill_epoch "${num_train_epochs}" \
    --teacher_path "${teacher_path}" | tee "${output_dir}/log.txt"
#SBATCH --gres=gpu:1 # Number of GPUs requested
#SBATCH --time=48:00:00 # Walltime (hh:mm:ss)

# Runs run_minus_training.py with LoRA enabled and distillation from the
# checkpoint given as TEACHER; stdout is mirrored to <output_dir>/log.txt.
#
# Usage:
#   train_lora_distill.sh
#   train_lora_distill.sh MODEL TEACHER TASK
#   train_lora_distill.sh MODEL TEACHER TASK LORA_R LORA_ALPHA LR BATCH_SIZE \
#                         EPOCHS MAPPING_STRATEGY PARAM_CONFIG

# Without pipefail the status of the `python ... | tee` pipeline below is
# tee's, so a crashed training run would still be reported as a success.
set -o pipefail

# Defaults; they apply to the 0-argument form and to every value the
# 3-argument form does not override.
model_name=roberta-base
teacher_path=textattack/roberta-base-SST-2
task_name=sst2
lora_r=8
lora_alpha=16
learning_rate=2e-4
training_batch_size=32
num_train_epochs=20
distill_mapping_strategy=static_teacher_static_student
para_config=q:0-11,v:0-11

case "$#" in
    0)
        ;;
    3)
        model_name=$1
        teacher_path=$2
        task_name=$3
        ;;
    10)
        model_name=$1
        teacher_path=$2
        task_name=$3
        lora_r=$4
        lora_alpha=$5
        learning_rate=$6
        training_batch_size=$7
        num_train_epochs=$8
        distill_mapping_strategy=$9
        para_config=${10}
        ;;
    *)
        # Previously any other count fell through with every variable unset.
        echo "usage: $0 [MODEL TEACHER TASK [LORA_R LORA_ALPHA LR BATCH_SIZE EPOCHS MAPPING_STRATEGY PARAM_CONFIG]]" >&2
        echo "expected 0, 3 or 10 arguments, got $#" >&2
        exit 1
        ;;
esac

adapter_type=lora
# Teacher and student share the same tuning configuration.
teacher_param_tuning_config=${para_config}
student_param_tuning_config=${para_config}

# When MODEL is a local directory the run is stored underneath it; otherwise
# it goes to a fresh directory under output/.
if [ -d "$model_name" ]
then
    output_dir="${model_name}/lora_distilled/epoch${num_train_epochs}/bz${training_batch_size}/lr${learning_rate}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}/${distill_mapping_strategy}"
else
    output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_mapping_${distill_mapping_strategy}_distill_fixedteacher/epoch${num_train_epochs}/bz${training_batch_size}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
fi


echo "$output_dir"
mkdir -p "$output_dir"

python run_minus_training.py \
    --output_dir "${output_dir}" \
    --task_name "${task_name}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy no \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 1000 \
    --log_level info \
    --log_level_replica info \
    --eval_steps 5000 \
    --max_seq_length 128 \
    --num_train_epochs "${num_train_epochs}" \
    --per_device_train_batch_size "${training_batch_size}" \
    --per_device_eval_batch_size "${training_batch_size}" \
    --tf32 True \
    --distillation_type self_student \
    --distill_mapping_strategy "${distill_mapping_strategy}" \
    --warmup_ratio 0.06 \
    --learning_rate "${learning_rate}" \
    --weight_decay 0.1 \
    --apply_lora \
    --lora_alpha "${lora_alpha}" \
    --lora_r "${lora_r}" \
    --report_to none \
    --teacher_param_tuning_config "${teacher_param_tuning_config}" \
    --student_param_tuning_config "${student_param_tuning_config}" \
    --do_distill \
    --distill_start 0 \
    --distill_epoch "${num_train_epochs}" \
    --teacher_path "${teacher_path}" | tee "${output_dir}/log.txt"

--------------------------------------------------------------------------------
/scripts/eval/cnndm.sh:
--------------------------------------------------------------------------------
# Evaluation-only launcher for the cnndm task.
# Usage: cnndm.sh MODEL_DIR
# Results are written to MODEL_DIR/eval, so MODEL_DIR is mandatory: an empty
# value would make the output directory "/eval".
model_name=${1:?usage: cnndm.sh MODEL_DIR}

mac_constraint=0.4
lora_r=8
pruning_scheduler=cubic_gradual
param_allocation_strategy=running_fisher
distillation_type=self_momentum
distill_mapping_strategy=dynamic_block_teacher_dynamic_student


task_name=cnndm
adapter_type=lora
param_resizing_strategy=tophalf_limited
pruning_start=-1
pruning_stop=3
distill_start=-1 # about 60%, between 3.4 and 3.8, but after 3.6 where the teacher is updated
distill_epoch=5
pruning_batches=64
num_prunings=10
pruning_batch_size=4
# pre_pruning_tuning_epochs=1
pre_pruning_tuning_steps=200
sparsity_warmup_epochs=1

learning_rate=1e-3
training_batch_size=16
num_train_epochs=10
warmup_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11 31 | student_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11 32 | 33 | 34 | output_dir="${model_name}/eval" 35 | echo $output_dir 36 | mkdir -p $output_dir 37 | 38 | python run_minus_seq2seq_training.py \ 39 | --output_dir ${output_dir}\ 40 | --task_name ${task_name} \ 41 | --model_name_or_path ${model_name} \ 42 | --do_eval \ 43 | --save_strategy no \ 44 | --evaluation_strategy steps \ 45 | --logging_strategy steps \ 46 | --eval_steps 5000 \ 47 | --logging_steps 1000 \ 48 | --log_level info \ 49 | --log_level_replica info \ 50 | --minus_scheduler \ 51 | --max_input_length 512 \ 52 | --max_target_length 128 \ 53 | --num_train_epochs ${num_train_epochs} \ 54 | --per_device_train_batch_size ${training_batch_size} \ 55 | --per_device_eval_batch_size ${training_batch_size} \ 56 | --tf32 True \ 57 | --lr_scheduler_type linear\ 58 | --distillation_type ${distillation_type} \ 59 | --distill_mapping_strategy ${distill_mapping_strategy} \ 60 | --warmup_ratio 0.06\ 61 | --learning_rate ${learning_rate}\ 62 | --weight_decay 0.1\ 63 | --seed 128 \ 64 | --apply_lora \ 65 | --lora_alpha 16 \ 66 | --lora_r ${lora_r} \ 67 | --report_to none \ 68 | --pruning_batches ${pruning_batches} \ 69 | --pruning_batch_size ${pruning_batch_size} \ 70 | --mac_constraint ${mac_constraint} \ 71 | --pruning_scheduler ${pruning_scheduler} \ 72 | --sparsity_warmup_epochs ${sparsity_warmup_epochs} \ 73 | --param_allocation_strategy ${param_allocation_strategy} \ 74 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 75 | --student_param_tuning_config ${student_param_tuning_config} \ 76 | --head_scorer_type gradient_l2 \ 77 | --intermediate_scorer_type gradient_l2 \ 78 | --pruner_type none \ 79 | --do_distill \ 80 | --do_virtual_prune \ 81 | --distill_start ${distill_start} \ 82 | --distill_epoch ${distill_epoch} \ 83 | --pruning_start ${pruning_start} \ 84 | 
--pruning_stop ${pruning_stop} \ 85 | --num_prunings ${num_prunings} \ 86 | --pruning_scheduler_strategy saliency \ 87 | --collect_salience \ 88 | --salience_collecting_start 200 \ 89 | --salience_collecting_end -1 \ 90 | --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \ 91 | --mask_lr 0.01 \ 92 | --grafting_top_k -1 \ 93 | --param_resizing_strategy ${param_resizing_strategy} \ 94 | --tuning_expanding_ratio 4.0 \ 95 | --max_lora_r $(($lora_r * 8)) \ 96 | | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /scripts/eval/xsum.sh: -------------------------------------------------------------------------------- 1 | model_name=$1 2 | 3 | mac_constraint=0.4 4 | lora_r=8 5 | pruning_start=-1 6 | pruning_scheduler=cubic_gradual 7 | param_allocation_strategy=running_fisher 8 | distillation_type=self_momentum 9 | distill_mapping_strategy=dynamic_block_teacher_dynamic_student 10 | 11 | 12 | task_name=xsum 13 | adapter_type=lora 14 | param_resizing_strategy=tophalf_limited 15 | pruning_start=-1 16 | pruning_stop=3 17 | distill_start=-1 # about 60%, between 3.4 and 3.8, but after 3.6 where the teacher is updated 18 | distill_epoch=5 19 | pruning_batches=64 20 | num_prunings=10 21 | pruning_batch_size=4 22 | # pre_pruning_tuning_epochs=1 23 | pre_pruning_tuning_steps=200 24 | sparsity_warmup_epochs=1 25 | 26 | learning_rate=1e-3 27 | training_batch_size=16 28 | num_train_epochs=10 29 | warmup_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11 30 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11 31 | student_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11 32 | 33 | 34 | output_dir="${model_name}/eval" 35 | echo $output_dir 36 | mkdir -p $output_dir 37 | 38 | python run_minus_seq2seq_training.py \ 39 | --output_dir ${output_dir}\ 40 | --task_name ${task_name} \ 41 | --model_name_or_path ${model_name} 
\ 42 | --do_eval \ 43 | --save_strategy no \ 44 | --evaluation_strategy steps \ 45 | --logging_strategy steps \ 46 | --eval_steps 5000 \ 47 | --logging_steps 1000 \ 48 | --log_level info \ 49 | --log_level_replica info \ 50 | --minus_scheduler \ 51 | --max_input_length 512 \ 52 | --max_target_length 128 \ 53 | --num_train_epochs ${num_train_epochs} \ 54 | --per_device_train_batch_size ${training_batch_size} \ 55 | --per_device_eval_batch_size ${training_batch_size} \ 56 | --tf32 True \ 57 | --lr_scheduler_type linear\ 58 | --distillation_type ${distillation_type} \ 59 | --distill_mapping_strategy ${distill_mapping_strategy} \ 60 | --warmup_ratio 0.06\ 61 | --learning_rate ${learning_rate}\ 62 | --weight_decay 0.1\ 63 | --seed 128 \ 64 | --apply_lora \ 65 | --lora_alpha 16 \ 66 | --lora_r ${lora_r} \ 67 | --report_to none \ 68 | --pruning_batches ${pruning_batches} \ 69 | --pruning_batch_size ${pruning_batch_size} \ 70 | --mac_constraint ${mac_constraint} \ 71 | --pruning_scheduler ${pruning_scheduler} \ 72 | --sparsity_warmup_epochs ${sparsity_warmup_epochs} \ 73 | --param_allocation_strategy ${param_allocation_strategy} \ 74 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 75 | --student_param_tuning_config ${student_param_tuning_config} \ 76 | --head_scorer_type gradient_l2 \ 77 | --intermediate_scorer_type gradient_l2 \ 78 | --pruner_type none \ 79 | --do_distill \ 80 | --do_virtual_prune \ 81 | --distill_start ${distill_start} \ 82 | --distill_epoch ${distill_epoch} \ 83 | --pruning_start ${pruning_start} \ 84 | --pruning_stop ${pruning_stop} \ 85 | --num_prunings ${num_prunings} \ 86 | --pruning_scheduler_strategy saliency \ 87 | --collect_salience \ 88 | --salience_collecting_start 200 \ 89 | --salience_collecting_end -1 \ 90 | --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \ 91 | --mask_lr 0.01 \ 92 | --grafting_top_k -1 \ 93 | --param_resizing_strategy ${param_resizing_strategy} \ 94 | --tuning_expanding_ratio 4.0 \ 95 | --max_lora_r 
$(($lora_r * 8)) \ 96 | | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["WANDB_DISABLED"] = "true" 3 | import sys 4 | import torch 5 | import json 6 | from transformers import HfArgumentParser 7 | from deepspeed.profiling.flops_profiler import get_model_profile 8 | from args import DataTrainingArguments 9 | from models import build_model 10 | from utils import build_dataloader, build_trainer 11 | from models.model_args import ModelArguments 12 | from utils.utils import * 13 | from utils.minus_utils import efficiency_testing, input_constructor, compare_parameters 14 | from utils.analysis_utils import gen_run_report 15 | from args import MinusTrainingArguments 16 | from loralib.layers import LoRALayer 17 | 18 | def main(): 19 | parser = HfArgumentParser( 20 | (ModelArguments, DataTrainingArguments, MinusTrainingArguments)) 21 | 22 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 23 | # If we pass only one argument to the script and it's the path to a json file, 24 | # let's parse it to get our arguments. 
25 | model_args, data_args, training_args = parser.parse_json_file( 26 | json_file=os.path.abspath(sys.argv[1])) 27 | else: 28 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 29 | 30 | t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args) 31 | config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets) 32 | MODEL_GENERATIVE = any(['decoder' in n for n, _ in model.named_parameters()]) 33 | train_dataset, eval_dataset, predict_dataset, is_regression = build_data(model_args, data_args, training_args, model, tokenizer, config, raw_datasets, generative=MODEL_GENERATIVE) 34 | 35 | model = model.to(training_args.device) 36 | model.eval() 37 | for p in model.parameters(): 38 | p.requires_grad = False 39 | for m in model.modules(): 40 | if isinstance(m, LoRALayer): 41 | m.eval() 42 | 43 | model.eval() 44 | trainer = build_trainer(data_args, training_args, model, tokenizer, train_dataset, eval_dataset, param_controller=None) 45 | model.clear_masks() 46 | efficiency_results = efficiency_testing(model, tokenizer, training_args.device) 47 | 48 | flops, macs, params = get_model_profile( 49 | model, 50 | kwargs={k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer, output_seq_len=2).items()} if MODEL_GENERATIVE else {k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer).items()}, print_profile=True, 51 | detailed=True, 52 | output_file=os.path.join(training_args.output_dir, 'deepspeed_profile.txt'), 53 | ) 54 | efficiency_results['model_flops'] = flops 55 | efficiency_results['model_macs'] = macs 56 | json.dump(efficiency_results, open(os.path.join(training_args.output_dir, 'efficiency_results.json'), 'w'), indent=4, sort_keys=True) 57 | # run_report = gen_run_report(training_args.output_dir) 58 | # run_report['train_runtime_per_epoch'] 
= run_report['train_runtime'] / training_args.num_train_epochs 59 | # json.dump(run_report, open(os.path.join(training_args.output_dir, 'run_report.json'), 'w'), indent=4, sort_keys=True) 60 | 61 | result = trainer.evaluate() 62 | json.dump(result, open(os.path.join(training_args.output_dir, 'eval_results.json'), 'w'), indent=4, sort_keys=True) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() -------------------------------------------------------------------------------- /scripts/train_lora_distill_seq2seq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p gpu-rtx6k 3 | #SBATCH -A h2lab 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) 6 | #SBATCH --cpus-per-task=8 # Number of CPU cores per task 7 | #SBATCH --mem=64G # Memory per node (total memory) 8 | #SBATCH --gres=gpu:1 # Number of GPUs requested 9 | #SBATCH --time=48:00:00 # Walltime (hh:mm:ss) 10 | 11 | if [ "$#" -eq 0 ]; then 12 | model_name=roberta-base 13 | teacher_path=textattack/roberta-base-SST-2 14 | task_name=sst2 15 | lora_r=8 16 | lora_alpha=16 17 | learning_rate=2e-4 18 | training_batch_size=32 19 | num_train_epochs=20 20 | distill_mapping_strategy=static_teacher_static_student 21 | para_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11 22 | elif [ "$#" -eq 3 ]; then 23 | model_name=$1 24 | teacher_path=$2 25 | task_name=$3 26 | lora_r=8 27 | lora_alpha=16 28 | learning_rate=2e-4 29 | training_batch_size=32 30 | num_train_epochs=20 31 | para_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11 32 | distill_mapping_strategy=static_teacher_static_student 33 | elif [ "$#" -eq 10 ]; then 34 | model_name=$1 35 | teacher_path=$2 36 | task_name=$3 37 | lora_r=$4 38 | lora_alpha=$5 39 | learning_rate=$6 40 | training_batch_size=$7 41 | num_train_epochs=$8 42 | distill_mapping_strategy=$9 43 | para_config=${10} 44 | fi 45 | 46 | adapter_type=lora 47 | 
# Teacher and student share the same tuning configuration.
teacher_param_tuning_config=${para_config}
student_param_tuning_config=${para_config}

# When the model is a local directory the run is stored underneath it;
# otherwise it goes to a fresh directory under output/.
# The quotes matter: with an empty model_name, unquoted `[ -d ]` is true.
if [ -d "$model_name" ]
then
    output_dir="${model_name}/lora_distilled/epoch${num_train_epochs}/bz${training_batch_size}/lr${learning_rate}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}/${distill_mapping_strategy}"
else
    output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_mapping_${distill_mapping_strategy}_distill_fixedteacher/epoch${num_train_epochs}/bz${training_batch_size}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
fi


echo "$output_dir"
mkdir -p "$output_dir"

# Without pipefail the pipeline's status is tee's, so a crashed training run
# would still be reported as a success. (The script declares #!/bin/bash.)
set -o pipefail

# --task_name used to be passed twice with the same value; once is enough.
python run_minus_seq2seq_training.py \
    --output_dir "${output_dir}" \
    --task_name "${task_name}" \
    --model_name_or_path "${model_name}" \
    --do_train \
    --do_eval \
    --save_strategy no \
    --log_level info \
    --log_level_replica info \
    --evaluation_strategy steps \
    --logging_strategy steps \
    --logging_steps 1000 \
    --eval_steps 5000 \
    --max_input_length 512 \
    --max_target_length 128 \
    --num_train_epochs "${num_train_epochs}" \
    --per_device_train_batch_size "${training_batch_size}" \
    --per_device_eval_batch_size "${training_batch_size}" \
    --tf32 True \
    --distillation_type self_student \
    --distill_mapping_strategy "${distill_mapping_strategy}" \
    --warmup_ratio 0.06 \
    --learning_rate "${learning_rate}" \
    --weight_decay 0.1 \
    --apply_lora \
    --lora_alpha "${lora_alpha}" \
    --lora_r "${lora_r}" \
    --report_to none \
    --teacher_param_tuning_config "${teacher_param_tuning_config}" \
    --student_param_tuning_config "${student_param_tuning_config}" \
    --do_distill \
    --distill_start 0 \
    --distill_epoch "${num_train_epochs}" \
    --teacher_path "${teacher_path}" | tee "${output_dir}/log.txt"
-------------------------------------------------------------------------------- /scripts/adaptpruning_nodistill/bert_base_sst2.sh: -------------------------------------------------------------------------------- 1 | if [ "$#" -eq 0 ]; then 2 | mac_constraint=0.4 3 | lora_r=8 4 | lora_alpha=16 5 | pruning_scheduler=cubic_gradual 6 | pruner_type=running_fisher 7 | param_allocation_strategy=running_fisher 8 | elif [ "$#" -eq 6 ]; then 9 | mac_constraint=$1 10 | lora_r=$2 11 | lora_alpha=$3 12 | pruning_scheduler=$4 13 | pruner_type=$5 14 | param_allocation_strategy=$6 15 | elif [ "$#" -eq 7 ]; then 16 | mac_constraint=$1 17 | lora_r=$2 18 | lora_alpha=$3 19 | pruning_scheduler=$4 20 | pruner_type=$5 21 | param_allocation_strategy=$6 22 | gpu_id=$7 23 | export CUDA_VISIBLE_DEVICES=$gpu_id 24 | fi 25 | 26 | model_name=bert-base-uncased 27 | task_name=sst2 28 | adapter_type=lora 29 | param_resizing_strategy=tophalf_limited 30 | pruning_start=-1 31 | pruning_stop=3 32 | num_prunings=10 33 | pruning_batches=256 34 | pruning_batch_size=4 35 | 36 | learning_rate=2e-4 37 | training_batch_size=32 38 | num_train_epochs=30 39 | warmup_param_tuning_config=q:0-11,v:0-11 40 | teacher_param_tuning_config=q:0-11,v:0-11,i:0-11 41 | pre_pruning_tuning_epochs=0.5 42 | pre_pruning_layer_warmup_epochs=1.75 43 | suffix='_noffnstart' 44 | 45 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_${param_resizing_strategy}_resizing_nodistill${suffix}/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}" 46 | echo $output_dir 47 | mkdir -p $output_dir 48 | 49 | python run_minus_training.py \ 50 | --output_dir ${output_dir}\ 51 | --task_name ${task_name} \ 52 | --model_name_or_path ${model_name} \ 53 | --do_train \ 54 | --do_eval \ 55 | --save_strategy no \ 56 | --evaluation_strategy steps \ 57 | 
--logging_strategy steps \ 58 | --logging_steps 1000 \ 59 | --log_level info \ 60 | --log_level_replica info \ 61 | --eval_steps 5000 \ 62 | --max_seq_length 128 \ 63 | --num_train_epochs ${num_train_epochs} \ 64 | --per_device_train_batch_size ${training_batch_size} \ 65 | --per_device_eval_batch_size ${training_batch_size} \ 66 | --lr_scheduler_type linear\ 67 | --warmup_ratio 0.06\ 68 | --learning_rate ${learning_rate}\ 69 | --weight_decay 0.1\ 70 | --apply_lora \ 71 | --lora_alpha ${lora_alpha} \ 72 | --lora_r ${lora_r} \ 73 | --report_to none \ 74 | --pruning_batches ${pruning_batches} \ 75 | --mac_constraint ${mac_constraint} \ 76 | --pruning_scheduler ${pruning_scheduler} \ 77 | --param_allocation_strategy ${param_allocation_strategy} \ 78 | --warmup_param_tuning_config ${warmup_param_tuning_config} \ 79 | --teacher_param_tuning_config ${teacher_param_tuning_config} \ 80 | --pruning_start ${pruning_start} \ 81 | --pruning_stop ${pruning_stop} \ 82 | --pre_pruning_layer_warmup_epochs ${pre_pruning_layer_warmup_epochs} \ 83 | --head_scorer_type gradient_l2 \ 84 | --intermediate_scorer_type gradient_l2 \ 85 | --pruner_type ${pruner_type} \ 86 | --num_prunings ${num_prunings} \ 87 | --pruning_batch_size ${pruning_batch_size} \ 88 | --pruning_scheduler_strategy saliency \ 89 | --collect_salience \ 90 | --salience_collecting_start 200 \ 91 | --salience_collecting_end -1 \ 92 | --pre_pruning_tuning_epochs ${pre_pruning_tuning_epochs} \ 93 | --mask_lr 0.01 \ 94 | --grafting_top_k -1 \ 95 | --param_resizing_strategy ${param_resizing_strategy} \ 96 | | tee ${output_dir}/log.txt -------------------------------------------------------------------------------- /test/test_param_controller.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["WANDB_DISABLED"] = "true" 3 | import sys 4 | import torch 5 | from transformers import HfArgumentParser, default_data_collator, DataCollatorWithPadding 6 | from args import 
DataTrainingArguments 7 | from models import build_model 8 | from models.model_args import ModelArguments 9 | from utils.utils import * 10 | from args import MinusTrainingArguments 11 | from torch.utils.data import DataLoader, Subset 12 | from trainer.param_control import ParamController 13 | from utils.minus_utils import count_params 14 | 15 | def main(): 16 | sys.argv = ['neuron_importance.py', 17 | '--output_dir', 18 | './output/neuron_importance/', 19 | '--model_name_or_path', 20 | 'output/roberta-base_lora_minus_mnli_once_fisher_distill_full/step1.0/batchuse64/mac0.6', 21 | '--task_name', 22 | 'mnli', 23 | '--do_eval', 24 | '--max_seq_length', 25 | '128', 26 | '--per_device_train_batch_size', 27 | '32', 28 | '--per_device_eval_batch_size', 29 | '32', 30 | '--apply_lora', 31 | '--do_distill' 32 | ] 33 | parser = HfArgumentParser( 34 | (ModelArguments, DataTrainingArguments, MinusTrainingArguments)) 35 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 36 | # If we pass only one argument to the script and it's the path to a json file, 37 | # let's parse it to get our arguments. 38 | model_args, data_args, training_args = parser.parse_json_file( 39 | json_file=os.path.abspath(sys.argv[1])) 40 | else: 41 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 42 | os.makedirs(training_args.output_dir, exist_ok=True) 43 | # training_args.disable_tqdm = False 44 | t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args) 45 | config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets) 46 | _, eval_dataset, _, is_regression = build_data(model_args, data_args, training_args, model, tokenizer, config, raw_datasets) 47 | # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if 48 | # we already did the padding. 
49 | if data_args.pad_to_max_length: 50 | data_collator = default_data_collator 51 | elif training_args.fp16: 52 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 53 | else: 54 | data_collator = None 55 | dataloader = DataLoader( 56 | Subset(eval_dataset, torch.randperm(len(eval_dataset)).tolist()[:training_args.per_device_eval_batch_size * 64]), 57 | batch_size=training_args.per_device_eval_batch_size, 58 | collate_fn=data_collator, 59 | ) 60 | inputs = next(iter(dataloader)) 61 | 62 | teacher_config = { 63 | 'key': [9,10,11], 64 | 'query': [9, 10, 11], 65 | 'value': [9, 10, 11], 66 | } 67 | student_config = { 68 | 'intermediate': [9,10,11], 69 | } 70 | controller = ParamController(model, teacher_config, student_config) 71 | results = {} 72 | results['original'] = count_params(model, mode='tuned') 73 | controller.freeze() 74 | results['freeze'] = count_params(model, mode='tuned') 75 | controller.model_as_teacher() 76 | results['teacher'] = count_params(model, mode='tuned') 77 | controller.model_as_student() 78 | results['student'] = count_params(model, mode='tuned') 79 | 80 | if __name__ == '__main__': 81 | main() -------------------------------------------------------------------------------- /test/test_deepspeed_profiler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import sys 4 | os.environ["WANDB_DISABLED"] = "true" 5 | from deepspeed.profiling.flops_profiler import get_model_profile, get_module_duration 6 | from transformers import HfArgumentParser 7 | from args import DataTrainingArguments, MinusTrainingArguments 8 | from models import build_model 9 | from models.model_args import ModelArguments 10 | from utils.utils import * 11 | from trainer.model_arch import get_layers 12 | from utils.cofi_utils import update_params, prune_model_with_z 13 | from utils.minus_utils import input_constructor 14 | 15 | def main(): 16 | sys.argv = ['neuron_importance.py', 17 | 
'--output_dir', 18 | './output/neuron_importance/', 19 | '--model_name_or_path', 20 | 'roberta-base', 21 | '--task_name', 22 | 'mnli', 23 | '--do_train', 24 | '--do_eval', 25 | '--max_seq_length', 26 | '128', 27 | '--per_device_train_batch_size', 28 | '128', 29 | '--per_device_eval_batch_size', 30 | '128', 31 | '--apply_lora', 32 | '--do_distill', 33 | '--lora_r', 34 | '8' 35 | ] 36 | parser = HfArgumentParser( 37 | (ModelArguments, DataTrainingArguments, MinusTrainingArguments)) 38 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 39 | # If we pass only one argument to the script and it's the path to a json file, 40 | # let's parse it to get our arguments. 41 | model_args, data_args, training_args = parser.parse_json_file( 42 | json_file=os.path.abspath(sys.argv[1])) 43 | else: 44 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 45 | os.makedirs(training_args.output_dir, exist_ok=True) 46 | # training_args.disable_tqdm = False 47 | t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args) 48 | config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets) 49 | if model.head_mask is not None: 50 | mask_prefix = 'final_' if os.path.exists(os.path.join(model_args.model_name_or_path, 'final_head_mask.pt')) else '' 51 | zs = { 52 | 'head_z': torch.load(os.path.join(model_args.model_name_or_path, mask_prefix + 'head_mask.pt'), map_location='cpu'), 53 | 'intermediate_z': torch.load(os.path.join(model_args.model_name_or_path, mask_prefix + 'intermediate_mask.pt'), map_location='cpu'), 54 | } 55 | update_params(model, zs) 56 | prune_model_with_z(zs, model) 57 | model.head_mask, model.intermediate_mask = None, None 58 | model.eval() 59 | for i in range(model.config.num_hidden_layers): 60 | module = get_layers(model)[i].intermediate.dense 61 | module.eval() 62 | module.weight.data += (module.lora_B @ module.lora_A)* module.scaling 63 | module.merged=True 64 | 65 | with 
torch.cuda.device(0): 66 | model=model.cuda() 67 | batch_size = training_args.per_device_eval_batch_size 68 | seq_len = 128 69 | enable_profile = True 70 | if enable_profile: 71 | flops, macs, params = get_model_profile( 72 | model, 73 | kwargs={k: v.to(model.device) for k, v in input_constructor(batch_size, seq_len, tokenizer).items()}, 74 | print_profile=True, 75 | detailed=True, 76 | output_file='roberta-base-profile.txt' 77 | ) 78 | else: 79 | inputs = input_constructor((batch_size, seq_len), tokenizer) 80 | outputs = model(inputs) --------------------------------------------------------------------------------