├── scripts
    ├── ft
    │   ├── llama_2_7b_alpaca_gpt4.sh
    │   ├── bert_base_squad.sh
    │   ├── t5_base_lm_adapt_sst2.sh
    │   ├── t5_base_xsum.sh
    │   ├── t5_base_lm_adapt_cnndm.sh
    │   ├── bert_base_mnli.sh
    │   ├── roberta_base_squad.sh
    │   ├── roberta_base_sst2.sh
    │   ├── bert_base_sst2.sh
    │   ├── roberta_base_mnli.sh
    │   ├── t5_xl_lm_adapt_sst2.sh
    │   └── roberta_base_squadv2.sh
    ├── adaptpruning
    │   └── roberta_base_squad_momentum.sh
    ├── eval
    │   ├── query_alpaca_eval.sh
    │   ├── truthfulqa.sh
    │   ├── alpaca_eval.sh
    │   ├── mmlu.sh
    │   ├── wmt_enro.sh
    │   ├── cnndm.sh
    │   └── xsum.sh
    ├── tradeoff
    │   ├── mt5_base_lora_tradeoff.sh
    │   ├── t5_base_lm_adapt_lora_tradeoff.sh
    │   ├── roberta_base_ft_mask_tuning.sh
    │   ├── roberta_base_mask_tuning.sh
    │   ├── roberta_base_sst2_tuning.sh
    │   └── roberta_base_sst2.sh
    ├── eval.sh
    ├── eval_multiple_lora_roberta_mnli.sh
    ├── test_pruning_efficiency.sh
    ├── merge_lora.sh
    ├── sbatch_scripts
    │   ├── submit_job.sbatch
    │   ├── ft
    │   │   ├── bert_base_sst2.sbatch
    │   │   └── roberta_base_sst2.sbatch
    │   ├── lora
    │   │   ├── bert_base_sst2.sbatch
    │   │   ├── roberta_base_mnli.sbatch
    │   │   ├── roberta_base_squad.sbatch
    │   │   └── roberta_base_sst2.sbatch
    │   ├── elastictuning
    │   │   ├── roberta_base_mnli_selfmomentum.sh
    │   │   ├── t5_xl_lm_adapt_sst2_selfmomentum.sh
    │   │   ├── roberta_base_squadv2_selfmomentum.sh
    │   │   ├── t5_base_lm_adapt_sst2_selfmomentum.sh
    │   │   ├── bert_base_squad_selfmomentum_noffnstart.sh
    │   │   └── roberta_base_sst2_selfmomentum_noffnstart.sh
    │   └── submit_job_a100.sbatch
    ├── post_training_prune.sh
    ├── post_training_squad_prune.sh
    ├── main_results
    │   └── bert_glue_big_momentum.sh
    ├── post_training_cnndm_prune.sh
    ├── test_fisher_prune.sh
    ├── test_random_prune.sh
    ├── hyperparameter_searching
    │   ├── test_cutoff_prune_step.sh
    │   ├── test_throughout_prune.sh
    │   ├── test_once_rescaled.sh
    │   ├── test_once_prune_step.sh
    │   ├── test_distill.sh
    │   ├── test_distill_fisher.sh
    │   ├── test_distill_shorter.sh
    │   └── test_training_hypers.sh
    ├── prepare_data.sh
    ├── eval_lora_roberta_mnli.sh
    ├── post_training_wmt_prune.sh
    ├── ablation
    │   ├── roberta_base_sst2_distillation.sh
    │   └── roberta_base_mnli_distillation.sh
    ├── merge_llama_lora.sh
    ├── lora
    │   ├── bert_base_squad.sh
    │   ├── t5_base_lm_adapt_sst2.sh
    │   ├── t5_base_lm_adapt_mnli.sh
    │   ├── t5_xl_lm_adapt_sst2.sh
    │   ├── bert_base_mnli.sh
    │   ├── bert_base_sst2.sh
    │   ├── t5_xl_lm_adapt_cnndm.sh
    │   ├── roberta_base_cola.sh
    │   ├── roberta_base_mrpc.sh
    │   ├── roberta_base_rte.sh
    │   ├── roberta_base_stsb.sh
    │   ├── roberta_base_mnli.sh
    │   ├── roberta_base_squad.sh
    │   ├── roberta_base_sst2.sh
    │   ├── t5_base_xsum.sh
    │   ├── t5_base_lm_adapt_cnndm.sh
    │   ├── roberta_base_squadv2.sh
    │   ├── llama_13b_alpaca_cleaned.sh
    │   ├── llama_2_7b_alpaca_gpt4.sh
    │   ├── llama_7b_alpaca_cleaned.sh
    │   ├── llama_2_13b_alpaca_gpt4.sh
    │   ├── roberta_base_qnli.sh
    │   ├── roberta_base_qqp.sh
    │   ├── mt5_base_wmt_enro.sh
    │   └── mt5_base_wmt_roen.sh
    ├── efficiency_testing.sh
    ├── post_training_sft_prune.sh
    ├── efficiency_testing_llama.sh
    ├── train_ft_seq2seq.sh
    ├── train_ft.sh
    ├── train_lora_squad.sh
    ├── train_lora_squadv2.sh
    ├── train_lora.sh
    ├── train_lora_seq2seq.sh
    ├── train_lora_sft.sh
    ├── train_lora_wmt.sh
    ├── adaptpruning_nodistill
    │   ├── t5_base_lm_adapt_mnli.sh
    │   ├── t5_base_lm_adapt_sst2.sh
    │   ├── bert_base_mnli.sh
    │   ├── bert_base_squad.sh
    │   ├── roberta_base_squad.sh
    │   └── bert_base_sst2.sh
    ├── train_ft_distill_seq2seq.sh
    ├── train_ft_distill.sh
    ├── train_lora_distill_squadv2.sh
    ├── train_lora_distill.sh
    └── train_lora_distill_seq2seq.sh
├── test
    ├── test_optimizer_state_passing.py
    ├── test_salience.py
    ├── test_rewarmup_lr_scheduling.py
    ├── test_gpu_base_speed.py
    ├── test_t5_efficiency.py
    ├── test_t5_prune_consistency.py
    ├── test_param_tuning.py
    ├── test_pruned_teacher_training.py
    ├── test_param_controller.py
    └── test_deepspeed_profiler.py
├── figures
    └── APT_arch.png
├── loralib
    ├── __init__.py
    └── utils.py
├── utils
    └── fisher_utils
    │   ├── schedule.py
    │   ├── meter.py
    │   ├── efficiency
    │       ├── mem.py
    │       └── latency.py
    │   ├── linalg.py
    │   └── timer.py
├── .gitignore
├── run.sh
├── requirements.txt
├── run_glue_multigpu.sh
├── check_param_num.py
├── LICENSE
├── models
    ├── modeling_outputs.py
    └── model_args.py
├── eval
    ├── mmlu
    │   └── categories.py
    └── dispatch_openai_requests.py
├── merge_model_lora.py
└── evaluate.py


/scripts/ft/llama_2_7b_alpaca_gpt4.sh:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/test/test_optimizer_state_passing.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/scripts/adaptpruning/roberta_base_squad_momentum.sh:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/figures/APT_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROIM1998/APT/HEAD/figures/APT_arch.png


--------------------------------------------------------------------------------
/loralib/__init__.py:
--------------------------------------------------------------------------------
1 | name = "lora"
2 | 
3 | from .layers import *
4 | from .utils import *


--------------------------------------------------------------------------------
/scripts/eval/query_alpaca_eval.sh:
--------------------------------------------------------------------------------
# Run AlpacaEval on a file of model outputs.
# Usage: bash scripts/eval/query_alpaca_eval.sh <model_output_path>
# The key is read from NEW_OPENAI_KEY and forwarded as OPENAI_API_KEY.
model_output_path=${1:?usage: query_alpaca_eval.sh <model_output_path>}

export OPENAI_API_KEY=$NEW_OPENAI_KEY
# Never print the secret itself (it would end up in terminal history and job
# logs); only report whether a key is present.
if [ -n "$OPENAI_API_KEY" ]; then
    echo "OPENAI_API_KEY is set" >&2
else
    echo "warning: NEW_OPENAI_KEY is empty, so OPENAI_API_KEY is empty" >&2
fi

alpaca_eval --model_outputs "$model_output_path"


--------------------------------------------------------------------------------
/scripts/tradeoff/mt5_base_lora_tradeoff.sh:
--------------------------------------------------------------------------------
# Run scripts/lora/mt5_base_wmt_enro.sh once per LoRA rank.
# The fourth argument is always four times the rank.
for lora_r in 102 64 32 16 8; do
    echo "lora_r: $lora_r"
    lora_r_x4=$((lora_r * 4))
    bash scripts/lora/mt5_base_wmt_enro.sh 2 16 "$lora_r" "$lora_r_x4" 5e-5 42
done


--------------------------------------------------------------------------------
/utils/fisher_utils/schedule.py:
--------------------------------------------------------------------------------
1 | import math
2 | 
3 | 
def get_pruning_schedule(target, num_iter):
    """Build a geometric schedule that ends exactly at ``target``.

    The per-step ratio is ``target ** (1 / num_iter)``. Entry ``i`` (1-based)
    is that ratio raised to the power ``i`` for ``i < num_iter``; the final
    entry is ``target`` itself rather than a recomputed power, so the schedule
    ends on the requested value without floating-point drift. For
    ``num_iter >= 1`` the returned list has ``num_iter`` elements.
    """
    ratio = math.pow(target, 1 / num_iter)
    steps = []
    for exponent in range(1, num_iter):
        steps.append(ratio ** exponent)
    steps.append(target)
    return steps
8 | 


--------------------------------------------------------------------------------
/scripts/tradeoff/t5_base_lm_adapt_lora_tradeoff.sh:
--------------------------------------------------------------------------------
# Run scripts/lora/t5_base_lm_adapt_cnndm.sh once per LoRA rank.
# The fourth argument is always four times the rank.
for lora_r in 102 64 32 16 8; do
    echo "lora_r: $lora_r"
    lora_r_x4=$((lora_r * 4))
    bash scripts/lora/t5_base_lm_adapt_cnndm.sh 6 16 "$lora_r" "$lora_r_x4" 5e-5 42
done


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | **/.vscode/**
 2 | **/__pycache__/**
 3 | 
 4 | *output
 5 | *output/
 6 | all_res/
 7 | **/*.xlsx
 8 | *-profile.txt
 9 | # log files on slurm
10 | **/*.log 
11 | **/*.out
12 | legacy_scripts
13 | legacy_scripts/**
14 | 
15 | **/*backup*
16 | data
17 | data/**


--------------------------------------------------------------------------------
/scripts/eval.sh:
--------------------------------------------------------------------------------
 1 | model_name=$1
 2 | task_name=$2
 3 | output_dir="$model_name/results"
 4 | mkdir -p $output_dir
 5 | 
 6 | # Evaluate
 7 | python evaluate.py \
 8 |     --output_dir ${output_dir}\
 9 |     --model_name_or_path ${model_name} \
10 |     --do_eval \
11 |     --task_name ${task_name}


--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
# Run the listed experiment scripts one after another, in this order.
# There is no `set -e`, so a failing script does not stop the later ones.
for script in \
    scripts/adaptpruning_nodistill/llama_2_7b_alpaca_gpt4_preprune.sh \
    scripts/adaptpruning_nodistill/llama_2_13b_alpaca_gpt4_preprune.sh \
    scripts/lora/llama_2_7b_alpaca_gpt4.sh \
    scripts/lora/llama_2_13b_alpaca_gpt4.sh \
    scripts/lora/llama_7b_alpaca_cleaned.sh \
    scripts/lora/llama_13b_alpaca_cleaned.sh
do
    bash "$script"
done


--------------------------------------------------------------------------------
/scripts/eval_multiple_lora_roberta_mnli.sh:
--------------------------------------------------------------------------------
# Grid over MAC constraint x pruning frequency; each combination is handed to
# scripts/eval_lora_roberta_mnli.sh as <pruning_frequency> 64 <mac_constraint>.
mac_constraint_grid="0.2 0.4 0.6"
pruning_frequency_grid="0.1 0.5 1.5"

for mac_constraint in $mac_constraint_grid; do
    for pruning_frequency in $pruning_frequency_grid; do
        echo "Using mac_constraint ${mac_constraint}, pruning_frequency ${pruning_frequency}"
        bash scripts/eval_lora_roberta_mnli.sh "${pruning_frequency}" 64 "${mac_constraint}"
    done
done


--------------------------------------------------------------------------------
/scripts/tradeoff/roberta_base_ft_mask_tuning.sh:
--------------------------------------------------------------------------------
# Sweep MAC constraints for post-training pruning of one fixed checkpoint.
# Every run uses task sst2 and 64 as the fourth argument to post_training_prune.sh.
model_dir='output/roberta-base/sst2/bz32/ft/epoch60/lr2e-5/seed42/best_model'

for mac_constraint in 0.45 0.5 0.55 0.6 0.65 0.7 0.75 0.8 0.85 0.9; do
    echo "mac_constraint: $mac_constraint"
    bash scripts/post_training_prune.sh "$model_dir" sst2 "$mac_constraint" 64
done


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | datasets==2.10.0
 2 | deepspeed==0.8.0
 3 | matplotlib==3.7.1
 4 | numpy==1.24.3
 5 | ortools==9.6.2534
 6 | pandas==1.5.2
 7 | scikit_learn==1.1.3
 8 | scipy==1.10.1
 9 | seaborn==0.12.2
10 | tqdm==4.65.0
11 | transformers==4.28.1
12 | nltk==3.8.1
13 | rouge-score==0.1.2
14 | torch==1.10.2+cu113
15 | --extra-index-url https://download.pytorch.org/whl/cu113


--------------------------------------------------------------------------------
/scripts/tradeoff/roberta_base_mask_tuning.sh:
--------------------------------------------------------------------------------
# Sweep MAC constraints for post-training pruning of one fixed LoRA checkpoint.
# Every run uses task sst2 and 64 as the fourth argument to post_training_prune.sh.
model_dir='output/roberta-base/sst2/bz32/lora/epoch60/lora_r8/lora_alpha16/lr2e-4/seed42/best_model'

for mac_constraint in 0.45 0.5 0.55 0.6 0.65 0.7 0.75 0.8 0.85 0.9; do
    echo "mac_constraint: $mac_constraint"
    bash scripts/post_training_prune.sh "$model_dir" sst2 "$mac_constraint" 64
done


--------------------------------------------------------------------------------
/scripts/eval/truthfulqa.sh:
--------------------------------------------------------------------------------
 1 | # # export CUDA_VISIBLE_DEVICES=0
 2 | 
 3 | # zero-shot
 4 | python -m eval.truthfulqa.run_eval \
 5 |     --ntrain 0 \
 6 |     --data_dir data/eval/truthfulqa \
 7 |     --save_dir results/truthfulqa/llama-7B-0shot/ \
 8 |     --model_name_or_path /mmfs1/gscratch/cse/yizhongw/llama_checkpoints/7B/ \
 9 |     --tokenizer_name_or_path /mmfs1/gscratch/cse/yizhongw/llama_checkpoints/7B/ \
10 |     --eval_batch_size 2 \
11 |     --load_in_8bit \
12 |     --use_chat_format


--------------------------------------------------------------------------------
/scripts/test_pruning_efficiency.sh:
--------------------------------------------------------------------------------
 1 | output_dir="output/efficiency_testing"
 2 | mkdir -p $output_dir
 3 | 
 4 | python test_pruning_efficiency.py \
 5 |     --output_dir ${output_dir}\
 6 |     --task_name mnli \
 7 |     --model_name_or_path roberta-base \
 8 |     --do_train \
 9 |     --do_eval \
10 |     --max_seq_length 128 \
11 |     --per_device_train_batch_size 16 \
12 |     --per_device_eval_batch_size 16 \
13 |     --apply_lora \
14 |     --lora_alpha 16 \
15 |     --lora_r 8 \
16 |     --report_to none\


--------------------------------------------------------------------------------
/scripts/merge_lora.sh:
--------------------------------------------------------------------------------
 1 | model_path=$1
 2 | output_dir=$2
 3 | task_name=$3
 4 | lora_r=$4
 5 | lora_alpha=$5
 6 | 
 7 | python merge_model_lora.py \
 8 |     --output_dir ${output_dir}\
 9 |     --model_name_or_path ${model_path} \
10 |     --task_name ${task_name} \
11 |     --do_train \
12 |     --do_eval \
13 |     --max_seq_length 128 \
14 |     --per_device_train_batch_size 32 \
15 |     --per_device_eval_batch_size 32 \
16 |     --apply_lora \
17 |     --lora_r ${lora_r} \
18 |     --lora_alpha ${lora_alpha} \


--------------------------------------------------------------------------------
/utils/fisher_utils/meter.py:
--------------------------------------------------------------------------------
 1 | class AverageMeter:
 2 | 
 3 |      def __init__(self, name):
 4 |          self.name = name
 5 |          self.reset()
 6 | 
 7 |      def reset(self):
 8 |          self.val = 0
 9 |          self.avg = 0
10 |          self.sum = 0
11 |          self.count = 0
12 | 
13 |      def update(self, val, n=1):
14 |          self.val = val
15 |          self.sum += val * n
16 |          self.count += n
17 |          self.avg = self.sum / self.count
18 | 
19 |      def __str__(self):
20 |          return f"{self.name}: {self.avg:.4f}"
21 | 


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/submit_job.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=400:00:00             # Walltime (hh:mm:ss)
10 | 
11 | bash scripts/adaptpruning/roberta_base_squadv2_momentum.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/ft/bert_base_sst2.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=32G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/ft/bert_base_sst2.sh (path is relative to the project root)
12 | bash scripts/ft/bert_base_sst2.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/ft/roberta_base_sst2.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=32G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/ft/roberta_base_sst2.sh (path is relative to the project root)
12 | bash scripts/ft/roberta_base_sst2.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/lora/bert_base_sst2.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=32G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/lora/bert_base_sst2.sh (path is relative to the project root)
12 | bash scripts/lora/bert_base_sst2.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/lora/roberta_base_mnli.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=32G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/lora/roberta_base_mnli.sh (path is relative to the project root)
12 | bash scripts/lora/roberta_base_mnli.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/lora/roberta_base_squad.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/lora/roberta_base_squad.sh (path is relative to the project root)
12 | bash scripts/lora/roberta_base_squad.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/lora/roberta_base_sst2.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=32G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/lora/roberta_base_sst2.sh (path is relative to the project root)
12 | bash scripts/lora/roberta_base_sst2.sh


--------------------------------------------------------------------------------
/run_glue_multigpu.sh:
--------------------------------------------------------------------------------
 1 | model_name=roberta_base
 2 | 
 3 | gpu_available="0,1,2,3,4,5,6"
 4 | # Split the gpu_available string into an array
 5 | gpu_ids=(${gpu_available//,/ })
 6 | 
 7 | task_name=(sst2 stsb qqp mnli cola mrpc qnli)
 8 | 
 9 | # For each GPU, run the script with a different mac_constraint
10 | for i in "${!gpu_ids[@]}"; do
11 |     gpu_id=${gpu_ids[$i]}
12 |     task_name=${task_name[$i]}
13 |     echo "Running on GPU $gpu_id with mac_constraint $mac_constraint"
14 |     bash scripts/adaptpruning_nodistill/roberta_base_${task_name}.sh 0.4 8 16 cubic_gradual global free_inout $gpu_id &
15 | done


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/elastictuning/roberta_base_mnli_selfmomentum.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=500:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/adaptpruning/roberta_base_mnli_momentum.sh (path is relative to the project root)
12 | bash scripts/adaptpruning/roberta_base_mnli_momentum.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/elastictuning/t5_xl_lm_adapt_sst2_selfmomentum.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=500:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/adaptpruning/t5_xl_lm_adapt_sst2_momentum.sh (path is relative to the project root)
12 | bash scripts/adaptpruning/t5_xl_lm_adapt_sst2_momentum.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/elastictuning/roberta_base_squadv2_selfmomentum.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/adaptpruning/roberta_base_squadv2_momentum.sh (path is relative to the project root)
12 | bash scripts/adaptpruning/roberta_base_squadv2_momentum.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/elastictuning/t5_base_lm_adapt_sst2_selfmomentum.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=400:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/adaptpruning/t5_base_lm_adapt_sst2_momentum.sh (path is relative to the project root)
12 | bash scripts/adaptpruning/t5_base_lm_adapt_sst2_momentum.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/elastictuning/bert_base_squad_selfmomentum_noffnstart.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-a100
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/adaptpruning/bert_base_squad_momentum_noffnstart.sh (path is relative to the project root)
12 | bash scripts/adaptpruning/bert_base_squad_momentum_noffnstart.sh


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/elastictuning/roberta_base_sst2_selfmomentum_noffnstart.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-a100
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=32G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Run scripts/adaptpruning/roberta_base_sst2_selfmomentum_noffnstart.sh (path is relative to the project root)
12 | bash scripts/adaptpruning/roberta_base_sst2_selfmomentum_noffnstart.sh


--------------------------------------------------------------------------------
/utils/fisher_utils/efficiency/mem.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | MB = 1024 * 1024
 4 | 
 5 | def bert_forward(batch_size: int = 32, seq_len: int = 128, num_heads: List[int] = [12] * 12, num_neurons: List[int] = [3072] * 12, hidden_size: int = 768, intermediate_size: int = 3072, attn_head_size: int = 64, output_hidden_states: bool = True, output_attention: bool = False, dtype=32)-> float:
 6 |     assert len(num_heads) == len(num_neurons)
 7 |     mha_size = sum(num_heads) * ((hidden_size * attn_head_size) + 1) * 4
 8 |     ffn_size = sum(num_neurons) * hidden_size * 2 + sum(num_neurons)
 9 |     total = mha_size + ffn_size
10 |     return total * dtype / 8 / MB


--------------------------------------------------------------------------------
/scripts/sbatch_scripts/submit_job_a100.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-a100
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=32G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Move to the project root (no conda environment is activated here; activate it before submitting)
12 | cd ../../ # cd to the root directory of the project
13 | 
14 | # Run scripts/adaptpruning/roberta_base_sst2_selfmomentum_noffnstart.sh from the project root
15 | bash scripts/adaptpruning/roberta_base_sst2_selfmomentum_noffnstart.sh


--------------------------------------------------------------------------------
/scripts/post_training_prune.sh:
--------------------------------------------------------------------------------
 1 | model_path=$1
 2 | task_name=$2
 3 | mac_constraint=$3
 4 | num_batches=$4
 5 | if [ "$#" -eq 5 ]; then
 6 |     lora_alpha=$5
 7 | else
 8 |     lora_alpha=16
 9 | fi
10 | output_dir="${model_path}/new_pruned/constraint_${mac_constraint}/batches_${num_batches}"
11 | 
12 | python post_training_prune.py \
13 |     --output_dir ${output_dir}\
14 |     --model_name_or_path ${model_path} \
15 |     --task_name ${task_name} \
16 |     --do_train \
17 |     --do_eval \
18 |     --max_seq_length 128 \
19 |     --per_device_train_batch_size 32 \
20 |     --per_device_eval_batch_size 32 \
21 |     --pruning_batch_size 32 \
22 |     --pruning_batches ${num_batches} \
23 |     --mac_constraint $3 \
24 |     --lora_alpha ${lora_alpha} \


--------------------------------------------------------------------------------
/scripts/post_training_squad_prune.sh:
--------------------------------------------------------------------------------
 1 | model_path=$1
 2 | task_name=$2
 3 | mac_constraint=$3
 4 | num_batches=$4
 5 | if [ "$#" -eq 5 ]; then
 6 |     lora_alpha=$5
 7 | else
 8 |     lora_alpha=16
 9 | fi
10 | output_dir="${model_path}/pruned/constraint_${mac_constraint}/batches_${num_batches}"
11 | 
12 | python post_training_squad_prune.py \
13 |     --output_dir ${output_dir}\
14 |     --model_name_or_path ${model_path} \
15 |     --do_train \
16 |     --do_eval \
17 |     --max_seq_length 384 \
18 |     --doc_stride 128 \
19 |     --version_2_with_negative \
20 |     --per_device_train_batch_size 32 \
21 |     --per_device_eval_batch_size 32 \
22 |     --pruning_batch_size 32 \
23 |     --pruning_batches ${num_batches} \
24 |     --mac_constraint $3 \
25 |     --lora_alpha ${lora_alpha} \


--------------------------------------------------------------------------------
/scripts/main_results/bert_glue_big_momentum.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-a100
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=32G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=200:00:00             # Walltime (hh:mm:ss)
10 | 
11 | constraint=$1
12 | task_names=(qnli qqp mnli sst2)
13 | 
14 | for task in ${task_names[@]}; do
15 |     bash scripts/adaptpruning/bert_base_${task}_momentum.sh $constraint 8 -1 cubic_gradual running_fisher running_fisher self_momentum dynamic_block_teacher_dynamic_student
16 | done


--------------------------------------------------------------------------------
/scripts/post_training_cnndm_prune.sh:
--------------------------------------------------------------------------------
 1 | model_path=$1
 2 | mac_constraint=$2
 3 | num_batches=$3
 4 | if [ "$#" -eq 5 ]; then
 5 |     lora_alpha=$5
 6 | else
 7 |     lora_alpha=16
 8 | fi
 9 | output_dir="${model_path}/pruned/constraint_${mac_constraint}/batches_${num_batches}"
10 | 
11 | python post_training_seq2seq_prune.py \
12 |     --output_dir ${output_dir}\
13 |     --model_name_or_path ${model_path} \
14 |     --do_train \
15 |     --do_eval \
16 |     --task_name cnndm \
17 |     --max_input_length 512 \
18 |     --max_target_length 128 \
19 |     --per_device_train_batch_size 32 \
20 |     --per_device_eval_batch_size 32 \
21 |     --tf32 True \
22 |     --pruning_batch_size 32 \
23 |     --pruning_batches ${num_batches} \
24 |     --mac_constraint ${mac_constraint} \
25 |     --lora_alpha ${lora_alpha} \


--------------------------------------------------------------------------------
/scripts/tradeoff/roberta_base_sst2_tuning.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=400:00:00             # Walltime (hh:mm:ss)
10 | 
11 | 
12 | lora_rs=(16 32 64 128 256)
13 | 
14 | for lora_r in ${lora_rs[@]}; do
15 |     echo "lora_r: $lora_r"
16 |     bash scripts/adaptpruning/roberta_base_sst2_momentum.sh 0.4 $lora_r -1 cubic_gradual running_fisher running_fisher self_momentum dynamic_block_teacher_dynamic_student
17 | done


--------------------------------------------------------------------------------
/utils/fisher_utils/linalg.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import cupy
 3 | from cupyx.scipy.sparse.linalg import lsmr
 4 | 
 5 | 
 6 | @torch.no_grad()
 7 | def closed_form_solver(A, B):
 8 |     if B.shape[0] == 1:
 9 |         X = B / A[0, 0]
10 |     else:
11 |         # NOTE: for safety, compute matrix inverse on CPU
12 |         X = torch.inverse(A.cpu()).to(A.device) @ B
13 |     return X
14 | 
15 | 
@torch.no_grad()
def lsmr_cupy_solver(A, B):
    """Solve the shifted system for ``X`` with CuPy's LSMR and report success.

    The right-hand side is first reduced by the row sums of ``A`` (i.e. by
    ``A @ 1``), the remaining system is solved, and 1 is added back to the
    solution.

    Returns:
        ``(X, ok)``. ``X`` is a tensor on ``A``'s device. ``ok`` is True when
        LSMR's stop code (second element of its result) is below 3; the scalar
        shortcut is solved by plain division and always reports True.
    """
    B = B - A.sum(dim=1)
    if B.shape[0] == 1:
        X = B / A[0, 0]
        # No iterative solve happened, so there is no stop code to inspect.
        # Previously this path fell through to `solution[1]` and raised
        # UnboundLocalError.
        ok = True
    else:
        CU_A = cupy.asarray(A.cpu().numpy())
        CU_B = cupy.asarray(B.cpu().numpy())
        solution = lsmr(CU_A, CU_B, damp=1)
        X = cupy.asnumpy(solution[0])
        X = torch.from_numpy(X).to(A.device)
        ok = solution[1] < 3
    X = X + 1
    return X, ok
29 | 


--------------------------------------------------------------------------------
/scripts/tradeoff/roberta_base_sst2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=400:00:00             # Walltime (hh:mm:ss)
10 | 
11 | 
12 | mac_constraints=(0.1 0.3 0.4 0.5)
13 | 
14 | for mac_constraint in ${mac_constraints[@]}; do
15 |     echo "mac_constraint: $mac_constraint"
16 |     bash scripts/adaptpruning/roberta_base_sst2_momentum.sh $mac_constraint 8 -1 cubic_gradual running_fisher running_fisher self_momentum dynamic_block_teacher_dynamic_student
17 | done


--------------------------------------------------------------------------------
/scripts/test_fisher_prune.sh:
--------------------------------------------------------------------------------
 1 | if [ "$#" -eq 0 ]; then
 2 |     model_name_or_path='roberta-base'
 3 |     test_mode='correlation'
 4 | elif [ "$#" -eq 1 ]; then
 5 |     model_name_or_path=$1
 6 |     test_mode='correlation'
 7 | elif [ "$#" -eq 3 ]; then
 8 |     model_name_or_path=$1
 9 |     test_mode=$2
10 | fi
11 | 
12 | output_dir='./output/test_prune/'
13 | mkdir -p $output_dir
14 | 
15 | python run_pruning.py \
16 |     --output_dir ${output_dir}\
17 |     --task_name mnli \
18 |     --model_name_or_path ${model_name_or_path} \
19 |     --max_seq_length 128 \
20 |     --per_device_train_batch_size 32 \
21 |     --per_device_eval_batch_size 32 \
22 |     --report_to none\
23 |     --do_train\
24 |     --do_eval\
25 |     --test_mode ${test_mode}\
26 |     --ratio_bound 0.1\
27 |     --ratio_step 0.01\
28 |     --apply_lora\
29 |     --prune_mode fisher\


--------------------------------------------------------------------------------
/scripts/test_random_prune.sh:
--------------------------------------------------------------------------------
 1 | if [ "$#" -eq 0 ]; then
 2 |     model_name_or_path='roberta-base'
 3 |     test_mode='stability'
 4 |     mask_mode='all'
 5 | elif [ "$#" -eq 1 ]; then
 6 |     model_name_or_path=$1
 7 |     test_mode='stability'
 8 |     mask_mode='all'
 9 | elif [ "$#" -eq 3 ]; then
10 |     model_name_or_path=$1
11 |     test_mode=$2
12 |     mask_mode=$3
13 | fi
14 | 
15 | output_dir='./output/test_prune/'
16 | mkdir -p $output_dir
17 | 
18 | python run_pruning.py \
19 |     --output_dir ${output_dir}\
20 |     --task_name mnli \
21 |     --model_name_or_path ${model_name_or_path} \
22 |     --max_seq_length 128 \
23 |     --per_device_train_batch_size 32 \
24 |     --per_device_eval_batch_size 32 \
25 |     --report_to none\
26 |     --do_eval\
27 |     --test_mode ${test_mode}\
28 |     --mask_mode ${mask_mode}\
29 |     --ratio_bound 1.\
30 |     --ratio_step 0.01\
31 |     --apply_lora\


--------------------------------------------------------------------------------
/scripts/ft/bert_base_squad.sh:
--------------------------------------------------------------------------------
 1 | model_name="bert-base-uncased"
 2 | task_name="squad"
 3 | num_epochs=10
 4 | learning_rate=1e-5
 5 | batch_size=48
 6 | output_dir="output/${model_name}_${task_name}_full/epoch${num_epochs}/bz${batch_size}"
 7 | 
 8 | 
 9 | echo $output_dir
10 | mkdir -p $output_dir
11 | 
12 | 
13 | python run_minus_squad_training.py \
14 |     --output_dir ${output_dir}\
15 |     --model_name_or_path ${model_name} \
16 |     --do_train \
17 |     --do_eval \
18 |     --save_strategy no \
19 |     --logging_strategy epoch \
20 |     --evaluation_strategy epoch \
21 |     --max_seq_length 384 \
22 |     --doc_stride 128 \
23 |     --num_train_epochs ${num_epochs} \
24 |     --per_device_train_batch_size ${batch_size} \
25 |     --per_device_eval_batch_size ${batch_size} \
26 |     --warmup_ratio 0.06\
27 |     --learning_rate ${learning_rate}\
28 |     --weight_decay 0.1\
29 |     --report_to none 


--------------------------------------------------------------------------------
/scripts/hyperparameter_searching/test_cutoff_prune_step.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=3
 2 | 
 3 | for pruning_frequency in 0.1 0.5 1.0 1.5
 4 | do
 5 |     output_dir="output/roberta_lora_minus_mnli_cutoff/freq${pruning_frequency}/batchuse64/mac0.6/"
 6 |     mkdir -p $output_dir
 7 | 
 8 |     python run_minus_training.py \
 9 |         --output_dir ${output_dir}\
10 |         --task_name mnli \
11 |         --model_name_or_path roberta-base \
12 |         --do_train \
13 |         --do_eval \
14 |         --max_seq_length 128 \
15 |         --per_device_train_batch_size 32 \
16 |         --per_device_eval_batch_size 32 \
17 |         --apply_lora \
18 |         --lora_alpha 16 \
19 |         --lora_r 8 \
20 |         --report_to none\
21 |         --pruning_batches 64 \
22 |         --mac_constraint 0.6 \
23 |         --pruning_frequency ${pruning_frequency}\
24 |         --pruning_scheduler cutoff
25 | done


--------------------------------------------------------------------------------
/scripts/prepare_data.sh:
--------------------------------------------------------------------------------
 1 | mkdir -p data/sft
 2 | mkdir -p data/eval
 3 | 
 4 | # Download alpaca data for sft
 5 | wget -O data/sft/alpaca_data.json https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json
 6 | 
 7 | # Download MMLU eval data
 8 | 
 9 | # MMLU dataset
10 | wget -O data/eval/mmlu_data.tar https://people.eecs.berkeley.edu/~hendrycks/data.tar
11 | mkdir -p data/eval/mmlu_data
12 | tar -xvf data/eval/mmlu_data.tar -C data/eval/mmlu_data
13 | mv data/eval/mmlu_data/data data/eval/mmlu && rm -r data/eval/mmlu_data data/eval/mmlu_data.tar
14 | 
15 | # TruthfulQA dataset, open-ended and multiple-choice versions
16 | mkdir -p data/eval/truthfulqa
17 | wget -O data/eval/truthfulqa/truthfulqa.csv https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/data/v0/TruthfulQA.csv
18 | wget -O data/eval/truthfulqa/truthfulqa_mc.json https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/data/mc_task.json


--------------------------------------------------------------------------------
/scripts/eval_lora_roberta_mnli.sh:
--------------------------------------------------------------------------------
 1 | if [ "$#" -eq 0 ]; then
 2 |     pruning_frequency=0.1
 3 |     pruning_batches=64
 4 |     mac_constraint=0.6
 5 | elif [ "$#" -eq 3 ]; then
 6 |     pruning_frequency=$1
 7 |     pruning_batches=$2
 8 |     mac_constraint=$3
 9 | fi
10 | 
11 | model_dir="output/roberta_lora_minus_mnli/freq${pruning_frequency}/batchuse${pruning_batches}/mac${mac_constraint}/"
12 | 
13 | python run_minus_training.py \
14 |     --output_dir ${model_dir}\
15 |     --task_name mnli \
16 |     --model_name_or_path "./${model_dir}" \
17 |     --do_eval \
18 |     --max_seq_length 128 \
19 |     --per_device_train_batch_size 32 \
20 |     --per_device_eval_batch_size 32 \
21 |     --apply_lora \
22 |     --lora_alpha 16 \
23 |     --lora_r 8 \
24 |     --report_to none\
25 |     --pruning_frequency ${pruning_frequency}\
26 |     --pruning_batches ${pruning_batches} \
27 |     --mac_constraint ${mac_constraint}


--------------------------------------------------------------------------------
/scripts/post_training_wmt_prune.sh:
--------------------------------------------------------------------------------
 1 | model_path=$1
 2 | mac_constraint=$2
 3 | num_batches=$3
 4 | 
 5 | source_lang=en
 6 | target_lang=ro
 7 | task_name=wmt16
 8 | 
 9 | lora_alpha=16
10 | output_dir="${model_path}/pruned/constraint_${mac_constraint}/batches_${num_batches}"
11 | 
12 | python post_training_seq2seq_prune.py \
13 |     --output_dir ${output_dir}\
14 |     --model_name_or_path ${model_path} \
15 |     --do_train \
16 |     --do_eval \
17 |     --task_name ${task_name} \
18 |     --max_input_length 256 \
19 |     --max_target_length 256 \
20 |     --lang_pair ${target_lang}-${source_lang} \
21 |     --source_lang ${source_lang} \
22 |     --target_lang ${target_lang} \
23 |     --per_device_train_batch_size 8 \
24 |     --per_device_eval_batch_size 8 \
25 |     --tf32 True \
26 |     --pruning_batch_size 32 \
27 |     --pruning_batches ${num_batches} \
28 |     --mac_constraint ${mac_constraint} \
29 |     --lora_alpha ${lora_alpha} \


--------------------------------------------------------------------------------
/utils/fisher_utils/timer.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | class CPUTimer:
 7 |     def __init__(self, timelogs):
 8 |         self.timelogs = timelogs
 9 | 
10 |     def __enter__(self):
11 |         self.start = time.time()
12 | 
13 |     def __exit__(self, type, value, traceback):
14 |         end = time.time()
15 |         self.timelogs.append((end - self.start) * 1000) # ms
16 | 
17 | 
class GPUTimer:
    """Context manager that times its block with CUDA events and logs the result.

    A start event is recorded on entry and an end event on exit. After waiting
    for the end event, ``start_event.elapsed_time(end_event)`` (milliseconds,
    per the PyTorch documentation) is appended to ``timelogs``.

    Args:
        timelogs: a list-like object with ``append``; one value is added per
            ``with`` block.
    """

    def __init__(self, timelogs):
        self.timelogs = timelogs

    def __enter__(self):
        self.start_event = torch.cuda.Event(enable_timing=True)
        self.end_event = torch.cuda.Event(enable_timing=True)
        self.start_event.record()
        # Returning self makes `with GPUTimer(logs) as t:` bind the timer.
        return self

    def __exit__(self, type, value, traceback):
        self.end_event.record()
        # Wait for the end event to complete before reading the elapsed time.
        self.end_event.synchronize()
        elapsed_time = self.start_event.elapsed_time(self.end_event)
        self.timelogs.append(elapsed_time)
32 | 


--------------------------------------------------------------------------------
/check_param_num.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import torch
 4 | from tqdm import tqdm
 5 | 
 6 | if __name__ == '__main__':
 7 |     root = sys.argv[1]
 8 |     weights = [os.path.join(root, v) for v in os.listdir(root) if v.endswith('.bin') and 'arg' not in v]
 9 |     total_param_nums = 0
10 |     param_nums = 0
11 |     for weight in tqdm(weights):
12 |         state_dict = torch.load(weight, map_location='cpu')
13 |         for k, v in state_dict.items():
14 |             if 'lora' in k or 'transform' in k:
15 |                 continue
16 |             total_param_nums += v.numel()
17 |             if 'lm_head' in k or 'embed' in k or 'shared' in k or 'classifier' in k or 'pooler' in k or 'qa_output' in k:
18 |                 print("Excluding %s with number of params %d" % (k, v.numel()))
19 |                 continue
20 |             param_nums += v.numel()
21 |             
22 |     print("Total param nums: {}".format(total_param_nums))
23 |     print("Encoder/decoder param nums: {}".format(param_nums))


--------------------------------------------------------------------------------
/scripts/ablation/roberta_base_sst2_distillation.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=300:00:00             # Walltime (hh:mm:ss)
10 | 
11 | 
12 | distill_mapping_strategies=(static_teacher_dynamic_cofi_student static_teacher_dynamic_student static_teacher_static_student dynamic_block_teacher_dynamic_cofi_student dynamic_block_teacher_static_student)
13 | 
14 | for distill_mapping_strategy in ${distill_mapping_strategies[@]}; do
15 |     echo "distill_mapping_strategy: $distill_mapping_strategy"
16 |     bash scripts/adaptpruning/roberta_base_sst2_momentum.sh 0.4 8 -1 cubic_gradual running_fisher running_fisher self_momentum $distill_mapping_strategy
17 | done


--------------------------------------------------------------------------------
/scripts/hyperparameter_searching/test_throughout_prune.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=2
 2 | 
 3 | for mac_constraint in 0.4 0.5 0.6
 4 | do
 5 |     for pruning_frequency in 0.1 0.5 1.5
 6 |     do
 7 |         output_dir="output/roberta_lora_minus_mnli/freq${pruning_frequency}/batchuse64/mac${mac_constraint}/"
 8 |         mkdir -p $output_dir
 9 | 
10 |         python run_minus_training.py \
11 |             --output_dir ${output_dir}\
12 |             --task_name mnli \
13 |             --model_name_or_path roberta-base \
14 |             --do_train \
15 |             --do_eval \
16 |             --max_seq_length 128 \
17 |             --per_device_train_batch_size 32 \
18 |             --per_device_eval_batch_size 32 \
19 |             --apply_lora \
20 |             --lora_alpha 16 \
21 |             --lora_r 8 \
22 |             --report_to none\
23 |             --pruning_frequency ${pruning_frequency}\
24 |             --pruning_batches 64 \
25 |             --mac_constraint ${mac_constraint} \
26 |             --pruning_scheduler none
27 |     done
28 | done


--------------------------------------------------------------------------------
/scripts/merge_llama_lora.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | model_path=$1
12 | output_dir=$2
13 | lora_r=$3
14 | lora_alpha=$4
15 | 
16 | python merge_llama_model_lora.py \
17 |     --output_dir ${output_dir}\
18 |     --model_name_or_path ${model_path} \
19 |     --task_name alpaca_gpt4 \
20 |     --do_train \
21 |     --do_eval \
22 |     --bf16 True \
23 |     --data_path 'data/sft/alpaca_data_gpt4.json' \
24 |     --model_max_length 512 \
25 |     --per_device_train_batch_size 4 \
26 |     --per_device_eval_batch_size 4 \
27 |     --tf32 True \
28 |     --apply_lora \
29 |     --lora_r ${lora_r} \
30 |     --lora_alpha ${lora_alpha}


--------------------------------------------------------------------------------
/scripts/ablation/roberta_base_mnli_distillation.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=300:00:00             # Walltime (hh:mm:ss)
10 | 
11 | 
12 | distill_mapping_strategies=(dynamic_block_teacher_dynamic_student dynamic_block_teacher_dynamic_cofi_student static_teacher_dynamic_student none static_teacher_dynamic_cofi_student static_teacher_static_student dynamic_block_teacher_static_student)
13 | 
14 | for distill_mapping_strategy in ${distill_mapping_strategies[@]}; do
15 |     echo "distill_mapping_strategy: $distill_mapping_strategy"
16 |     bash scripts/adaptpruning/roberta_base_mnli_momentum.sh 0.4 8 -1 cubic_gradual running_fisher running_fisher self_momentum $distill_mapping_strategy
17 | done


--------------------------------------------------------------------------------
/scripts/ft/t5_base_lm_adapt_sst2.sh:
--------------------------------------------------------------------------------
 1 | model_name="google/t5-base-lm-adapt"
 2 | task_name="sst2"
 3 | 
 4 | if [ "$#" -eq 0 ]; then
 5 |     epoch=10
 6 |     batch_size=32
 7 |     learning_rate=1e-4
 8 |     seed=128
 9 | elif [ "$#" -eq 4 ]; then
10 |     epoch=$1
11 |     batch_size=$2
12 |     learning_rate=$3
13 |     seed=$4
14 | fi
15 | 
16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
17 | 
18 | 
19 | echo $output_dir
20 | mkdir -p $output_dir
21 | 
22 | 
23 | python run_minus_training.py \
24 |     --output_dir ${output_dir}\
25 |     --task_name ${task_name} \
26 |     --model_name_or_path ${model_name} \
27 |     --do_train \
28 |     --do_eval \
29 |     --save_strategy epoch \
30 |     --evaluation_strategy epoch \
31 |     --logging_strategy epoch \
32 |     --max_seq_length 128 \
33 |     --num_train_epochs ${epoch} \
34 |     --per_device_train_batch_size ${batch_size} \
35 |     --per_device_eval_batch_size ${batch_size} \
36 |     --warmup_ratio 0.06\
37 |     --learning_rate ${learning_rate}\
38 |     --weight_decay 0.1\
39 |     --seed ${seed} \
40 |     --report_to none 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Bowen Zhao
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/scripts/ft/t5_base_xsum.sh:
--------------------------------------------------------------------------------
 1 | model_name="t5-base"
 2 | task_name="xsum"
 3 | 
 4 | if [ "$#" -eq 0 ]; then
 5 |     num_epochs=10
 6 |     batch_size=16
 7 |     learning_rate=1e-4
 8 |     seed=128
 9 | elif [ "$#" -eq 4 ]; then
10 |     num_epochs=$1
11 |     batch_size=$2
12 |     learning_rate=$3
13 |     seed=$4
14 | fi
15 | 
16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
17 | 
18 | 
19 | echo $output_dir
20 | mkdir -p $output_dir
21 | 
22 | 
23 | python run_minus_seq2seq_training.py \
24 |     --output_dir ${output_dir}\
25 |     --task_name ${task_name} \
26 |     --model_name_or_path ${model_name} \
27 |     --do_train \
28 |     --do_eval \
29 |     --save_strategy epoch \
30 |     --evaluation_strategy epoch \
31 |     --logging_strategy epoch \
32 |     --max_input_length 512 \
33 |     --max_target_length 128 \
34 |     --num_train_epochs ${num_epochs} \
35 |     --per_device_train_batch_size ${batch_size} \
36 |     --per_device_eval_batch_size ${batch_size} \
37 |     --warmup_ratio 0.06\
38 |     --learning_rate ${learning_rate}\
39 |     --weight_decay 0.1\
40 |     --report_to none \
41 |     | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/ft/t5_base_lm_adapt_cnndm.sh:
--------------------------------------------------------------------------------
 1 | model_name="google/t5-base-lm-adapt"
 2 | task_name="cnndm"
 3 | 
 4 | if [ "$#" -eq 0 ]; then
 5 |     num_epochs=10
 6 |     batch_size=16
 7 |     learning_rate=1e-4
 8 |     seed=128
 9 | elif [ "$#" -eq 4 ]; then
10 |     num_epochs=$1
11 |     batch_size=$2
12 |     learning_rate=$3
13 |     seed=$4
14 | fi
15 | 
16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
17 | 
18 | 
19 | echo $output_dir
20 | mkdir -p $output_dir
21 | 
22 | 
23 | python run_minus_seq2seq_training.py \
24 |     --output_dir ${output_dir}\
25 |     --task_name ${task_name} \
26 |     --model_name_or_path ${model_name} \
27 |     --do_train \
28 |     --do_eval \
29 |     --save_strategy epoch \
30 |     --evaluation_strategy epoch \
31 |     --logging_strategy epoch \
32 |     --max_input_length 512 \
33 |     --max_target_length 128 \
34 |     --num_train_epochs ${num_epochs} \
35 |     --per_device_train_batch_size ${batch_size} \
36 |     --per_device_eval_batch_size ${batch_size} \
37 |     --warmup_ratio 0.06\
38 |     --learning_rate ${learning_rate}\
39 |     --weight_decay 0.1\
40 |     --report_to none \
41 |     | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/ft/bert_base_mnli.sh:
--------------------------------------------------------------------------------
 1 | model_name="bert-base-uncased"
 2 | task_name="mnli"
 3 | 
 4 | if [ "$#" -eq 0 ]; then
 5 |     epoch=10
 6 |     batch_size=32
 7 |     learning_rate=2e-5
 8 |     seed=128
 9 | elif [ "$#" -eq 4 ]; then
10 |     epoch=$1
11 |     batch_size=$2
12 |     learning_rate=$3
13 |     seed=$4
14 | fi
15 | 
16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
17 | 
18 | 
19 | echo $output_dir
20 | mkdir -p $output_dir
21 | 
22 | 
23 | python run_minus_training.py \
24 |     --output_dir ${output_dir}\
25 |     --task_name ${task_name} \
26 |     --model_name_or_path ${model_name} \
27 |     --do_train \
28 |     --do_eval \
29 |     --save_strategy epoch \
30 |     --logging_strategy steps \
31 |     --evaluation_strategy steps \
32 |     --log_level info \
33 |     --log_level_replica info \
34 |     --logging_steps 1000 \
35 |     --eval_steps 5000 \
36 |     --max_seq_length 128 \
37 |     --num_train_epochs ${epoch} \
38 |     --per_device_train_batch_size ${batch_size} \
39 |     --per_device_eval_batch_size ${batch_size} \
40 |     --warmup_ratio 0.06\
41 |     --learning_rate ${learning_rate}\
42 |     --weight_decay 0.1\
43 |     --report_to none 


--------------------------------------------------------------------------------
/scripts/eval/alpaca_eval.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | model_name_or_path=$1
12 | if [ -d "$model_name_or_path" ]; then
13 |     output_dir="${model_name_or_path}/alpaca_eval"
14 | else
15 |     output_dir="output/${model_name_or_path}/alpaca_eval"
16 | fi
17 | echo $output_dir
18 | mkdir -p $output_dir
19 | 
20 | training_batch_size=4
21 | 
22 | python run_alpaca_eval.py \
23 |     --output_dir ${output_dir}\
24 |     --task_name alpaca_eval \
25 |     --model_name_or_path ${model_name_or_path} \
26 |     --bf16 True \
27 |     --data_path 'data/eval/alpaca/alpaca_eval.json' \
28 |     --do_train \
29 |     --do_eval \
30 |     --model_max_length 512 \
31 |     --per_device_train_batch_size ${training_batch_size} \
32 |     --per_device_eval_batch_size ${training_batch_size} \
33 |     --tf32 True


--------------------------------------------------------------------------------
/scripts/ft/roberta_base_squad.sh:
--------------------------------------------------------------------------------
 1 | model_name="roberta-base"
 2 | task_name="squad"
 3 | 
 4 | if [ "$#" -eq 0 ]; then
 5 |     epoch=10
 6 |     batch_size=32
 7 |     learning_rate=2e-5
 8 |     seed=128
 9 | elif [ "$#" -eq 4 ]; then
10 |     epoch=$1
11 |     batch_size=$2
12 |     learning_rate=$3
13 |     seed=$4
14 | fi
15 | 
16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
17 | 
18 | echo $output_dir
19 | mkdir -p $output_dir
20 | 
21 | 
22 | python run_minus_squad_training.py \
23 |     --output_dir ${output_dir}\
24 |     --model_name_or_path ${model_name} \
25 |     --do_train \
26 |     --do_eval \
27 |     --save_strategy epoch \
28 |     --logging_strategy steps \
29 |     --evaluation_strategy steps \
30 |     --log_level info \
31 |     --log_level_replica info \
32 |     --logging_steps 100 \
33 |     --eval_steps 500 \
34 |     --max_seq_length 384 \
35 |     --doc_stride 128 \
36 |     --num_train_epochs ${epoch} \
37 |     --per_device_train_batch_size ${batch_size} \
38 |     --per_device_eval_batch_size ${batch_size} \
39 |     --warmup_ratio 0.06\
40 |     --learning_rate ${learning_rate}\
41 |     --weight_decay 0.1\
42 |     --seed ${seed} \
43 |     --report_to none 


--------------------------------------------------------------------------------
/scripts/ft/roberta_base_sst2.sh:
--------------------------------------------------------------------------------
 1 | model_name="roberta-base"
 2 | task_name="sst2"
 3 | 
 4 | if [ "$#" -eq 0 ]; then
 5 |     epoch=10
 6 |     batch_size=32
 7 |     learning_rate=2e-5
 8 |     seed=128
 9 | elif [ "$#" -eq 4 ]; then
10 |     epoch=$1
11 |     batch_size=$2
12 |     learning_rate=$3
13 |     seed=$4
14 | fi
15 | 
16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
17 | 
18 | 
19 | echo $output_dir
20 | mkdir -p $output_dir
21 | 
22 | 
23 | python run_minus_training.py \
24 |     --output_dir ${output_dir}\
25 |     --task_name ${task_name} \
26 |     --model_name_or_path ${model_name} \
27 |     --do_train \
28 |     --do_eval \
29 |     --save_strategy epoch \
30 |     --logging_strategy steps \
31 |     --evaluation_strategy steps \
32 |     --log_level info \
33 |     --log_level_replica info \
34 |     --logging_steps 100 \
35 |     --eval_steps 500 \
36 |     --max_seq_length 128 \
37 |     --num_train_epochs ${epoch} \
38 |     --per_device_train_batch_size ${batch_size} \
39 |     --per_device_eval_batch_size ${batch_size} \
40 |     --warmup_ratio 0.06\
41 |     --learning_rate ${learning_rate}\
42 |     --weight_decay 0.1\
43 |     --seed ${seed} \
44 |     --report_to none 


--------------------------------------------------------------------------------
/scripts/hyperparameter_searching/test_once_rescaled.sh:
--------------------------------------------------------------------------------
 1 | for mac_constraint in 0.3 0.8
 2 | do
 3 |     for steppoint in 0.25 0.5 0.75 1.0
 4 |     do
 5 |         output_dir="output/roberta_lora_minus_mnli_once_rescaled/step${steppoint}/batchuse${pruning_batches}/mac${mac_constraint}/"
 6 |         mkdir -p $output_dir
 7 | 
 8 |         python run_minus_training.py \
 9 |             --output_dir ${output_dir}\
10 |             --task_name mnli \
11 |             --model_name_or_path roberta-base \
12 |             --do_train \
13 |             --do_eval \
14 |             --minus_scheduler \
15 |             --save_strategy no \
16 |             --max_seq_length 128 \
17 |             --per_device_train_batch_size 32 \
18 |             --per_device_eval_batch_size 32 \
19 |             --lr_scheduler_type linear\
20 |             --warmup_ratio 0.06\
21 |             --learning_rate 5e-4\
22 |             --weight_decay 0.1\
23 |             --apply_lora \
24 |             --lora_alpha 16 \
25 |             --lora_r 8 \
26 |             --report_to none \
27 |             --pruning_batches 64 \
28 |             --mac_constraint ${mac_constraint} \
29 |             --pruning_scheduler once \
30 |             --pruning_start ${steppoint}
31 |     done
32 | done


--------------------------------------------------------------------------------
/scripts/ft/bert_base_sst2.sh:
--------------------------------------------------------------------------------
 1 | model_name="bert-base-uncased"
 2 | task_name="sst2"
 3 | 
 4 | if [ "$#" -eq 0 ]; then
 5 |     epoch=10
 6 |     batch_size=32
 7 |     learning_rate=2e-5
 8 |     seed=128
 9 | elif [ "$#" -eq 4 ]; then
10 |     epoch=$1
11 |     batch_size=$2
12 |     learning_rate=$3
13 |     seed=$4
14 | fi
15 | 
16 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
17 | 
18 | 
19 | echo $output_dir
20 | mkdir -p $output_dir
21 | 
22 | 
23 | python run_minus_training.py \
24 |     --output_dir ${output_dir}\
25 |     --task_name ${task_name} \
26 |     --model_name_or_path ${model_name} \
27 |     --do_train \
28 |     --do_eval \
29 |     --save_strategy epoch \
30 |     --logging_strategy steps \
31 |     --evaluation_strategy steps \
32 |     --log_level info \
33 |     --log_level_replica info \
34 |     --logging_steps 100 \
35 |     --eval_steps 500 \
36 |     --max_seq_length 128 \
37 |     --num_train_epochs ${epoch} \
38 |     --per_device_train_batch_size ${batch_size} \
39 |     --per_device_eval_batch_size ${batch_size} \
40 |     --warmup_ratio 0.06\
41 |     --learning_rate ${learning_rate}\
42 |     --weight_decay 0.1\
43 |     --seed ${seed} \
44 |     --report_to none 


--------------------------------------------------------------------------------
/scripts/ft/roberta_base_mnli.sh:
--------------------------------------------------------------------------------
 1 | # Fine-tunes roberta-base on MNLI with run_minus_training.py.
 2 | # Usage: roberta_base_mnli.sh [epoch batch_size learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all four.
 4 | model_name="roberta-base"
 5 | task_name="mnli"
 6 | 
 7 | if [ "$#" -eq 0 ]; then
 8 |     epoch=10
 9 |     batch_size=32
10 |     learning_rate=2e-5
11 |     seed=128
12 | elif [ "$#" -eq 4 ]; then
13 |     epoch=$1
14 |     batch_size=$2
15 |     learning_rate=$3
16 |     seed=$4
17 | else
18 |     # Any other argument count would leave the hyperparameters unset and
19 |     # create a malformed output directory, so stop before doing any work.
20 |     echo "Usage: $0 [epoch batch_size learning_rate seed]" >&2
21 |     exit 1
22 | fi
23 | 
24 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
25 | 
26 | echo "$output_dir"
27 | mkdir -p "$output_dir"
28 | 
29 | python run_minus_training.py \
30 |     --output_dir "${output_dir}" \
31 |     --task_name ${task_name} \
32 |     --model_name_or_path ${model_name} \
33 |     --do_train \
34 |     --do_eval \
35 |     --save_strategy epoch \
36 |     --logging_strategy steps \
37 |     --evaluation_strategy steps \
38 |     --log_level info \
39 |     --log_level_replica info \
40 |     --logging_steps 1000 \
41 |     --eval_steps 5000 \
42 |     --max_seq_length 128 \
43 |     --num_train_epochs ${epoch} \
44 |     --per_device_train_batch_size ${batch_size} \
45 |     --per_device_eval_batch_size ${batch_size} \
46 |     --warmup_ratio 0.06 \
47 |     --learning_rate ${learning_rate} \
48 |     --weight_decay 0.1 \
49 |     --seed ${seed} \
50 |     --report_to none


--------------------------------------------------------------------------------
/scripts/ft/t5_xl_lm_adapt_sst2.sh:
--------------------------------------------------------------------------------
 1 | # Fine-tunes google/t5-xl-lm-adapt on SST-2 with run_minus_training.py.
 2 | # Usage: t5_xl_lm_adapt_sst2.sh [epoch batch_size learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all four.
 4 | model_name=google/t5-xl-lm-adapt
 5 | task_name=sst2
 6 | 
 7 | if [ "$#" -eq 0 ]; then
 8 |     epoch=10
 9 |     batch_size=32
10 |     learning_rate=1e-3
11 |     seed=128
12 | elif [ "$#" -eq 4 ]; then
13 |     epoch=$1
14 |     batch_size=$2
15 |     learning_rate=$3
16 |     seed=$4
17 | else
18 |     # Any other argument count would leave the hyperparameters unset and
19 |     # create a malformed output directory, so stop before doing any work.
20 |     echo "Usage: $0 [epoch batch_size learning_rate seed]" >&2
21 |     exit 1
22 | fi
23 | 
24 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
25 | 
26 | echo "$output_dir"
27 | mkdir -p "$output_dir"
28 | 
29 | python run_minus_training.py \
30 |     --output_dir "${output_dir}" \
31 |     --task_name ${task_name} \
32 |     --model_name_or_path ${model_name} \
33 |     --do_train \
34 |     --do_eval \
35 |     --save_strategy epoch \
36 |     --logging_strategy steps \
37 |     --evaluation_strategy steps \
38 |     --log_level info \
39 |     --log_level_replica info \
40 |     --logging_steps 100 \
41 |     --eval_steps 500 \
42 |     --max_seq_length 128 \
43 |     --num_train_epochs ${epoch} \
44 |     --per_device_train_batch_size ${batch_size} \
45 |     --per_device_eval_batch_size ${batch_size} \
46 |     --warmup_ratio 0.06 \
47 |     --learning_rate ${learning_rate} \
48 |     --weight_decay 0.1 \
49 |     --seed ${seed} \
50 |     --report_to none


--------------------------------------------------------------------------------
/scripts/ft/roberta_base_squadv2.sh:
--------------------------------------------------------------------------------
 1 | # Fine-tunes roberta-base on SQuAD v2 with run_minus_squad_training.py.
 2 | # Usage: roberta_base_squadv2.sh [epoch batch_size learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all four.
 4 | model_name="roberta-base"
 5 | # task_name is only used to build the output path; it is not passed to the trainer.
 6 | task_name="squad_v2"
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     epoch=10
10 |     batch_size=32
11 |     learning_rate=2e-5
12 |     seed=128
13 | elif [ "$#" -eq 4 ]; then
14 |     epoch=$1
15 |     batch_size=$2
16 |     learning_rate=$3
17 |     seed=$4
18 | else
19 |     # Any other argument count would leave the hyperparameters unset and
20 |     # create a malformed output directory, so stop before doing any work.
21 |     echo "Usage: $0 [epoch batch_size learning_rate seed]" >&2
22 |     exit 1
23 | fi
24 | 
25 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/ft/epoch${epoch}/lr${learning_rate}/seed${seed}/"
26 | 
27 | echo "$output_dir"
28 | mkdir -p "$output_dir"
29 | 
30 | python run_minus_squad_training.py \
31 |     --output_dir "${output_dir}" \
32 |     --model_name_or_path ${model_name} \
33 |     --do_train \
34 |     --do_eval \
35 |     --save_strategy epoch \
36 |     --logging_strategy steps \
37 |     --evaluation_strategy steps \
38 |     --log_level info \
39 |     --log_level_replica info \
40 |     --logging_steps 100 \
41 |     --eval_steps 500 \
42 |     --max_seq_length 384 \
43 |     --doc_stride 128 \
44 |     --version_2_with_negative \
45 |     --num_train_epochs ${epoch} \
46 |     --per_device_train_batch_size ${batch_size} \
47 |     --per_device_eval_batch_size ${batch_size} \
48 |     --warmup_ratio 0.06 \
49 |     --learning_rate ${learning_rate} \
50 |     --weight_decay 0.1 \
51 |     --seed ${seed} \
52 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/bert_base_squad.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of bert-base-uncased on SQuAD with run_minus_squad_training.py.
 2 | # Usage: bert_base_squad.sh [lora_r lora_alpha]
 3 | # Pass either no arguments (defaults below) or both.
 4 | model_name='bert-base-uncased'
 5 | # task_name and adapter_type are only used to build the output path.
 6 | task_name=squad
 7 | adapter_type=lora
 8 | learning_rate=2e-4
 9 | num_epochs=30
10 | batch_size=32
11 | 
12 | if [ "$#" -eq 0 ]; then
13 |     lora_r=8
14 |     lora_alpha=16
15 | elif [ "$#" -eq 2 ]; then
16 |     lora_r=$1
17 |     lora_alpha=$2
18 | else
19 |     # Any other argument count would leave lora_r/lora_alpha unset and
20 |     # create a malformed output directory, so stop before doing any work.
21 |     echo "Usage: $0 [lora_r lora_alpha]" >&2
22 |     exit 1
23 | fi
24 | 
25 | teacher_param_tuning_config=q:0-11,v:0-11
26 | output_dir="output/${model_name}_${adapter_type}_${task_name}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
27 | echo "$output_dir"
28 | mkdir -p "$output_dir"
29 | 
30 | python run_minus_squad_training.py \
31 |     --output_dir "${output_dir}" \
32 |     --model_name_or_path ${model_name} \
33 |     --do_train \
34 |     --do_eval \
35 |     --save_strategy no \
36 |     --evaluation_strategy epoch \
37 |     --logging_strategy epoch \
38 |     --max_seq_length 384 \
39 |     --doc_stride 128 \
40 |     --num_train_epochs ${num_epochs} \
41 |     --per_device_train_batch_size ${batch_size} \
42 |     --per_device_eval_batch_size ${batch_size} \
43 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
44 |     --warmup_ratio 0.06 \
45 |     --learning_rate ${learning_rate} \
46 |     --weight_decay 0.1 \
47 |     --apply_lora \
48 |     --lora_alpha ${lora_alpha} \
49 |     --lora_r ${lora_r} \
50 |     --report_to none


--------------------------------------------------------------------------------
/scripts/efficiency_testing.sh:
--------------------------------------------------------------------------------
 1 | # Runs efficiency_test.py in eval-only mode and tees its output to log.txt.
 2 | # Usage:
 3 | #   efficiency_testing.sh
 4 | #   efficiency_testing.sh id model_name task_name
 5 | #   efficiency_testing.sh id backbone_name model_name task_name lora_r lora_alpha batch_size
 6 | # backbone_name and id are only used to build the output path.
 7 | if [ "$#" -eq 0 ]; then
 8 |     id=default
 9 |     backbone_name='roberta-base'
10 |     model_name='roberta-base'
11 |     task_name=mnli
12 |     lora_r=8
13 |     lora_alpha=16
14 |     batch_size=128
15 | elif [ "$#" -eq 3 ]; then
16 |     id=$1
17 |     backbone_name='roberta-base'
18 |     model_name=$2
19 |     task_name=$3
20 |     lora_r=8
21 |     lora_alpha=16
22 |     batch_size=128
23 | elif [ "$#" -eq 7 ]; then
24 |     id=$1
25 |     backbone_name=$2
26 |     model_name=$3
27 |     task_name=$4
28 |     lora_r=$5
29 |     lora_alpha=$6
30 |     batch_size=$7
31 | else
32 |     # Any other argument count would leave every setting unset and create a
33 |     # malformed output directory, so stop before doing any work.
34 |     echo "Usage: $0 [id model_name task_name | id backbone_name model_name task_name lora_r lora_alpha batch_size]" >&2
35 |     exit 1
36 | fi
37 | 
38 | output_dir="output/efficiency_testing/${backbone_name}/${task_name}/${id}/bz${batch_size}/"
39 | 
40 | echo "$output_dir"
41 | mkdir -p "$output_dir"
42 | 
43 | python efficiency_test.py \
44 |     --output_dir "${output_dir}" \
45 |     --task_name ${task_name} \
46 |     --model_name_or_path ${model_name} \
47 |     --do_eval \
48 |     --save_strategy no \
49 |     --evaluation_strategy steps \
50 |     --logging_strategy steps \
51 |     --logging_steps 100 \
52 |     --eval_steps 500 \
53 |     --max_seq_length 128 \
54 |     --per_device_train_batch_size ${batch_size} \
55 |     --per_device_eval_batch_size ${batch_size} \
56 |     --apply_lora \
57 |     --lora_alpha ${lora_alpha} \
58 |     --lora_r ${lora_r} \
59 |     --report_to none | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/scripts/hyperparameter_searching/test_once_prune_step.sh:
--------------------------------------------------------------------------------
 1 | # Grid search: runs run_minus_training.py (roberta-base, MNLI, LoRA, one-shot
 2 | # pruning) for every combination of MAC constraint and pruning start point.
 3 | export CUDA_VISIBLE_DEVICES=2
 4 | 
 5 | # Number of pruning batches. Defined once and used for both the output path and
 6 | # the --pruning_batches flag so the directory name always matches the run.
 7 | pruning_batches=64
 8 | 
 9 | for mac_constraint in 0.4 0.5 0.6 0.7
10 | do
11 |     for steppoint in 0.25 0.5 0.75 1.0
12 |     do
13 |         output_dir="output/roberta_lora_minus_mnli_once_const_warmup_scheduler/step${steppoint}/batchuse${pruning_batches}/mac${mac_constraint}/"
14 |         mkdir -p "$output_dir"
15 | 
16 |         python run_minus_training.py \
17 |             --output_dir "${output_dir}" \
18 |             --task_name mnli \
19 |             --model_name_or_path roberta-base \
20 |             --do_train \
21 |             --do_eval \
22 |             --minus_scheduler \
23 |             --save_strategy no \
24 |             --evaluation_strategy steps \
25 |             --max_seq_length 128 \
26 |             --per_device_train_batch_size 32 \
27 |             --per_device_eval_batch_size 32 \
28 |             --lr_scheduler_type linear \
29 |             --warmup_ratio 0.06 \
30 |             --learning_rate 5e-4 \
31 |             --weight_decay 0.1 \
32 |             --apply_lora \
33 |             --lora_alpha 16 \
34 |             --lora_r 8 \
35 |             --report_to none \
36 |             --pruning_batches ${pruning_batches} \
37 |             --mac_constraint ${mac_constraint} \
38 |             --pruning_scheduler once \
39 |             --pruning_start ${steppoint}
40 |     done
41 | done


--------------------------------------------------------------------------------
/scripts/lora/t5_base_lm_adapt_sst2.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of google/t5-base-lm-adapt on SST-2 with run_minus_training.py.
 2 | # Usage: t5_base_lm_adapt_sst2.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name=google/t5-base-lm-adapt
 5 | task_name=sst2
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=60
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=1e-3
14 |     seed=42
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | # --seed is passed explicitly so the run actually uses the seed recorded in
35 | # output_dir instead of the trainer's own default.
36 | python run_minus_training.py \
37 |     --output_dir "${output_dir}" \
38 |     --task_name ${task_name} \
39 |     --model_name_or_path ${model_name} \
40 |     --do_train \
41 |     --do_eval \
42 |     --save_strategy epoch \
43 |     --evaluation_strategy epoch \
44 |     --logging_strategy epoch \
45 |     --max_seq_length 128 \
46 |     --num_train_epochs ${num_epochs} \
47 |     --per_device_train_batch_size ${batch_size} \
48 |     --per_device_eval_batch_size ${batch_size} \
49 |     --warmup_ratio 0.06 \
50 |     --learning_rate ${learning_rate} \
51 |     --weight_decay 0.1 \
52 |     --seed ${seed} \
53 |     --apply_lora \
54 |     --lora_alpha ${lora_alpha} \
55 |     --lora_r ${lora_r} \
56 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
57 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/t5_base_lm_adapt_mnli.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of google/t5-base-lm-adapt on MNLI with run_minus_training.py.
 2 | # Usage: t5_base_lm_adapt_mnli.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name=google/t5-base-lm-adapt
 5 | task_name=mnli
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=60
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=1e-3
14 |     seed=42
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | # --seed is passed explicitly so the run actually uses the seed recorded in
35 | # output_dir instead of the trainer's own default.
36 | python run_minus_training.py \
37 |     --output_dir "${output_dir}" \
38 |     --task_name ${task_name} \
39 |     --model_name_or_path ${model_name} \
40 |     --do_train \
41 |     --do_eval \
42 |     --save_strategy no \
43 |     --evaluation_strategy epoch \
44 |     --logging_strategy epoch \
45 |     --tf32 True \
46 |     --max_seq_length 128 \
47 |     --num_train_epochs ${num_epochs} \
48 |     --per_device_train_batch_size ${batch_size} \
49 |     --per_device_eval_batch_size ${batch_size} \
50 |     --warmup_ratio 0.06 \
51 |     --learning_rate ${learning_rate} \
52 |     --weight_decay 0.1 \
53 |     --seed ${seed} \
54 |     --apply_lora \
55 |     --lora_alpha ${lora_alpha} \
56 |     --lora_r ${lora_r} \
57 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
58 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/t5_xl_lm_adapt_sst2.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of google/t5-xl-lm-adapt on SST-2 with run_minus_training.py.
 2 | # Usage: t5_xl_lm_adapt_sst2.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name=google/t5-xl-lm-adapt
 5 | task_name=sst2
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=30
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=1e-3
14 |     seed=42
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=eq:0-23,ev:0-23,dq:0-23,dv:0-23,cq:0-23,cv:0-23
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | # --seed is passed explicitly so the run actually uses the seed recorded in
35 | # output_dir instead of the trainer's own default.
36 | python run_minus_training.py \
37 |     --output_dir "${output_dir}" \
38 |     --task_name ${task_name} \
39 |     --model_name_or_path ${model_name} \
40 |     --do_train \
41 |     --do_eval \
42 |     --save_strategy no \
43 |     --evaluation_strategy epoch \
44 |     --logging_strategy epoch \
45 |     --max_seq_length 128 \
46 |     --num_train_epochs ${num_epochs} \
47 |     --per_device_train_batch_size ${batch_size} \
48 |     --per_device_eval_batch_size ${batch_size} \
49 |     --tf32 True \
50 |     --bf16 True \
51 |     --warmup_ratio 0.06 \
52 |     --learning_rate ${learning_rate} \
53 |     --weight_decay 0.1 \
54 |     --seed ${seed} \
55 |     --apply_lora \
56 |     --lora_alpha ${lora_alpha} \
57 |     --lora_r ${lora_r} \
58 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
59 |     --report_to none


--------------------------------------------------------------------------------
/test/test_salience.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | salience = torch.load('output/bert-base-uncased_lora_minus_rte_cubic_gradual_running_fisher_alloc_running_fisher_momentum_mapping_static_teacher_dynamic_cofi_student_distill_tophalf_limited_resizing_nonormalize_correctweight_clippedmoving_correctuncertain_bothsquare_freeteacher/mac0.4/epoch120/bz32/numprune5/paramq:0-11,v:0-11,i:0-11/lora_r8/pruning_start-1/distill_epoch96/first_salience.pt', map_location='cpu')
 4 | vanilla_score = salience['mask_salience']['intermediate_mask'] * salience['mask_uncertainty']['intermediate_mask']
 5 | sorted_score, sorted_idx = vanilla_score.sort(descending=False)
 6 | neuron_tuning_score = torch.cat([salience['grafting_mask_salience']['modules'][i]['intermediate']['output_mask']['s'] * salience['grafting_mask_salience']['modules'][i]['intermediate']['output_mask']['u'] for i in range(12)])
 7 | sorted_tuning_score, sorted_tuning_idx = neuron_tuning_score.sort(descending=False)
 8 | 
 9 | combined_score = vanilla_score * neuron_tuning_score
10 | sorted_combined_score, sorted_combined_idx = combined_score.sort(descending=False)
11 | 
12 | torch.cat([salience['grafting_mask_salience']['modules'][i]['intermediate']['output_mask']['s'] * salience['grafting_mask_salience']['modules'][i]['intermediate']['output_mask']['u'] for i in range(12)]).mean()
13 | 
14 | for i in range(12):
15 |     print
16 | 
17 | for i in range(12):
18 |     print((salience['grafting_mask_salience']['modules'][i]['value']['bottleneck_mask']['s'] * salience['grafting_mask_salience']['modules'][i]['value']['bottleneck_mask']['u']).mean())


--------------------------------------------------------------------------------
/scripts/post_training_sft_prune.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | model_path=$1
12 | mac_constraint=$2
13 | num_batches=$3
14 | if [ "$#" -eq 5 ]; then
15 |     lora_alpha=$5
16 | else
17 |     lora_alpha=16
18 | fi
19 | 
20 | if [ -d $model_path ]; then
21 |     echo "Model path exists"
22 |     output_dir="${model_path}/pruned/constraint_${mac_constraint}/batches_${num_batches}"
23 | else
24 |     echo "Model path does not exist"
25 |     output_dir="llama_output/${model_path}/${task_name}/mt_pruned/constraint_${mac_constraint}/batches_${num_batches}"
26 | fi
27 | 
28 | echo $output_dir
29 | mkdir -p $output_dir
30 | 
31 | python post_training_sft_prune.py \
32 |     --output_dir ${output_dir}\
33 |     --model_name_or_path ${model_path} \
34 |     --task_name alpaca_gpt4 \
35 |     --data_path 'data/sft/alpaca_data_gpt4.json' \
36 |     --do_train \
37 |     --do_eval \
38 |     --model_max_length 512 \
39 |     --per_device_train_batch_size 1 \
40 |     --per_device_eval_batch_size 1 \
41 |     --pruning_batch_size 1 \
42 |     --pruning_batches ${num_batches} \
43 |     --mac_constraint ${mac_constraint} \
44 |     --lora_alpha ${lora_alpha} \


--------------------------------------------------------------------------------
/scripts/lora/bert_base_mnli.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of bert-base-uncased on MNLI with run_minus_training.py.
 2 | # Usage: bert_base_mnli.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name='bert-base-uncased'
 5 | task_name=mnli
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=60
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=2e-4
14 |     seed=128
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=q:0-11,v:0-11
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | # --seed is passed explicitly so the run actually uses the seed recorded in
35 | # output_dir instead of the trainer's own default.
36 | python run_minus_training.py \
37 |     --output_dir "${output_dir}" \
38 |     --task_name ${task_name} \
39 |     --model_name_or_path ${model_name} \
40 |     --do_train \
41 |     --do_eval \
42 |     --save_strategy epoch \
43 |     --evaluation_strategy steps \
44 |     --logging_strategy steps \
45 |     --logging_steps 1000 \
46 |     --log_level info \
47 |     --log_level_replica info \
48 |     --eval_steps 5000 \
49 |     --max_seq_length 128 \
50 |     --num_train_epochs ${num_epochs} \
51 |     --per_device_train_batch_size ${batch_size} \
52 |     --per_device_eval_batch_size ${batch_size} \
53 |     --warmup_ratio 0.06 \
54 |     --learning_rate ${learning_rate} \
55 |     --weight_decay 0.1 \
56 |     --seed ${seed} \
57 |     --apply_lora \
58 |     --lora_alpha ${lora_alpha} \
59 |     --lora_r ${lora_r} \
60 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
61 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/bert_base_sst2.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of bert-base-uncased on SST-2 with run_minus_training.py.
 2 | # Usage: bert_base_sst2.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name='bert-base-uncased'
 5 | task_name=sst2
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=60
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=2e-4
14 |     seed=42
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=q:0-11,v:0-11
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | # --seed is passed explicitly so the run actually uses the seed recorded in
35 | # output_dir instead of the trainer's own default.
36 | python run_minus_training.py \
37 |     --output_dir "${output_dir}" \
38 |     --task_name ${task_name} \
39 |     --model_name_or_path ${model_name} \
40 |     --do_train \
41 |     --do_eval \
42 |     --save_strategy epoch \
43 |     --evaluation_strategy steps \
44 |     --logging_strategy steps \
45 |     --logging_steps 100 \
46 |     --log_level info \
47 |     --log_level_replica info \
48 |     --eval_steps 500 \
49 |     --max_seq_length 128 \
50 |     --num_train_epochs ${num_epochs} \
51 |     --per_device_train_batch_size ${batch_size} \
52 |     --per_device_eval_batch_size ${batch_size} \
53 |     --warmup_ratio 0.06 \
54 |     --learning_rate ${learning_rate} \
55 |     --weight_decay 0.1 \
56 |     --seed ${seed} \
57 |     --apply_lora \
58 |     --lora_alpha ${lora_alpha} \
59 |     --lora_r ${lora_r} \
60 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
61 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/t5_xl_lm_adapt_cnndm.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of google/t5-xl-lm-adapt on cnndm with run_minus_seq2seq_training.py.
 2 | # Usage: t5_xl_lm_adapt_cnndm.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name=google/t5-xl-lm-adapt
 5 | task_name=cnndm
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=10
10 |     batch_size=4
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=1e-3
14 |     seed=42
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=eq:0-23,ev:0-23,dq:0-23,dv:0-23,cq:0-23,cv:0-23
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | # --seed is passed explicitly so the run actually uses the seed recorded in
35 | # output_dir instead of the trainer's own default.
36 | # NOTE(review): assumes run_minus_seq2seq_training.py accepts --seed - confirm.
37 | python run_minus_seq2seq_training.py \
38 |     --output_dir "${output_dir}" \
39 |     --task_name ${task_name} \
40 |     --model_name_or_path ${model_name} \
41 |     --do_train \
42 |     --do_eval \
43 |     --save_strategy no \
44 |     --evaluation_strategy epoch \
45 |     --logging_strategy epoch \
46 |     --max_input_length 512 \
47 |     --max_target_length 128 \
48 |     --num_train_epochs ${num_epochs} \
49 |     --per_device_train_batch_size ${batch_size} \
50 |     --per_device_eval_batch_size ${batch_size} \
51 |     --tf32 True \
52 |     --bf16 True \
53 |     --warmup_ratio 0.06 \
54 |     --learning_rate ${learning_rate} \
55 |     --weight_decay 0.1 \
56 |     --seed ${seed} \
57 |     --apply_lora \
58 |     --lora_alpha ${lora_alpha} \
59 |     --lora_r ${lora_r} \
60 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
61 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/roberta_base_cola.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of roberta-base on CoLA with run_minus_training.py.
 2 | # Usage: roberta_base_cola.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name='roberta-base'
 5 | task_name=cola
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=80
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=2e-4
14 |     seed=128
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=q:0-11,v:0-11
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | python run_minus_training.py \
35 |     --output_dir "${output_dir}" \
36 |     --task_name ${task_name} \
37 |     --model_name_or_path ${model_name} \
38 |     --do_train \
39 |     --do_eval \
40 |     --save_strategy no \
41 |     --evaluation_strategy steps \
42 |     --logging_strategy steps \
43 |     --logging_steps 100 \
44 |     --log_level info \
45 |     --log_level_replica info \
46 |     --eval_steps 500 \
47 |     --max_seq_length 128 \
48 |     --num_train_epochs ${num_epochs} \
49 |     --per_device_train_batch_size ${batch_size} \
50 |     --per_device_eval_batch_size ${batch_size} \
51 |     --warmup_ratio 0.06 \
52 |     --learning_rate ${learning_rate} \
53 |     --weight_decay 0.1 \
54 |     --seed ${seed} \
55 |     --apply_lora \
56 |     --lora_alpha ${lora_alpha} \
57 |     --lora_r ${lora_r} \
58 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
59 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/roberta_base_mrpc.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of roberta-base on MRPC with run_minus_training.py.
 2 | # Usage: roberta_base_mrpc.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name='roberta-base'
 5 | task_name=mrpc
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=80
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=2e-4
14 |     seed=128
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=q:0-11,v:0-11
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | python run_minus_training.py \
35 |     --output_dir "${output_dir}" \
36 |     --task_name ${task_name} \
37 |     --model_name_or_path ${model_name} \
38 |     --do_train \
39 |     --do_eval \
40 |     --save_strategy no \
41 |     --evaluation_strategy steps \
42 |     --logging_strategy steps \
43 |     --logging_steps 100 \
44 |     --log_level info \
45 |     --log_level_replica info \
46 |     --eval_steps 500 \
47 |     --max_seq_length 128 \
48 |     --num_train_epochs ${num_epochs} \
49 |     --per_device_train_batch_size ${batch_size} \
50 |     --per_device_eval_batch_size ${batch_size} \
51 |     --warmup_ratio 0.06 \
52 |     --learning_rate ${learning_rate} \
53 |     --weight_decay 0.1 \
54 |     --seed ${seed} \
55 |     --apply_lora \
56 |     --lora_alpha ${lora_alpha} \
57 |     --lora_r ${lora_r} \
58 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
59 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/roberta_base_rte.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of roberta-base on RTE with run_minus_training.py.
 2 | # Usage: roberta_base_rte.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name='roberta-base'
 5 | task_name=rte
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=80
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=2e-4
14 |     seed=128
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=q:0-11,v:0-11
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | python run_minus_training.py \
35 |     --output_dir "${output_dir}" \
36 |     --task_name ${task_name} \
37 |     --model_name_or_path ${model_name} \
38 |     --do_train \
39 |     --do_eval \
40 |     --save_strategy no \
41 |     --evaluation_strategy steps \
42 |     --logging_strategy steps \
43 |     --logging_steps 100 \
44 |     --log_level info \
45 |     --log_level_replica info \
46 |     --eval_steps 500 \
47 |     --max_seq_length 128 \
48 |     --num_train_epochs ${num_epochs} \
49 |     --per_device_train_batch_size ${batch_size} \
50 |     --per_device_eval_batch_size ${batch_size} \
51 |     --warmup_ratio 0.06 \
52 |     --learning_rate ${learning_rate} \
53 |     --weight_decay 0.1 \
54 |     --seed ${seed} \
55 |     --apply_lora \
56 |     --lora_alpha ${lora_alpha} \
57 |     --lora_r ${lora_r} \
58 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
59 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/roberta_base_stsb.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of roberta-base on STS-B with run_minus_training.py.
 2 | # Usage: roberta_base_stsb.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name='roberta-base'
 5 | task_name=stsb
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=80
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=2e-4
14 |     seed=128
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=q:0-11,v:0-11
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | python run_minus_training.py \
35 |     --output_dir "${output_dir}" \
36 |     --task_name ${task_name} \
37 |     --model_name_or_path ${model_name} \
38 |     --do_train \
39 |     --do_eval \
40 |     --save_strategy no \
41 |     --evaluation_strategy steps \
42 |     --logging_strategy steps \
43 |     --logging_steps 100 \
44 |     --log_level info \
45 |     --log_level_replica info \
46 |     --eval_steps 500 \
47 |     --max_seq_length 128 \
48 |     --num_train_epochs ${num_epochs} \
49 |     --per_device_train_batch_size ${batch_size} \
50 |     --per_device_eval_batch_size ${batch_size} \
51 |     --warmup_ratio 0.06 \
52 |     --learning_rate ${learning_rate} \
53 |     --weight_decay 0.1 \
54 |     --seed ${seed} \
55 |     --apply_lora \
56 |     --lora_alpha ${lora_alpha} \
57 |     --lora_r ${lora_r} \
58 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
59 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/roberta_base_mnli.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of roberta-base on MNLI with run_minus_training.py.
 2 | # Usage: roberta_base_mnli.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name='roberta-base'
 5 | task_name=mnli
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=60
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=2e-4
14 |     seed=128
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave the hyperparameters unset and
24 |     # create a malformed output directory, so stop before doing any work.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | teacher_param_tuning_config=q:0-11,v:0-11
30 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
31 | echo "$output_dir"
32 | mkdir -p "$output_dir"
33 | 
34 | python run_minus_training.py \
35 |     --output_dir "${output_dir}" \
36 |     --task_name ${task_name} \
37 |     --model_name_or_path ${model_name} \
38 |     --do_train \
39 |     --do_eval \
40 |     --save_strategy epoch \
41 |     --evaluation_strategy steps \
42 |     --logging_strategy steps \
43 |     --logging_steps 1000 \
44 |     --log_level info \
45 |     --log_level_replica info \
46 |     --eval_steps 5000 \
47 |     --max_seq_length 128 \
48 |     --num_train_epochs ${num_epochs} \
49 |     --per_device_train_batch_size ${batch_size} \
50 |     --per_device_eval_batch_size ${batch_size} \
51 |     --warmup_ratio 0.06 \
52 |     --learning_rate ${learning_rate} \
53 |     --weight_decay 0.1 \
54 |     --seed ${seed} \
55 |     --apply_lora \
56 |     --lora_alpha ${lora_alpha} \
57 |     --lora_r ${lora_r} \
58 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
59 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/roberta_base_squad.sh:
--------------------------------------------------------------------------------
 1 | # LoRA training of roberta-base on SQuAD with run_minus_squad_training.py.
 2 | # Usage: roberta_base_squad.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]
 3 | # Pass either no arguments (defaults below) or all six.
 4 | model_name='roberta-base'
 5 | # task_name and adapter_type are only used to build the output path.
 6 | task_name=squad
 7 | adapter_type=lora
 8 | 
 9 | if [ "$#" -eq 0 ]; then
10 |     num_epochs=60
11 |     batch_size=32
12 |     lora_r=8
13 |     lora_alpha=16
14 |     learning_rate=2e-4
15 |     seed=128
16 | elif [ "$#" -eq 6 ]; then
17 |     num_epochs=$1
18 |     batch_size=$2
19 |     lora_r=$3
20 |     lora_alpha=$4
21 |     learning_rate=$5
22 |     seed=$6
23 | else
24 |     # Any other argument count would leave the hyperparameters unset and
25 |     # create a malformed output directory, so stop before doing any work.
26 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
27 |     exit 1
28 | fi
29 | 
30 | teacher_param_tuning_config=q:0-11,v:0-11
31 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
32 | echo "$output_dir"
33 | mkdir -p "$output_dir"
34 | 
35 | python run_minus_squad_training.py \
36 |     --output_dir "${output_dir}" \
37 |     --model_name_or_path ${model_name} \
38 |     --do_train \
39 |     --do_eval \
40 |     --save_strategy epoch \
41 |     --evaluation_strategy steps \
42 |     --logging_strategy steps \
43 |     --logging_steps 100 \
44 |     --log_level info \
45 |     --log_level_replica info \
46 |     --eval_steps 500 \
47 |     --max_seq_length 384 \
48 |     --doc_stride 128 \
49 |     --num_train_epochs ${num_epochs} \
50 |     --per_device_train_batch_size ${batch_size} \
51 |     --per_device_eval_batch_size ${batch_size} \
52 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
53 |     --warmup_ratio 0.06 \
54 |     --learning_rate ${learning_rate} \
55 |     --weight_decay 0.1 \
56 |     --seed ${seed} \
57 |     --apply_lora \
58 |     --lora_alpha ${lora_alpha} \
59 |     --lora_r ${lora_r} \
60 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/roberta_base_sst2.sh:
--------------------------------------------------------------------------------
 1 | # LoRA fine-tuning of roberta-base on SST-2 via run_minus_training.py.
 2 | # Usage: bash roberta_base_sst2.sh [NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED]
 3 | # With no arguments the defaults below are used; otherwise all six are required.
 4 | model_name='roberta-base'
 5 | task_name=sst2
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=60
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=2e-4
14 |     seed=128
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave every hyperparameter unset and
24 |     # produce a malformed output path and command line, so stop here.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | # Forwarded verbatim as --teacher_param_tuning_config and embedded in output_dir.
30 | # NOTE(review): presumably selects query/value modules in layers 0-11 - confirm.
31 | teacher_param_tuning_config=q:0-11,v:0-11
32 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
33 | echo "$output_dir"
34 | mkdir -p "$output_dir"
35 | 
36 | python run_minus_training.py \
37 |     --output_dir "${output_dir}" \
38 |     --task_name "${task_name}" \
39 |     --model_name_or_path "${model_name}" \
40 |     --do_train \
41 |     --do_eval \
42 |     --save_strategy epoch \
43 |     --evaluation_strategy steps \
44 |     --logging_strategy steps \
45 |     --logging_steps 100 \
46 |     --log_level info \
47 |     --log_level_replica info \
48 |     --eval_steps 500 \
49 |     --max_seq_length 128 \
50 |     --num_train_epochs "${num_epochs}" \
51 |     --per_device_train_batch_size "${batch_size}" \
52 |     --per_device_eval_batch_size "${batch_size}" \
53 |     --warmup_ratio 0.06 \
54 |     --learning_rate "${learning_rate}" \
55 |     --weight_decay 0.1 \
56 |     --seed "${seed}" \
57 |     --apply_lora \
58 |     --lora_alpha "${lora_alpha}" \
59 |     --lora_r "${lora_r}" \
60 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
61 |     --report_to none


--------------------------------------------------------------------------------
/scripts/lora/t5_base_xsum.sh:
--------------------------------------------------------------------------------
 1 | # LoRA fine-tuning of t5-base on XSum via run_minus_seq2seq_training.py.
 2 | # Usage: bash t5_base_xsum.sh [NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED]
 3 | # With no arguments the defaults below are used; otherwise all six are required.
 4 | model_name=t5-base
 5 | task_name=xsum
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=15
10 |     batch_size=16
11 |     lora_r=102
12 |     lora_alpha=408
13 |     learning_rate=1e-4
14 |     seed=42
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave every hyperparameter unset and
24 |     # produce a malformed output path and command line, so stop here.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | # Forwarded verbatim as --teacher_param_tuning_config (not part of output_dir here).
30 | # NOTE(review): presumably encoder/decoder/cross-attention q and v in layers 0-11 - confirm.
31 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
32 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
33 | echo "$output_dir"
34 | mkdir -p "$output_dir"
35 | 
36 | # The seed is part of output_dir, so it is passed to the trainer as well.
37 | # NOTE(review): assumes the script accepts --seed (a Hugging Face TrainingArguments field) - confirm.
38 | python run_minus_seq2seq_training.py \
39 |     --output_dir "${output_dir}" \
40 |     --task_name "${task_name}" \
41 |     --model_name_or_path "${model_name}" \
42 |     --do_train \
43 |     --do_eval \
44 |     --save_strategy epoch \
45 |     --evaluation_strategy steps \
46 |     --logging_strategy steps \
47 |     --logging_steps 500 \
48 |     --eval_steps 2000 \
49 |     --max_input_length 512 \
50 |     --max_target_length 128 \
51 |     --num_train_epochs "${num_epochs}" \
52 |     --per_device_train_batch_size "${batch_size}" \
53 |     --per_device_eval_batch_size "${batch_size}" \
54 |     --tf32 True \
55 |     --fp16 True \
56 |     --warmup_ratio 0.06 \
57 |     --learning_rate "${learning_rate}" \
58 |     --weight_decay 0.01 \
59 |     --seed "${seed}" \
60 |     --apply_lora \
61 |     --lora_alpha "${lora_alpha}" \
62 |     --lora_r "${lora_r}" \
63 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
64 |     --report_to none | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/scripts/lora/t5_base_lm_adapt_cnndm.sh:
--------------------------------------------------------------------------------
 1 | # LoRA fine-tuning of google/t5-base-lm-adapt on CNN/DM via run_minus_seq2seq_training.py.
 2 | # Usage: bash t5_base_lm_adapt_cnndm.sh [NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED]
 3 | # With no arguments the defaults below are used; otherwise all six are required.
 4 | model_name=google/t5-base-lm-adapt
 5 | task_name=cnndm
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=6
10 |     batch_size=16
11 |     lora_r=102
12 |     lora_alpha=408
13 |     learning_rate=5e-5
14 |     seed=42
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave every hyperparameter unset and
24 |     # produce a malformed output path and command line, so stop here.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | # Forwarded verbatim as --teacher_param_tuning_config (not part of output_dir here).
30 | # NOTE(review): presumably encoder/decoder/cross-attention q and v in layers 0-11 - confirm.
31 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
32 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
33 | echo "$output_dir"
34 | mkdir -p "$output_dir"
35 | 
36 | # The seed is part of output_dir, so it is passed to the trainer as well.
37 | # NOTE(review): assumes the script accepts --seed (a Hugging Face TrainingArguments field) - confirm.
38 | python run_minus_seq2seq_training.py \
39 |     --output_dir "${output_dir}" \
40 |     --task_name "${task_name}" \
41 |     --model_name_or_path "${model_name}" \
42 |     --do_train \
43 |     --do_eval \
44 |     --save_strategy epoch \
45 |     --evaluation_strategy steps \
46 |     --logging_strategy steps \
47 |     --logging_steps 500 \
48 |     --eval_steps 2000 \
49 |     --max_input_length 512 \
50 |     --max_target_length 128 \
51 |     --num_train_epochs "${num_epochs}" \
52 |     --per_device_train_batch_size "${batch_size}" \
53 |     --per_device_eval_batch_size "${batch_size}" \
54 |     --tf32 True \
55 |     --fp16 True \
56 |     --warmup_ratio 0.06 \
57 |     --learning_rate "${learning_rate}" \
58 |     --weight_decay 0.01 \
59 |     --seed "${seed}" \
60 |     --apply_lora \
61 |     --lora_alpha "${lora_alpha}" \
62 |     --lora_r "${lora_r}" \
63 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
64 |     --report_to none | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/scripts/lora/roberta_base_squadv2.sh:
--------------------------------------------------------------------------------
 1 | # LoRA fine-tuning of roberta-base on SQuAD v2 via run_minus_squad_training.py.
 2 | # Usage: bash roberta_base_squadv2.sh [NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED]
 3 | # With no arguments the defaults below are used; otherwise all six are required.
 4 | model_name='roberta-base'
 5 | task_name=squad_v2
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=60
10 |     batch_size=32
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=2e-4
14 |     seed=128
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave every hyperparameter unset and
24 |     # produce a malformed output path and command line, so stop here.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | # Forwarded verbatim as --teacher_param_tuning_config and embedded in output_dir.
30 | # NOTE(review): presumably selects query/value modules in layers 0-11 - confirm.
31 | teacher_param_tuning_config=q:0-11,v:0-11
32 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
33 | echo "$output_dir"
34 | mkdir -p "$output_dir"
35 | 
36 | python run_minus_squad_training.py \
37 |     --output_dir "${output_dir}" \
38 |     --model_name_or_path "${model_name}" \
39 |     --do_train \
40 |     --do_eval \
41 |     --save_strategy epoch \
42 |     --evaluation_strategy steps \
43 |     --logging_strategy steps \
44 |     --logging_steps 1000 \
45 |     --log_level info \
46 |     --log_level_replica info \
47 |     --eval_steps 5000 \
48 |     --max_seq_length 384 \
49 |     --doc_stride 128 \
50 |     --version_2_with_negative \
51 |     --num_train_epochs "${num_epochs}" \
52 |     --per_device_train_batch_size "${batch_size}" \
53 |     --per_device_eval_batch_size "${batch_size}" \
54 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
55 |     --warmup_ratio 0.06 \
56 |     --learning_rate "${learning_rate}" \
57 |     --weight_decay 0.1 \
58 |     --seed "${seed}" \
59 |     --apply_lora \
60 |     --lora_alpha "${lora_alpha}" \
61 |     --lora_r "${lora_r}" \
62 |     --report_to none


--------------------------------------------------------------------------------
/test/test_rewarmup_lr_scheduling.py:
--------------------------------------------------------------------------------
 1 | import seaborn as sns
 2 | from matplotlib import pyplot as plt
 3 | 
 4 | num_epochs = 20
 5 | epoch_size = 3068
 6 | reset_epochs = [2, 5, 8, 11, 14]
 7 | reset_steps = [epoch * epoch_size for epoch in reset_epochs]
 8 | num_training_steps = num_epochs * epoch_size
 9 | num_warmup_steps = 0.06 * num_training_steps
10 | 
11 | 
12 | if __name__ == '__main__':
13 |     steppoints = []
14 |     if not reset_steps[0] == 0:
15 |         reset_steps = [0] + reset_steps
16 |     warmup_starts = set(reset_steps)
17 |     for step in reset_steps:
18 |         steppoints.append(step)
19 |         steppoints.append(step + num_warmup_steps)
20 |     steppoints.append(num_training_steps)
21 | 
22 |     # Determine which range an integer belongs to using binary search
23 |     def find_range(n):
24 |         for idx, step in enumerate(steppoints):
25 |             if step <= n < steppoints[idx + 1]:
26 |                 if step in warmup_starts:
27 |                     return step, steppoints[idx + 1], True # is warmup
28 |                 else:
29 |                     return step, steppoints[idx + 1], False # is not warmup
30 | 
31 |     def lr_lambda(current_step: int):
32 |         range_start, range_end, is_warmup = find_range(current_step)
33 |         if is_warmup:
34 |             return float(current_step - range_start) / float(max(1, range_end - range_start))
35 |         else:
36 |             return max(
37 |                 0.0, float(range_end - current_step) / float(max(1, range_end - range_start))
38 |             )
39 |     
40 |     steps = list(range(num_training_steps))
41 |     lrs = [lr_lambda(step) for step in steps]
42 |     sns.lineplot(x=steps, y=lrs)
43 |     plt.savefig('lr_test.png')


--------------------------------------------------------------------------------
/scripts/lora/llama_13b_alpaca_cleaned.sh:
--------------------------------------------------------------------------------
 1 | # LoRA supervised fine-tuning of huggyllama/llama-13b via run_llama_sft.py.
 2 | # Usage: bash llama_13b_alpaca_cleaned.sh [NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED]
 3 | # With no arguments the defaults below are used; otherwise all six are required.
 4 | # NOTE(review): task_name is alpaca_gpt4 while --data_path points at
 5 | # alpaca_data_cleaned.json, so output_dir is labelled alpaca_gpt4 - confirm intended.
 6 | model_name='huggyllama/llama-13b'
 7 | task_name=alpaca_gpt4
 8 | adapter_type=lora
 9 | 
10 | if [ "$#" -eq 0 ]; then
11 |     num_epochs=2
12 |     batch_size=4
13 |     lora_r=8
14 |     lora_alpha=16
15 |     learning_rate=1e-4
16 |     seed=42
17 | elif [ "$#" -eq 6 ]; then
18 |     num_epochs=$1
19 |     batch_size=$2
20 |     lora_r=$3
21 |     lora_alpha=$4
22 |     learning_rate=$5
23 |     seed=$6
24 | else
25 |     # Any other argument count would leave every hyperparameter unset and
26 |     # produce a malformed output path and command line, so stop here.
27 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
28 |     exit 1
29 | fi
30 | 
31 | # Forwarded verbatim as --teacher_param_tuning_config and embedded in output_dir.
32 | # NOTE(review): presumably selects decoder query/value modules in layers 0-39 - confirm.
33 | teacher_param_tuning_config=dq:0-39,dv:0-39
34 | output_dir="llama_output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/teacher_${teacher_param_tuning_config}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
35 | echo "$output_dir"
36 | mkdir -p "$output_dir"
37 | 
38 | # The seed is part of output_dir, so it is passed to the trainer as well.
39 | # NOTE(review): assumes the script accepts --seed (a Hugging Face TrainingArguments field) - confirm.
40 | python run_llama_sft.py \
41 |     --output_dir "${output_dir}" \
42 |     --task_name "${task_name}" \
43 |     --model_name_or_path "${model_name}" \
44 |     --bf16 True \
45 |     --data_path 'data/sft/alpaca_data_cleaned.json' \
46 |     --do_train \
47 |     --do_eval \
48 |     --save_strategy steps \
49 |     --save_steps 2000 \
50 |     --save_total_limit 1 \
51 |     --evaluation_strategy steps \
52 |     --logging_strategy steps \
53 |     --logging_steps 100 \
54 |     --eval_steps 500 \
55 |     --model_max_length 512 \
56 |     --num_train_epochs "${num_epochs}" \
57 |     --per_device_train_batch_size "${batch_size}" \
58 |     --per_device_eval_batch_size "${batch_size}" \
59 |     --gradient_accumulation_steps 8 \
60 |     --warmup_ratio 0.03 \
61 |     --learning_rate "${learning_rate}" \
62 |     --weight_decay 0. \
63 |     --seed "${seed}" \
64 |     --lr_scheduler_type cosine \
65 |     --tf32 True \
66 |     --apply_lora \
67 |     --lora_alpha "${lora_alpha}" \
68 |     --lora_r "${lora_r}" \
69 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
70 |     --report_to none | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/utils/fisher_utils/efficiency/latency.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | 
 5 | @torch.no_grad()
 6 | def lookup_latency(lut, mask):
 7 |     n = int(torch.sum(mask != 0))
 8 |     if n == 0:
 9 |         return 0
10 |     else:
11 |         return lut[n - 1]
12 | 
13 | 
14 | def estimate_latency(mha_lut, ffn_lut, head_mask, neuron_mask):
15 |     num_hidden_layers = head_mask.shape[0]
16 |     total = 0
17 |     for i in range(num_hidden_layers):
18 |         total += lookup_latency(mha_lut, head_mask[i])
19 |         total += lookup_latency(ffn_lut, neuron_mask[i])
20 |     return total
21 | 
22 | 
23 | class PiecewiseLinearLatency:
24 | 
25 |     def __init__(self, threshold=None, c=None, slope=None):
26 |         self.threshold = threshold
27 |         self.c = c
28 |         self.slope = slope
29 | 
30 | 
31 | def fit_latency_fn(lut):
32 |     lut = np.asarray(lut)
33 |     latency_fn = PiecewiseLinearLatency()
34 | 
35 |     min_error = 10000
36 |     for threshold in range(1, len(lut) + 1):
37 |         c = lut[:threshold].sum() / threshold
38 |         y = lut[threshold:] - c
39 |         x = np.arange(1, len(y) + 1)
40 | 
41 |         if threshold == len(lut):
42 |             slope = 0
43 |         else:
44 |             slope = (x * y).sum() / (x * x).sum()
45 |         slope = 0 if slope < 0 else slope
46 | 
47 |         approximated = [c] * threshold
48 |         for i in range(1, len(lut) - threshold + 1):
49 |             approximated.append(slope * i + c)
50 |         approximated = np.asarray(approximated)
51 | 
52 |         squared_error = ((lut - approximated) * (lut - approximated)).sum()
53 |         if squared_error < min_error:
54 |             min_error = squared_error
55 |             latency_fn.threshold = threshold
56 |             latency_fn.c = c
57 |             latency_fn.slope = slope
58 | 
59 |     return latency_fn
60 | 


--------------------------------------------------------------------------------
/scripts/lora/llama_2_7b_alpaca_gpt4.sh:
--------------------------------------------------------------------------------
 1 | # LoRA supervised fine-tuning of meta-llama/Llama-2-7b-hf on alpaca_data_gpt4.json via run_llama_sft.py.
 2 | # Usage: bash llama_2_7b_alpaca_gpt4.sh [NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED]
 3 | # With no arguments the defaults below are used; otherwise all six are required.
 4 | model_name='meta-llama/Llama-2-7b-hf'
 5 | task_name=alpaca_gpt4
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=2
10 |     batch_size=4
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=1e-4
14 |     seed=42
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave every hyperparameter unset and
24 |     # produce a malformed output path and command line, so stop here.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | # Forwarded verbatim as --teacher_param_tuning_config and embedded in output_dir.
30 | # NOTE(review): presumably selects decoder q/v/i0 modules in layers 0-31 - confirm.
31 | teacher_param_tuning_config=dq:0-31,dv:0-31,di0:0-31
32 | output_dir="llama_output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/teacher_${teacher_param_tuning_config}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
33 | echo "$output_dir"
34 | mkdir -p "$output_dir"
35 | 
36 | # The seed is part of output_dir, so it is passed to the trainer as well.
37 | # NOTE(review): assumes the script accepts --seed (a Hugging Face TrainingArguments field) - confirm.
38 | python run_llama_sft.py \
39 |     --output_dir "${output_dir}" \
40 |     --task_name "${task_name}" \
41 |     --model_name_or_path "${model_name}" \
42 |     --bf16 True \
43 |     --data_path 'data/sft/alpaca_data_gpt4.json' \
44 |     --do_train \
45 |     --do_eval \
46 |     --save_strategy steps \
47 |     --save_steps 2000 \
48 |     --save_total_limit 1 \
49 |     --evaluation_strategy steps \
50 |     --logging_strategy steps \
51 |     --logging_steps 100 \
52 |     --eval_steps 500 \
53 |     --model_max_length 512 \
54 |     --num_train_epochs "${num_epochs}" \
55 |     --per_device_train_batch_size "${batch_size}" \
56 |     --per_device_eval_batch_size "${batch_size}" \
57 |     --gradient_accumulation_steps 8 \
58 |     --warmup_ratio 0.03 \
59 |     --learning_rate "${learning_rate}" \
60 |     --weight_decay 0. \
61 |     --seed "${seed}" \
62 |     --lr_scheduler_type cosine \
63 |     --tf32 True \
64 |     --apply_lora \
65 |     --lora_alpha "${lora_alpha}" \
66 |     --lora_r "${lora_r}" \
67 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
68 |     --report_to none | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/scripts/lora/llama_7b_alpaca_cleaned.sh:
--------------------------------------------------------------------------------
 1 | # LoRA supervised fine-tuning of huggyllama/llama-7b via run_llama_sft.py.
 2 | # Usage: bash llama_7b_alpaca_cleaned.sh [NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED]
 3 | # With no arguments the defaults below are used; otherwise all six are required.
 4 | # NOTE(review): task_name is alpaca_gpt4 while --data_path points at
 5 | # alpaca_data_cleaned.json, so output_dir is labelled alpaca_gpt4 - confirm intended.
 6 | model_name='huggyllama/llama-7b'
 7 | task_name=alpaca_gpt4
 8 | adapter_type=lora
 9 | 
10 | if [ "$#" -eq 0 ]; then
11 |     num_epochs=2
12 |     batch_size=4
13 |     lora_r=8
14 |     lora_alpha=16
15 |     learning_rate=1e-4
16 |     seed=42
17 | elif [ "$#" -eq 6 ]; then
18 |     num_epochs=$1
19 |     batch_size=$2
20 |     lora_r=$3
21 |     lora_alpha=$4
22 |     learning_rate=$5
23 |     seed=$6
24 | else
25 |     # Any other argument count would leave every hyperparameter unset and
26 |     # produce a malformed output path and command line, so stop here.
27 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
28 |     exit 1
29 | fi
30 | 
31 | # Forwarded verbatim as --teacher_param_tuning_config and embedded in output_dir.
32 | # NOTE(review): presumably selects decoder q/v/i0 modules in layers 0-31 - confirm.
33 | teacher_param_tuning_config=dq:0-31,dv:0-31,di0:0-31
34 | output_dir="llama_output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/teacher_${teacher_param_tuning_config}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
35 | echo "$output_dir"
36 | mkdir -p "$output_dir"
37 | 
38 | # The seed is part of output_dir, so it is passed to the trainer as well.
39 | # NOTE(review): assumes the script accepts --seed (a Hugging Face TrainingArguments field) - confirm.
40 | python run_llama_sft.py \
41 |     --output_dir "${output_dir}" \
42 |     --task_name "${task_name}" \
43 |     --model_name_or_path "${model_name}" \
44 |     --bf16 True \
45 |     --data_path 'data/sft/alpaca_data_cleaned.json' \
46 |     --do_train \
47 |     --do_eval \
48 |     --save_strategy steps \
49 |     --save_steps 2000 \
50 |     --save_total_limit 1 \
51 |     --evaluation_strategy steps \
52 |     --logging_strategy steps \
53 |     --logging_steps 100 \
54 |     --eval_steps 500 \
55 |     --model_max_length 512 \
56 |     --num_train_epochs "${num_epochs}" \
57 |     --per_device_train_batch_size "${batch_size}" \
58 |     --per_device_eval_batch_size "${batch_size}" \
59 |     --gradient_accumulation_steps 8 \
60 |     --warmup_ratio 0.03 \
61 |     --learning_rate "${learning_rate}" \
62 |     --weight_decay 0. \
63 |     --seed "${seed}" \
64 |     --lr_scheduler_type cosine \
65 |     --tf32 True \
66 |     --apply_lora \
67 |     --lora_alpha "${lora_alpha}" \
68 |     --lora_r "${lora_r}" \
69 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
70 |     --report_to none | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/scripts/lora/llama_2_13b_alpaca_gpt4.sh:
--------------------------------------------------------------------------------
 1 | # LoRA supervised fine-tuning of meta-llama/Llama-2-13b-hf on alpaca_data_gpt4.json via run_llama_sft.py.
 2 | # Usage: bash llama_2_13b_alpaca_gpt4.sh [NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED]
 3 | # With no arguments the defaults below are used; otherwise all six are required.
 4 | model_name='meta-llama/Llama-2-13b-hf'
 5 | task_name=alpaca_gpt4
 6 | adapter_type=lora
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     num_epochs=2
10 |     batch_size=4
11 |     lora_r=8
12 |     lora_alpha=16
13 |     learning_rate=1e-4
14 |     seed=42
15 | elif [ "$#" -eq 6 ]; then
16 |     num_epochs=$1
17 |     batch_size=$2
18 |     lora_r=$3
19 |     lora_alpha=$4
20 |     learning_rate=$5
21 |     seed=$6
22 | else
23 |     # Any other argument count would leave every hyperparameter unset and
24 |     # produce a malformed output path and command line, so stop here.
25 |     echo "Usage: $0 [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
26 |     exit 1
27 | fi
28 | 
29 | # Forwarded verbatim as --teacher_param_tuning_config and embedded in output_dir.
30 | # NOTE(review): presumably selects decoder q/v/i0 modules in layers 0-39 - confirm.
31 | teacher_param_tuning_config=dq:0-39,dv:0-39,di0:0-39
32 | output_dir="llama_output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/teacher_${teacher_param_tuning_config}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
33 | echo "$output_dir"
34 | mkdir -p "$output_dir"
35 | 
36 | # The seed is part of output_dir, so it is passed to the trainer as well.
37 | # NOTE(review): assumes the script accepts --seed (a Hugging Face TrainingArguments field) - confirm.
38 | python run_llama_sft.py \
39 |     --output_dir "${output_dir}" \
40 |     --task_name "${task_name}" \
41 |     --model_name_or_path "${model_name}" \
42 |     --bf16 True \
43 |     --data_path 'data/sft/alpaca_data_gpt4.json' \
44 |     --do_train \
45 |     --do_eval \
46 |     --save_strategy steps \
47 |     --save_steps 2000 \
48 |     --save_total_limit 1 \
49 |     --evaluation_strategy steps \
50 |     --logging_strategy steps \
51 |     --logging_steps 100 \
52 |     --eval_steps 500 \
53 |     --model_max_length 512 \
54 |     --num_train_epochs "${num_epochs}" \
55 |     --per_device_train_batch_size "${batch_size}" \
56 |     --per_device_eval_batch_size "${batch_size}" \
57 |     --gradient_accumulation_steps 8 \
58 |     --warmup_ratio 0.03 \
59 |     --learning_rate "${learning_rate}" \
60 |     --weight_decay 0. \
61 |     --seed "${seed}" \
62 |     --lr_scheduler_type cosine \
63 |     --tf32 True \
64 |     --apply_lora \
65 |     --lora_alpha "${lora_alpha}" \
66 |     --lora_r "${lora_r}" \
67 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
68 |     --report_to none | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/scripts/efficiency_testing_llama.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Runs efficiency_test_llama.py on alpaca_data_gpt4.json and tees the output to log.txt.
12 | # Usage:
13 | #   (no arguments)                                           all defaults below
14 | #   ID MODEL_NAME                                            defaults for the rest
15 | #   ID BACKBONE_NAME MODEL_NAME LORA_R LORA_ALPHA BATCH_SIZE
16 | # NOTE(review): the default backbone/model is roberta-base although the script targets Llama - confirm.
17 | if [ "$#" -eq 0 ]; then
18 |     id=default
19 |     backbone_name='roberta-base'
20 |     model_name='roberta-base'
21 |     lora_r=8
22 |     lora_alpha=16
23 |     batch_size=4
24 | elif [ "$#" -eq 2 ]; then
25 |     id=$1
26 |     backbone_name='roberta-base'
27 |     model_name=$2
28 |     lora_r=8
29 |     lora_alpha=16
30 |     batch_size=4
31 | elif [ "$#" -eq 6 ]; then
32 |     id=$1
33 |     backbone_name=$2
34 |     model_name=$3
35 |     lora_r=$4
36 |     lora_alpha=$5
37 |     batch_size=$6
38 | else
39 |     # Any other argument count would leave every setting unset and produce a
40 |     # malformed output path and command line, so stop here.
41 |     echo "Usage: $0 [id model_name | id backbone_name model_name lora_r lora_alpha batch_size]" >&2
42 |     exit 1
43 | fi
44 | 
45 | task_name=alpaca_gpt4
46 | output_dir="output/efficiency_testing/${backbone_name}/${task_name}/${id}/bz${batch_size}/"
47 | 
48 | echo "$output_dir"
49 | mkdir -p "$output_dir"
50 | 
51 | python efficiency_test_llama.py \
52 |     --output_dir "${output_dir}" \
53 |     --task_name "${task_name}" \
54 |     --model_name_or_path "${model_name}" \
55 |     --bf16 True \
56 |     --tf32 True \
57 |     --data_path 'data/sft/alpaca_data_gpt4.json' \
58 |     --do_eval \
59 |     --save_strategy no \
60 |     --evaluation_strategy steps \
61 |     --logging_strategy steps \
62 |     --logging_steps 100 \
63 |     --eval_steps 500 \
64 |     --model_max_length 512 \
65 |     --per_device_train_batch_size "${batch_size}" \
66 |     --per_device_eval_batch_size "${batch_size}" \
67 |     --apply_lora \
68 |     --lora_alpha "${lora_alpha}" \
69 |     --lora_r "${lora_r}" \
70 |     --report_to none | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/models/modeling_outputs.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from dataclasses import dataclass
 4 | from typing import Optional, Tuple
 5 | 
 6 | from transformers.file_utils import ModelOutput
 7 | from transformers.modeling_outputs import SequenceClassifierOutput, QuestionAnsweringModelOutput, BaseModelOutputWithPastAndCrossAttentions
 8 | 
 9 | 
10 | @dataclass
11 | class NewQuestionAnsweringModelOutput(QuestionAnsweringModelOutput):
12 |     masked_loss: Optional[torch.FloatTensor] = None
13 |     masked_start_logits: Optional[torch.FloatTensor] = None
14 |     masked_end_logits: Optional[torch.FloatTensor] = None
15 |     masked_states: Optional[Tuple[torch.FloatTensor]] = None
16 |     
17 | 
18 | @dataclass
19 | class NewBaseModelOutputWithPooling(ModelOutput):
20 |     last_hidden_state: torch.FloatTensor = None
21 |     pooler_output: torch.FloatTensor = None
22 |     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
23 |     attentions: Optional[Tuple[torch.FloatTensor]] = None
24 |     attention_layers: Optional[Tuple[torch.FloatTensor]] = None
25 |     masked_states: Optional[Tuple[torch.FloatTensor]] = None
26 |     masked_pooler_output: Optional[torch.FloatTensor] = None
27 | 
28 | @dataclass
29 | class NewBaseModelOutput(ModelOutput):
30 |     last_hidden_state: torch.FloatTensor = None
31 |     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
32 |     attentions: Optional[Tuple[torch.FloatTensor]] = None
33 |     attention_layers: Optional[Tuple[torch.FloatTensor]] = None
34 |     masked_states: Optional[Tuple[torch.FloatTensor]] = None
35 |     
36 | @dataclass 
37 | class NewSequenceClassifierOutput(SequenceClassifierOutput):
38 |     masked_states: Optional[Tuple[torch.FloatTensor]] = None
39 |     masked_logits: Optional[torch.FloatTensor] = None
40 |     masked_loss: Optional[torch.FloatTensor] = None
41 |     
42 | class AdaPBaseModelOutputWithPastAndCrossAttentions(BaseModelOutputWithPastAndCrossAttentions):
43 |     masked_hidden_states: torch.FloatTensor = None


--------------------------------------------------------------------------------
/scripts/eval/mmlu.sh:
--------------------------------------------------------------------------------
 1 | # # export CUDA_VISIBLE_DEVICES=0
 2 | # 5-shot MMLU evaluation. Usage: bash mmlu.sh MODEL_DIR (the log is written inside MODEL_DIR).
 3 | model_name_or_path=${1:?usage: $0 model_name_or_path}
 4 | output_dir=output/results/mmlu/llama2-7B-5shot/
 5 | mkdir -p "${output_dir}"
 6 | 
 7 | python run_eval_llama_mmlu.py \
 8 |     --ntrain 5 \
 9 |     --data_dir /mmfs1/home/bowen98/projects/AdaptPruning/data/eval/mmlu \
10 |     --output_dir "${output_dir}" \
11 |     --model_name_or_path "${model_name_or_path}" \
12 |     --tokenizer_name meta-llama/Llama-2-7b-hf \
13 |     --eval_batch_size 2 | tee "${model_name_or_path}/mmlu-5shot.log"
14 | 
15 | # python -m eval.mmlu.run_eval \
16 | #     --ntrain 0 \
17 | #     --data_dir /mmfs1/home/bowen98/projects/AdaptPruning/data/eval/mmlu \
18 | #     --save_dir results/mmlu/llama2-7B-0shot/ \
19 | #     --model_name_or_path meta-llama/Llama-2-7b-hf \
20 | #     --tokenizer_name_or_path meta-llama/Llama-2-7b-hf \
21 | #     --eval_batch_size 2 \
22 | #     --use_chat_format
23 | 
24 | # # zero-shot with chatgpt
25 | # python -m eval.mmlu.run_eval \
26 | #     --ntrain 0 \
27 | #     --data_dir data/eval/mmlu \
28 | #     --save_dir results/mmlu/chatgpt-0shot/ \
29 | #     --openai_engine "gpt-3.5-turbo-0301" \
30 | #     --eval_batch_size 20
31 | 
32 | 
33 | # # few-shot with chatgpt
34 | # python -m eval.mmlu.run_eval \
35 | #     --ntrain 5 \
36 | #     --data_dir data/eval/mmlu \
37 | #     --save_dir results/mmlu/chatgpt-5shot/ \
38 | #     --openai_engine "gpt-3.5-turbo-0301" \
39 | #     --eval_batch_size 20
40 | 
41 | 
42 | # # zero-shot with gpt4
43 | # python -m eval.mmlu.run_eval \
44 | #     --ntrain 0 \
45 | #     --data_dir data/eval/mmlu \
46 | #     --save_dir results/mmlu/gpt4-0shot/ \
47 | #     --openai_engine "gpt-4-0314" \
48 | #     --n_instances 100 \
49 | #     --eval_batch_size 20
50 | 
51 | 
52 | # # few-shot with gpt4
53 | # python -m eval.mmlu.run_eval \
54 | #     --ntrain 5 \
55 | #     --data_dir data/eval/mmlu \
56 | #     --save_dir results/mmlu/gpt4-5shot/ \
57 | #     --openai_engine "gpt-4-0314" \
58 | #     --n_instances 100 \
59 | #     --eval_batch_size 2
60 | 


--------------------------------------------------------------------------------
/loralib/utils.py:
--------------------------------------------------------------------------------
 1 | #  ------------------------------------------------------------------------------------------
 2 | #  Copyright (c) Microsoft Corporation. All rights reserved.
 3 | #  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 4 | #  ------------------------------------------------------------------------------------------
 5 | import torch
 6 | import torch.nn as nn
 7 | 
 8 | from typing import Dict
 9 | 
10 | from .layers import LoRALayer
11 | 
12 | 
13 | def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None:
14 |     for n, p in model.named_parameters():
15 |         if 'lora_' not in n:
16 |             p.requires_grad = False
17 |     if bias == 'none':
18 |         return
19 |     elif bias == 'all':
20 |         for n, p in model.named_parameters():
21 |             if 'bias' in n:
22 |                 p.requires_grad = True
23 |     elif bias == 'lora_only':
24 |         for m in model.modules():
25 |             if isinstance(m, LoRALayer) and \
26 |                 hasattr(m, 'bias') and \
27 |                 m.bias is not None:
28 |                     m.bias.requires_grad = True
29 |     else:
30 |         raise NotImplementedError
31 | 
32 | 
33 | def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]:
34 |     my_state_dict = model.state_dict()
35 |     if bias == 'none':
36 |         return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k}
37 |     elif bias == 'all':
38 |         return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k}
39 |     elif bias == 'lora_only':
40 |         to_return = {}
41 |         for k in my_state_dict:
42 |             if 'lora_' in k:
43 |                 to_return[k] = my_state_dict[k]
44 |                 bias_name = k.split('lora_')[0]+'bias'
45 |                 if bias_name in my_state_dict:
46 |                     to_return[bias_name] = my_state_dict[bias_name]
47 |         return to_return
48 |     else:
49 |         raise NotImplementedError
50 | 


--------------------------------------------------------------------------------
/scripts/hyperparameter_searching/test_distill.sh:
--------------------------------------------------------------------------------
 1 | for mac_constraint in 0.1 0.2 0.3  # Sweep: one run_minus_training.py run per (mac_constraint, lora_r) pair.
 2 | do
 3 |     for lora_r in 8 32 128  # LoRA rank for this run, forwarded as --lora_r
 4 |     do
 5 |         model_name='roberta-base'
 6 |         adapter_type=lora  # only appears in the output_dir name below
 7 |         pruning_scheduler=once
 8 |         pruner_type=global
 9 |         task_name=mnli
10 |         lora_alpha=16
11 |         pruning_batches=256
12 |         pruning_batch_size=4
13 |         steppoint=1.0  # forwarded as --pruning_start
14 | 
15 |         output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_distill_full_exp/mac${mac_constraint}/lora_r${lora_r}/lora_alpha${lora_alpha}"
16 |         echo $output_dir
17 |         mkdir -p $output_dir  # one result directory per sweep point
18 | 
19 |         python run_minus_training.py \
20 |             --output_dir ${output_dir}\
21 |             --task_name ${task_name} \
22 |             --model_name_or_path ${model_name} \
23 |             --do_train \
24 |             --do_eval \
25 |             --save_strategy no \
26 |             --evaluation_strategy steps \
27 |             --minus_scheduler \
28 |             --max_seq_length 128 \
29 |             --num_train_epochs 10 \
30 |             --per_device_train_batch_size 32 \
31 |             --per_device_eval_batch_size 32 \
32 |             --lr_scheduler_type linear\
33 |             --warmup_ratio 0.06\
34 |             --learning_rate 5e-4\
35 |             --weight_decay 0.1\
36 |             --apply_lora \
37 |             --lora_alpha ${lora_alpha} \
38 |             --lora_r ${lora_r} \
39 |             --report_to none \
40 |             --pruning_batches ${pruning_batches} \
41 |             --pruning_batch_size ${pruning_batch_size} \
42 |             --mac_constraint ${mac_constraint} \
43 |             --pruning_scheduler ${pruning_scheduler} \
44 |             --pruning_start ${steppoint} \
45 |             --head_scorer_type gradient_l2 \
46 |             --intermediate_scorer_type gradient_l2 \
47 |             --pruner_type ${pruner_type} \
48 |             --do_distill \
49 |             --distill_epoch 8
50 |     done
51 | done


--------------------------------------------------------------------------------
/scripts/hyperparameter_searching/test_distill_fisher.sh:
--------------------------------------------------------------------------------
 1 | for mac_constraint in 0.05 0.4 0.5  # Sweep: one run_minus_training.py run per (mac_constraint, lora_r) pair.
 2 | do
 3 |     for lora_r in 8 32 128  # LoRA rank for this run, forwarded as --lora_r
 4 |     do
 5 |         model_name='roberta-base'
 6 |         adapter_type=lora  # only appears in the output_dir name below
 7 |         pruning_scheduler=once
 8 |         pruner_type=fisher  # forwarded as --pruner_type and embedded in output_dir
 9 |         task_name=mnli
10 |         lora_alpha=16
11 |         pruning_batches=256
12 |         pruning_batch_size=4
13 |         steppoint=1.0  # forwarded as --pruning_start
14 | 
15 |         output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_distill_full_exp/mac${mac_constraint}/lora_r${lora_r}/lora_alpha${lora_alpha}"
16 |         echo $output_dir
17 |         mkdir -p $output_dir  # one result directory per sweep point
18 | 
19 |         python run_minus_training.py \
20 |             --output_dir ${output_dir}\
21 |             --task_name ${task_name} \
22 |             --model_name_or_path ${model_name} \
23 |             --do_train \
24 |             --do_eval \
25 |             --save_strategy no \
26 |             --evaluation_strategy steps \
27 |             --minus_scheduler \
28 |             --max_seq_length 128 \
29 |             --num_train_epochs 10 \
30 |             --per_device_train_batch_size 32 \
31 |             --per_device_eval_batch_size 32 \
32 |             --lr_scheduler_type linear\
33 |             --warmup_ratio 0.06\
34 |             --learning_rate 5e-4\
35 |             --weight_decay 0.1\
36 |             --apply_lora \
37 |             --lora_alpha ${lora_alpha} \
38 |             --lora_r ${lora_r} \
39 |             --report_to none \
40 |             --pruning_batches ${pruning_batches} \
41 |             --pruning_batch_size ${pruning_batch_size} \
42 |             --mac_constraint ${mac_constraint} \
43 |             --pruning_scheduler ${pruning_scheduler} \
44 |             --pruning_start ${steppoint} \
45 |             --head_scorer_type gradient_l2 \
46 |             --intermediate_scorer_type gradient_l2 \
47 |             --pruner_type ${pruner_type} \
48 |             --do_distill \
49 |             --distill_epoch 8
50 |     done
51 | done


--------------------------------------------------------------------------------
/scripts/train_ft_seq2seq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | if [ "$#" -eq 0 ]; then
12 |     model_name='roberta-base'
13 |     task_name=sst2
14 |     num_epochs=30
15 |     learning_rate=2e-5
16 |     batch_size=32
17 | elif [ "$#" -eq 5 ]; then
18 |     model_name=$1
19 |     task_name=$2
20 |     num_epochs=$3
21 |     learning_rate=$4
22 |     batch_size=$5
23 | fi
24 | 
25 | lora_alpha=16
26 | lora_r=8
27 | suffix=''
28 | 
29 | if [ -d $model_name ]
30 | then
31 |     output_dir="${model_name}/finetuned/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
32 | else
33 |     output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
34 | fi
35 | 
36 | echo $output_dir
37 | mkdir -p $output_dir
38 | 
39 | python run_minus_seq2seq_training.py \
40 |     --output_dir ${output_dir}\
41 |     --model_name_or_path ${model_name} \
42 |     --do_train \
43 |     --do_eval \
44 |     --save_strategy no \
45 |     --evaluation_strategy steps \
46 |     --logging_strategy steps \
47 |     --logging_steps 1000 \
48 |     --eval_steps 5000 \
49 |     --task_name ${task_name} \
50 |     --max_input_length 512 \
51 |     --max_target_length 128 \
52 |     --num_train_epochs ${num_epochs} \
53 |     --per_device_train_batch_size ${batch_size} \
54 |     --per_device_eval_batch_size ${batch_size} \
55 |     --tf32 True \
56 |     --warmup_ratio 0.06\
57 |     --learning_rate ${learning_rate}\
58 |     --weight_decay 0.1\
59 |     --lora_alpha ${lora_alpha} \
60 |     --lora_r ${lora_r} \
61 |     --report_to none | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/hyperparameter_searching/test_distill_shorter.sh:
--------------------------------------------------------------------------------
 1 | for mac_constraint in 0.05 0.1 0.2 0.3 0.4 0.5  # Sweep: one run_minus_training.py run per (mac_constraint, lora_r) pair.
 2 | do
 3 |     for lora_r in 16 64  # LoRA rank for this run, forwarded as --lora_r
 4 |     do
 5 |         model_name='roberta-base'
 6 |         adapter_type=lora  # only appears in the output_dir name below
 7 |         pruning_scheduler=once
 8 |         pruner_type=global
 9 |         task_name=mnli
10 |         lora_alpha=16
11 |         pruning_batches=256
12 |         pruning_batch_size=4
13 |         steppoint=1.0  # forwarded as --pruning_start
14 | 
15 |         output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_distill_full_exp_shorter/mac${mac_constraint}/lora_r${lora_r}/lora_alpha${lora_alpha}"
16 |         echo $output_dir
17 |         mkdir -p $output_dir  # one result directory per sweep point
18 | 
19 |         python run_minus_training.py \
20 |             --output_dir ${output_dir}\
21 |             --task_name ${task_name} \
22 |             --model_name_or_path ${model_name} \
23 |             --do_train \
24 |             --do_eval \
25 |             --save_strategy no \
26 |             --evaluation_strategy steps \
27 |             --minus_scheduler \
28 |             --max_seq_length 128 \
29 |             --num_train_epochs 5 \
30 |             --per_device_train_batch_size 32 \
31 |             --per_device_eval_batch_size 32 \
32 |             --lr_scheduler_type linear\
33 |             --warmup_ratio 0.06\
34 |             --learning_rate 5e-4\
35 |             --weight_decay 0.1\
36 |             --apply_lora \
37 |             --lora_alpha ${lora_alpha} \
38 |             --lora_r ${lora_r} \
39 |             --report_to none \
40 |             --pruning_batches ${pruning_batches} \
41 |             --pruning_batch_size ${pruning_batch_size} \
42 |             --mac_constraint ${mac_constraint} \
43 |             --pruning_scheduler ${pruning_scheduler} \
44 |             --pruning_start ${steppoint} \
45 |             --head_scorer_type gradient_l2 \
46 |             --intermediate_scorer_type gradient_l2 \
47 |             --pruner_type ${pruner_type} \
48 |             --do_distill \
49 |             --distill_epoch 3
50 |     done
51 | done


--------------------------------------------------------------------------------
/scripts/train_ft.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | if [ "$#" -eq 0 ]; then
12 |     model_name='roberta-base'
13 |     task_name=sst2
14 |     num_epochs=30
15 |     learning_rate=2e-5
16 |     batch_size=32
17 | elif [ "$#" -eq 5 ]; then
18 |     model_name=$1
19 |     task_name=$2
20 |     num_epochs=$3
21 |     learning_rate=$4
22 |     batch_size=$5
23 | fi
24 | 
25 | lora_alpha=16
26 | lora_r=8
27 | student_param_tuning_config=q:0-11,v:0-11,i:0-11
28 | suffix=''
29 | 
30 | if [ -d $model_name ]
31 | then
32 |     output_dir="${model_name}/finetuned/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
33 | else
34 |     output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
35 | fi
36 | 
37 | echo $output_dir
38 | mkdir -p $output_dir
39 | 
40 | python run_minus_training.py \
41 |     --output_dir ${output_dir}\
42 |     --task_name ${task_name} \
43 |     --model_name_or_path ${model_name} \
44 |     --do_train \
45 |     --do_eval \
46 |     --save_strategy no \
47 |     --evaluation_strategy steps \
48 |     --logging_strategy steps \
49 |     --logging_steps 1000 \
50 |     --eval_steps 5000 \
51 |     --max_seq_length 128 \
52 |     --num_train_epochs ${num_epochs} \
53 |     --per_device_train_batch_size ${batch_size} \
54 |     --per_device_eval_batch_size ${batch_size} \
55 |     --student_param_tuning_config ${student_param_tuning_config} \
56 |     --warmup_ratio 0.06\
57 |     --learning_rate ${learning_rate}\
58 |     --weight_decay 0.1\
59 |     --lora_alpha ${lora_alpha} \
60 |     --lora_r ${lora_r} \
61 |     --report_to none | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/lora/roberta_base_qnli.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | model_name='roberta-base'
12 | task_name=qnli
13 | adapter_type=lora
14 | 
15 | if [ "$#" -eq 0 ]; then
16 |     num_epochs=25
17 |     batch_size=32
18 |     lora_r=8
19 |     lora_alpha=16
20 |     learning_rate=4e-4
21 |     seed=128
22 | elif [ "$#" -eq 6 ]; then
23 |     num_epochs=$1
24 |     batch_size=$2
25 |     lora_r=$3
26 |     lora_alpha=$4
27 |     learning_rate=$5
28 |     seed=$6
29 | fi
30 | 
31 | teacher_param_tuning_config=q:0-11,v:0-11
32 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
33 | echo $output_dir
34 | mkdir -p $output_dir
35 | 
36 | python run_minus_training.py \
37 |     --output_dir ${output_dir}\
38 |     --task_name ${task_name} \
39 |     --model_name_or_path ${model_name} \
40 |     --do_train \
41 |     --do_eval \
42 |     --save_strategy epoch \
43 |     --evaluation_strategy steps \
44 |     --logging_strategy steps \
45 |     --logging_steps 1000 \
46 |     --log_level info \
47 |     --log_level_replica info \
48 |     --eval_steps 5000 \
49 |     --max_seq_length 128 \
50 |     --num_train_epochs ${num_epochs} \
51 |     --per_device_train_batch_size ${batch_size} \
52 |     --per_device_eval_batch_size ${batch_size} \
53 |     --warmup_ratio 0.06\
54 |     --learning_rate ${learning_rate}\
55 |     --weight_decay 0.1\
56 |     --seed ${seed} \
57 |     --apply_lora \
58 |     --lora_alpha ${lora_alpha} \
59 |     --lora_r ${lora_r} \
60 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
61 |     --report_to none \


--------------------------------------------------------------------------------
/scripts/lora/roberta_base_qqp.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | model_name='roberta-base'
12 | task_name=qqp
13 | adapter_type=lora
14 | 
15 | if [ "$#" -eq 0 ]; then
16 |     num_epochs=25
17 |     batch_size=32
18 |     lora_r=8
19 |     lora_alpha=16
20 |     learning_rate=5e-4
21 |     seed=128
22 | elif [ "$#" -eq 6 ]; then
23 |     num_epochs=$1
24 |     batch_size=$2
25 |     lora_r=$3
26 |     lora_alpha=$4
27 |     learning_rate=$5
28 |     seed=$6
29 | fi
30 | 
31 | teacher_param_tuning_config=q:0-11,v:0-11
32 | output_dir="output/${model_name}/${task_name}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/param_tuning_${teacher_param_tuning_config}/lr${learning_rate}/seed${seed}"
33 | echo $output_dir
34 | mkdir -p $output_dir
35 | 
36 | python run_minus_training.py \
37 |     --output_dir ${output_dir}\
38 |     --task_name ${task_name} \
39 |     --model_name_or_path ${model_name} \
40 |     --do_train \
41 |     --do_eval \
42 |     --save_strategy epoch \
43 |     --evaluation_strategy steps \
44 |     --logging_strategy steps \
45 |     --logging_steps 1000 \
46 |     --log_level info \
47 |     --log_level_replica info \
48 |     --eval_steps 5000 \
49 |     --max_seq_length 128 \
50 |     --num_train_epochs ${num_epochs} \
51 |     --per_device_train_batch_size ${batch_size} \
52 |     --per_device_eval_batch_size ${batch_size} \
53 |     --warmup_ratio 0.06\
54 |     --learning_rate ${learning_rate}\
55 |     --weight_decay 0.1\
56 |     --seed ${seed} \
57 |     --apply_lora \
58 |     --lora_alpha ${lora_alpha} \
59 |     --lora_r ${lora_r} \
60 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
61 |     --report_to none \


--------------------------------------------------------------------------------
/test/test_gpu_base_speed.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ["WANDB_DISABLED"] = "true"
 3 | import sys
 4 | import torch
 5 | from models import build_model
 6 | from transformers import HfArgumentParser
 7 | from args import DataTrainingArguments
 8 | from models.model_args import ModelArguments
 9 | from args import MinusTrainingArguments
10 | from utils.utils import *
11 | from utils.minus_utils import bench_latency
12 | 
13 | NUM_GPUS=8
14 | 
15 | def main():
16 |     sys.argv = ['neuron_importance.py',
17 |             '--output_dir',
18 |             './output/neuron_importance/',
19 |             '--model_name_or_path',
20 |             'roberta-base',
21 |             '--task_name',
22 |             'mnli',
23 |             '--do_train',
24 |             '--do_eval',
25 |             '--max_seq_length',
26 |             '128',
27 |             '--per_device_train_batch_size',
28 |             '32',
29 |             '--per_device_eval_batch_size',
30 |             '32',
31 |             '--apply_lora',
32 |             '--do_distill',
33 |             '--lora_r',
34 |             '64'
35 |             ]
36 |     parser = HfArgumentParser(
37 |         (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
38 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
39 |         # If we pass only one argument to the script and it's the path to a json file,
40 |         # let's parse it to get our arguments.
41 |         model_args, data_args, training_args = parser.parse_json_file(
42 |             json_file=os.path.abspath(sys.argv[1]))
43 |     else:
44 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
45 |     # training_args.disable_tqdm = False
46 |     t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)
47 |     config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets)
48 |     
49 |     results = {}
50 |     for i in range(NUM_GPUS):
51 |         model.cuda(i)
52 |         results[i] = bench_latency(model, 128, 128, tokenizer)['t_mean'] * 1000
53 |         
54 |     
55 | 
56 | if __name__ == '__main__':
57 |     main()


--------------------------------------------------------------------------------
/scripts/train_lora_squad.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=32G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | if [ "$#" -eq 0 ]; then
12 |     model_name='bert-base-uncased'
13 |     lora_r=8
14 |     lora_alpha=16
15 |     learning_rate=2e-4
16 |     teacher_param_tuning_config=q:0-11,v:0-11
17 | elif [ "$#" -eq 5 ]; then
18 |     model_name=$1
19 |     lora_r=$2
20 |     lora_alpha=$3
21 |     learning_rate=$4
22 |     teacher_param_tuning_config=$5
23 | fi
24 | 
25 | adapter_type=lora
26 | num_epochs=20
27 | batch_size=32
28 | suffix=''
29 | 
30 | if [ -d $model_name ]
31 | then
32 |     output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
33 | else
34 |     output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
35 | fi
36 | 
37 | echo $output_dir
38 | mkdir -p $output_dir
39 | 
40 | python run_minus_squad_training.py \
41 |     --output_dir ${output_dir}\
42 |     --model_name_or_path ${model_name} \
43 |     --do_train \
44 |     --do_eval \
45 |     --save_strategy no \
46 |     --evaluation_strategy steps \
47 |     --logging_strategy steps \
48 |     --logging_steps 1000 \
49 |     --eval_steps 5000 \
50 |     --max_seq_length 384 \
51 |     --doc_stride 128 \
52 |     --num_train_epochs ${num_epochs} \
53 |     --per_device_train_batch_size ${batch_size} \
54 |     --per_device_eval_batch_size ${batch_size} \
55 |     --tf32 True \
56 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
57 |     --warmup_ratio 0.06\
58 |     --learning_rate ${learning_rate}\
59 |     --weight_decay 0.1\
60 |     --apply_lora \
61 |     --lora_alpha ${lora_alpha} \
62 |     --lora_r ${lora_r} \
63 |     --report_to none | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/train_lora_squadv2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | if [ "$#" -eq 0 ]; then
12 |     model_name='bert-base-uncased'
13 |     lora_r=8
14 |     lora_alpha=16
15 |     learning_rate=2e-4
16 |     teacher_param_tuning_config=q:0-11,v:0-11
17 | elif [ "$#" -eq 5 ]; then
18 |     model_name=$1
19 |     lora_r=$2
20 |     lora_alpha=$3
21 |     learning_rate=$4
22 |     teacher_param_tuning_config=$5
23 | fi
24 | 
25 | adapter_type=lora
26 | num_epochs=30
27 | batch_size=32
28 | suffix=''
29 | 
30 | if [ -d $model_name ]
31 | then
32 |     output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
33 | else
34 |     output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
35 | fi
36 | 
37 | echo $output_dir
38 | mkdir -p $output_dir
39 | 
40 | python run_minus_squad_training.py \
41 |     --output_dir ${output_dir}\
42 |     --model_name_or_path ${model_name} \
43 |     --do_train \
44 |     --do_eval \
45 |     --save_strategy no \
46 |     --evaluation_strategy steps \
47 |     --logging_strategy steps \
48 |     --logging_steps 1000 \
49 |     --eval_steps 5000 \
50 |     --max_seq_length 384 \
51 |     --doc_stride 128 \
52 |     --version_2_with_negative \
53 |     --num_train_epochs ${num_epochs} \
54 |     --per_device_train_batch_size ${batch_size} \
55 |     --per_device_eval_batch_size ${batch_size} \
56 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
57 |     --warmup_ratio 0.06\
58 |     --learning_rate ${learning_rate}\
59 |     --weight_decay 0.1\
60 |     --apply_lora \
61 |     --lora_alpha ${lora_alpha} \
62 |     --lora_r ${lora_r} \
63 |     --report_to none | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/train_lora.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | if [ "$#" -eq 0 ]; then
12 |     model_name='bert-base-uncased'
13 |     task_name=mnli
14 |     lora_r=8
15 |     lora_alpha=16
16 |     learning_rate=2e-4
17 |     teacher_param_tuning_config=q:0-11,v:0-11
18 | elif [ "$#" -eq 6 ]; then
19 |     model_name=$1
20 |     task_name=$2
21 |     lora_r=$3
22 |     lora_alpha=$4
23 |     learning_rate=$5
24 |     teacher_param_tuning_config=$6
25 | fi
26 | 
27 | adapter_type=lora
28 | num_epochs=30
29 | batch_size=32
30 | suffix=''
31 | 
32 | if [ -d $model_name ]
33 | then
34 |     output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
35 | else
36 |     output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
37 | fi
38 | 
39 | echo $output_dir
40 | mkdir -p $output_dir
41 | 
42 | python run_minus_training.py \
43 |     --output_dir ${output_dir}\
44 |     --task_name ${task_name} \
45 |     --model_name_or_path ${model_name} \
46 |     --do_train \
47 |     --do_eval \
48 |     --save_strategy no \
49 |     --evaluation_strategy steps \
50 |     --logging_strategy steps \
51 |     --logging_steps 100 \
52 |     --eval_steps 500 \
53 |     --max_seq_length 128 \
54 |     --num_train_epochs ${num_epochs} \
55 |     --per_device_train_batch_size ${batch_size} \
56 |     --per_device_eval_batch_size ${batch_size} \
57 |     --tf32 True \
58 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
59 |     --warmup_ratio 0.06\
60 |     --learning_rate ${learning_rate}\
61 |     --weight_decay 0.1\
62 |     --apply_lora \
63 |     --lora_alpha ${lora_alpha} \
64 |     --lora_r ${lora_r} \
65 |     --report_to none | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/train_lora_seq2seq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | if [ "$#" -eq 0 ]; then
12 |     model_name='bert-base-uncased'
13 |     lora_r=8
14 |     lora_alpha=16
15 |     learning_rate=1e-4
16 |     teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
17 | elif [ "$#" -eq 5 ]; then
18 |     model_name=$1
19 |     lora_r=$2
20 |     lora_alpha=$3
21 |     learning_rate=$4
22 |     teacher_param_tuning_config=$5
23 | fi
24 | 
25 | adapter_type=lora
26 | task_name=cnndm
27 | num_epochs=10
28 | batch_size=16
29 | suffix=''
30 | 
31 | if [ -d $model_name ]
32 | then
33 |     output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
34 | else
35 |     output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
36 | fi
37 | 
38 | echo $output_dir
39 | mkdir -p $output_dir
40 | 
41 | python run_minus_seq2seq_training.py \
42 |     --output_dir ${output_dir}\
43 |     --model_name_or_path ${model_name} \
44 |     --do_train \
45 |     --do_eval \
46 |     --save_strategy no \
47 |     --evaluation_strategy steps \
48 |     --logging_strategy steps \
49 |     --logging_steps 1000 \
50 |     --eval_steps 5000 \
51 |     --task_name ${task_name} \
52 |     --max_input_length 512 \
53 |     --max_target_length 128 \
54 |     --num_train_epochs ${num_epochs} \
55 |     --per_device_train_batch_size ${batch_size} \
56 |     --per_device_eval_batch_size ${batch_size} \
57 |     --tf32 True \
58 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
59 |     --warmup_ratio 0.06\
60 |     --learning_rate ${learning_rate}\
61 |     --weight_decay 0.1\
62 |     --apply_lora \
63 |     --lora_alpha ${lora_alpha} \
64 |     --lora_r ${lora_r} \
65 |     --report_to none | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/train_lora_sft.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p ckpt
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gpus=a100:1               # Number of GPUs requested
 9 | #SBATCH --time=24:00:00             # Walltime (hh:mm:ss)
10 | 
11 | if [ "$#" -eq 0 ]; then
12 |     model_name='bert-base-uncased'
13 |     lora_r=8
14 |     lora_alpha=16
15 |     learning_rate=1e-4
16 |     teacher_param_tuning_config=dq:0-31,dv:0-31
17 | elif [ "$#" -eq 5 ]; then
18 |     model_name=$1
19 |     lora_r=$2
20 |     lora_alpha=$3
21 |     learning_rate=$4
22 |     teacher_param_tuning_config=$5
23 | fi
24 | 
25 | adapter_type=lora
26 | task_name=alpaca_gpt4
27 | num_epochs=2
28 | batch_size=4
29 | suffix=''
30 | 
31 | if [ -d $model_name ]
32 | then
33 |     output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
34 | else
35 |     output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
36 | fi
37 | 
38 | echo $output_dir
39 | mkdir -p $output_dir
40 | 
41 | python run_llama_sft.py \
42 |     --output_dir ${output_dir}\
43 |     --task_name ${task_name} \
44 |     --model_name_or_path ${model_name} \
45 |     --bf16 True \
46 |     --data_path 'data/sft/alpaca_data_cleaned.json' \
47 |     --do_train \
48 |     --do_eval \
49 |     --save_strategy no \
50 |     --evaluation_strategy steps \
51 |     --logging_strategy steps \
52 |     --logging_steps 100 \
53 |     --eval_steps 500 \
54 |     --model_max_length 512 \
55 |     --num_train_epochs ${num_epochs} \
56 |     --per_device_train_batch_size ${batch_size} \
57 |     --per_device_eval_batch_size ${batch_size} \
58 |     --gradient_accumulation_steps 8 \
59 |     --warmup_ratio 0.03\
60 |     --learning_rate ${learning_rate}\
61 |     --weight_decay 0.\
62 |     --lr_scheduler_type cosine \
63 |     --tf32 True \
64 |     --apply_lora \
65 |     --lora_alpha ${lora_alpha} \
66 |     --lora_r ${lora_r} \
67 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
68 |     --report_to none | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/lora/mt5_base_wmt_enro.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=144:00:00             # Walltime (hh:mm:ss)
10 | 
11 | model_name=google/mt5-base
12 | task_name=wmt16
13 | adapter_type=lora
14 | source_lang=en
15 | target_lang=ro
16 | 
17 | if [ "$#" -eq 0 ]; then
18 |     num_epochs=2
19 |     batch_size=16
20 |     lora_r=102
21 |     lora_alpha=408
22 |     learning_rate=5e-5
23 |     seed=42
24 | elif [ "$#" -eq 6 ]; then
25 |     num_epochs=$1
26 |     batch_size=$2
27 |     lora_r=$3
28 |     lora_alpha=$4
29 |     learning_rate=$5
30 |     seed=$6
31 | fi
32 | 
33 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
34 | output_dir="output/${model_name}/${task_name}_${source_lang}-${target_lang}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
35 | echo $output_dir
36 | mkdir -p $output_dir
37 | 
38 | python run_minus_seq2seq_training.py \
39 |     --output_dir ${output_dir}\
40 |     --task_name ${task_name} \
41 |     --model_name_or_path ${model_name} \
42 |     --do_train \
43 |     --do_eval \
44 |     --save_strategy epoch \
45 |     --evaluation_strategy steps \
46 |     --logging_strategy steps \
47 |     --logging_steps 500 \
48 |     --eval_steps 2000 \
49 |     --max_input_length 150 \
50 |     --max_target_length 150 \
51 |     --lang_pair ${target_lang}-${source_lang} \
52 |     --source_lang ${source_lang} \
53 |     --target_lang ${target_lang} \
54 |     --num_train_epochs ${num_epochs} \
55 |     --per_device_train_batch_size ${batch_size} \
56 |     --per_device_eval_batch_size ${batch_size} \
57 |     --warmup_ratio 0.06\
58 |     --learning_rate ${learning_rate}\
59 |     --weight_decay 0.01\
60 |     --apply_lora \
61 |     --lora_alpha ${lora_alpha} \
62 |     --lora_r ${lora_r} \
63 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
64 |     --tf32 True \
65 |     --fp16 True \
66 |     --report_to none | tee ${output_dir}/log.txt \


--------------------------------------------------------------------------------
/scripts/lora/mt5_base_wmt_roen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=144:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # LoRA fine-tuning of google/mt5-base on WMT16 (Romanian -> English).
12 | #
13 | # Usage:
14 | #   mt5_base_wmt_roen.sh
15 | #   mt5_base_wmt_roen.sh NUM_EPOCHS BATCH_SIZE LORA_R LORA_ALPHA LEARNING_RATE SEED
16 | 
17 | model_name=google/mt5-base
18 | task_name=wmt16
19 | adapter_type=lora
20 | source_lang=ro
21 | target_lang=en
22 | # The Hugging Face WMT16 dataset names this language pair "ro-en" for both
23 | # translation directions; direction is selected by --source_lang/--target_lang.
24 | # The previous "${target_lang}-${source_lang}" expanded to "en-ro" here.
25 | # NOTE(review): assumes --lang_pair is used as the dataset config name -- confirm.
26 | lang_pair=ro-en
27 | 
28 | if [ "$#" -eq 0 ]; then
29 |     num_epochs=5
30 |     batch_size=16
31 |     lora_r=8
32 |     lora_alpha=16
33 |     learning_rate=1e-4
34 |     seed=42
35 | elif [ "$#" -eq 6 ]; then
36 |     num_epochs=$1
37 |     batch_size=$2
38 |     lora_r=$3
39 |     lora_alpha=$4
40 |     learning_rate=$5
41 |     seed=$6
42 | else
43 |     # Any other argument count would leave every hyperparameter empty.
44 |     echo "usage: mt5_base_wmt_roen.sh [num_epochs batch_size lora_r lora_alpha learning_rate seed]" >&2
45 |     exit 1
46 | fi
47 | 
48 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
49 | output_dir="output/${model_name}/${task_name}_${source_lang}-${target_lang}/bz${batch_size}/${adapter_type}/epoch${num_epochs}/lora_r${lora_r}/lora_alpha${lora_alpha}/lr${learning_rate}/seed${seed}"
50 | echo "$output_dir"
51 | mkdir -p "$output_dir"
52 | 
53 | # --seed is forwarded so that the seed encoded in output_dir is the one the run
54 | # actually uses; before, it only changed the directory name.
55 | python run_minus_seq2seq_training.py \
56 |     --output_dir "${output_dir}" \
57 |     --task_name "${task_name}" \
58 |     --model_name_or_path "${model_name}" \
59 |     --do_train \
60 |     --do_eval \
61 |     --save_strategy epoch \
62 |     --evaluation_strategy steps \
63 |     --logging_strategy steps \
64 |     --logging_steps 500 \
65 |     --eval_steps 2000 \
66 |     --max_input_length 150 \
67 |     --max_target_length 150 \
68 |     --lang_pair "${lang_pair}" \
69 |     --source_lang "${source_lang}" \
70 |     --target_lang "${target_lang}" \
71 |     --num_train_epochs "${num_epochs}" \
72 |     --per_device_train_batch_size "${batch_size}" \
73 |     --per_device_eval_batch_size "${batch_size}" \
74 |     --warmup_ratio 0.06 \
75 |     --learning_rate "${learning_rate}" \
76 |     --weight_decay 0.01 \
77 |     --label_smoothing 0.1 \
78 |     --seed "${seed}" \
79 |     --apply_lora \
80 |     --lora_alpha "${lora_alpha}" \
81 |     --lora_r "${lora_r}" \
82 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
83 |     --tf32 True \
84 |     --fp16 True \
85 |     --report_to none | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/models/model_args.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | from dataclasses import dataclass, field
 3 | 
 4 | @dataclass
 5 | class ModelArguments:
 6 |     """
 7 |     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
 8 |     """
 9 | 
10 |     model_name_or_path: str = field(
11 |         default=None,
12 |         metadata={
13 |             "help": "Path to pretrained model or model identifier from huggingface.co/models"}
14 |     )
15 |     config_name: Optional[str] = field(
16 |         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
17 |     )
18 |     tokenizer_name: Optional[str] = field(
19 |         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
20 |     )
21 |     cache_dir: Optional[str] = field(
22 |         default=None,
23 |         metadata={
24 |             "help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
25 |     )
26 |     use_fast_tokenizer: bool = field(
27 |         default=True,
28 |         metadata={
29 |             "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
30 |     )
31 |     model_revision: str = field(
32 |         default="main",
33 |         metadata={
34 |             "help": "The specific model version to use (can be a branch name, tag name or commit id)."},
35 |     )
36 |     use_auth_token: bool = field(
37 |         default=False,
38 |         metadata={
39 |             "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
40 |             "with private models)."
41 |         },
42 |     )
43 |     apply_lora: bool = field(
44 |         default=False,
45 |         metadata={
46 |             "help": "Whether to apply LORA to the model or not."
47 |         },
48 |     )
49 |     lora_alpha: int = field(
50 |         default=16,
51 |         metadata={
52 |             "help": "The alpha value for LoRA."
53 |         },
54 |     )
55 |     lora_r: int = field(
56 |         default=8,
57 |         metadata={
58 |             "help": "The r value for LoRA."
59 |         },
60 |     )
61 |     do_auto_pruning: bool = field(
62 |         default=False,
63 |         metadata={
64 |             "help": "Whether to apply auto pruning to the model when loading or not."
65 |         }
66 |     )


--------------------------------------------------------------------------------
/scripts/hyperparameter_searching/test_training_hypers.sh:
--------------------------------------------------------------------------------
 1 | for teacher_loss_alpha in 0.2 0.3 0.4 0.5 0.6 0.7
 2 | do
 3 |     for distill_loss_alpha in 0.4
 4 |     do
 5 |         distill_ce_loss_alpha=$(echo "1 - ${distill_loss_alpha}" | bc -l)
 6 |         mac_constraint=0.5
 7 |         model_name='roberta-base'
 8 |         adapter_type=lora
 9 |         pruning_scheduler=once
10 |         pruner_type=global
11 |         task_name=mnli
12 |         lora_alpha=16
13 |         lora_r=64
14 |         pruning_batches=256
15 |         pruning_batch_size=4
16 |         steppoint=1.0
17 | 
18 |         output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_distill_full_hyperexp/mac${mac_constraint}/teacher${teacher_loss_alpha}/ce${distill_ce_loss_alpha}"
19 |         echo $output_dir
20 |         mkdir -p $output_dir
21 | 
22 |         python run_minus_training.py \
23 |             --output_dir ${output_dir}\
24 |             --task_name ${task_name} \
25 |             --model_name_or_path ${model_name} \
26 |             --do_train \
27 |             --do_eval \
28 |             --save_strategy no \
29 |             --evaluation_strategy steps \
30 |             --minus_scheduler \
31 |             --max_seq_length 128 \
32 |             --num_train_epochs 10 \
33 |             --per_device_train_batch_size 32 \
34 |             --per_device_eval_batch_size 32 \
35 |             --lr_scheduler_type linear\
36 |             --warmup_ratio 0.06\
37 |             --learning_rate 5e-4\
38 |             --weight_decay 0.1\
39 |             --apply_lora \
40 |             --lora_alpha ${lora_alpha} \
41 |             --lora_r ${lora_r} \
42 |             --report_to none \
43 |             --pruning_batches ${pruning_batches} \
44 |             --pruning_batch_size ${pruning_batch_size} \
45 |             --mac_constraint ${mac_constraint} \
46 |             --pruning_scheduler ${pruning_scheduler} \
47 |             --pruning_start ${steppoint} \
48 |             --head_scorer_type gradient_l2 \
49 |             --intermediate_scorer_type gradient_l2 \
50 |             --pruner_type ${pruner_type} \
51 |             --do_distill \
52 |             --teacher_loss_alpha ${teacher_loss_alpha} \
53 |             --distill_loss_alpha ${distill_loss_alpha} \
54 |             --distill_ce_loss_alpha ${distill_ce_loss_alpha} \
55 |             --distill_epoch 8
56 |     done
57 | done


--------------------------------------------------------------------------------
/scripts/train_lora_wmt.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # LoRA fine-tuning on WMT16, English -> Romanian.
12 | #
13 | # Usage:
14 | #   train_lora_wmt.sh
15 | #   train_lora_wmt.sh MODEL_NAME LORA_R LORA_ALPHA LEARNING_RATE TEACHER_PARAM_TUNING_CONFIG
16 | 
17 | if [ "$#" -eq 0 ]; then
18 |     # NOTE(review): bert-base-uncased is an encoder-only checkpoint, while this
19 |     # script launches the seq2seq trainer -- confirm the intended default.
20 |     model_name='bert-base-uncased'
21 |     lora_r=8
22 |     lora_alpha=16
23 |     learning_rate=1e-4
24 |     teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
25 | elif [ "$#" -eq 5 ]; then
26 |     model_name=$1
27 |     lora_r=$2
28 |     lora_alpha=$3
29 |     learning_rate=$4
30 |     teacher_param_tuning_config=$5
31 | else
32 |     # Any other argument count would leave the settings above unset.
33 |     echo "usage: train_lora_wmt.sh [model_name lora_r lora_alpha learning_rate teacher_param_tuning_config]" >&2
34 |     exit 1
35 | fi
36 | 
37 | adapter_type=lora
38 | num_epochs=30
39 | batch_size=8
40 | suffix=''
41 | task_name=wmt16
42 | source_lang=en
43 | target_lang=ro
44 | 
45 | # A local model directory keeps its outputs next to the model; any other
46 | # model name gets a directory under output/.
47 | if [ -d "$model_name" ]; then
48 |     output_dir="${model_name}/loraed/epoch${num_epochs}/bz${batch_size}/lr${learning_rate}"
49 | else
50 |     output_dir="output/${model_name}_${adapter_type}_${task_name}${suffix}/epoch${num_epochs}/bz${batch_size}/lora_r${lora_r}/lora_alpha${lora_alpha}"
51 | fi
52 | 
53 | echo "$output_dir"
54 | mkdir -p "$output_dir"
55 | 
56 | # The length limits used to be given twice (256/256, then 512/128). With
57 | # argparse-style parsing the last occurrence wins, so only the effective
58 | # 512/128 pair is kept.
59 | python run_minus_seq2seq_training.py \
60 |     --output_dir "${output_dir}" \
61 |     --model_name_or_path "${model_name}" \
62 |     --do_train \
63 |     --do_eval \
64 |     --save_strategy no \
65 |     --evaluation_strategy steps \
66 |     --logging_strategy steps \
67 |     --logging_steps 1000 \
68 |     --eval_steps 5000 \
69 |     --task_name "${task_name}" \
70 |     --lang_pair "${target_lang}-${source_lang}" \
71 |     --source_lang "${source_lang}" \
72 |     --target_lang "${target_lang}" \
73 |     --max_input_length 512 \
74 |     --max_target_length 128 \
75 |     --num_train_epochs "${num_epochs}" \
76 |     --per_device_train_batch_size "${batch_size}" \
77 |     --per_device_eval_batch_size "${batch_size}" \
78 |     --tf32 True \
79 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
80 |     --warmup_ratio 0.06 \
81 |     --learning_rate "${learning_rate}" \
82 |     --weight_decay 0.1 \
83 |     --apply_lora \
84 |     --lora_alpha "${lora_alpha}" \
85 |     --lora_r "${lora_r}" \
86 |     --report_to none | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/scripts/eval/wmt_enro.sh:
--------------------------------------------------------------------------------
 1 | model_name=$1
 2 | 
 3 | mac_constraint=0.4
 4 | lora_r=8
 5 | pruning_start=-1
 6 | pruning_scheduler=cubic_gradual
 7 | pruner_type=none
 8 | param_allocation_strategy=running_fisher
 9 | distillation_type=self_momentum
10 | distill_mapping_strategy=dynamic_block_teacher_dynamic_student
11 | 
12 | 
13 | task_name=wmt16
14 | adapter_type=lora
15 | source_lang=en
16 | target_lang=ro
17 | param_resizing_strategy=tophalf_limited
18 | pruning_start=-1
19 | pruning_stop=3
20 | distill_start=-1 # about 60%, between 3.4 and 3.8, but after 3.6 where the teacher is updated
21 | distill_epoch=5
22 | pruning_batches=64
23 | num_prunings=10
24 | pruning_batch_size=4
25 | # pre_pruning_tuning_epochs=1
26 | pre_pruning_tuning_steps=200
27 | sparsity_warmup_epochs=1
28 | 
29 | learning_rate=1e-3
30 | training_batch_size=16
31 | num_train_epochs=10
32 | warmup_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
33 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
34 | student_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
35 | 
36 | 
37 | output_dir="${model_name}/eval"
38 | echo $output_dir
39 | mkdir -p $output_dir
40 | 
41 | python run_minus_seq2seq_training.py \
42 |     --output_dir ${output_dir}\
43 |     --model_name_or_path ${model_name} \
44 |     --do_eval \
45 |     --save_strategy no \
46 |     --evaluation_strategy steps \
47 |     --logging_strategy steps \
48 |     --eval_steps 5000 \
49 |     --logging_steps 1000 \
50 |     --log_level info \
51 |     --log_level_replica info \
52 |     --minus_scheduler \
53 |     --task_name ${task_name} \
54 |     --max_input_length 256 \
55 |     --max_target_length 256 \
56 |     --lang_pair ${target_lang}-${source_lang} \
57 |     --source_lang ${source_lang} \
58 |     --target_lang ${target_lang} \
59 |     --num_train_epochs ${num_train_epochs} \
60 |     --per_device_train_batch_size ${training_batch_size} \
61 |     --per_device_eval_batch_size ${training_batch_size} \
62 |     --tf32 True \
63 |     --lr_scheduler_type linear\
64 |     --distillation_type ${distillation_type} \
65 |     --distill_mapping_strategy ${distill_mapping_strategy} \
66 |     --warmup_ratio 0.06\
67 |     --learning_rate ${learning_rate}\
68 |     --weight_decay 0.1\
69 |     --seed 128 \
70 |     --apply_lora \
71 |     --lora_alpha 16 \
72 |     --lora_r ${lora_r} \
73 |     --report_to none \
74 |     --pruner_type none \
75 |     | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/test/test_t5_efficiency.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ["WANDB_DISABLED"] = "true"
 3 | import sys
 4 | 
 5 | from transformers import HfArgumentParser
 6 | from args import Seq2SeqDataTrainingArguments
 7 | from models import build_model
 8 | from models.model_args import ModelArguments
 9 | from utils.utils import *
10 | from args import MinusTrainingArguments
11 | from utils.minus_utils import efficiency_testing, input_constructor
12 | 
13 | def main():
14 |     sys.argv = ['test_t5.py',
15 |             '--output_dir',
16 |             './output/test_t5_grafting/',
17 |             '--model_name_or_path',
18 |             'output/t5-large_lora_minus_cnndm_once_global_free_inout_nodistill/mac0.4/epoch5/bz16/numprune5/parameq:0-23,ev:0-23,dq:0-23,dv:0-23,cq:0-23,cv:0-23,ei:0-23,di:0-23/lora_r8/lora_alpha16/best_model',
19 |             '--task_name',
20 |             'cnndm',
21 |             '--do_train',
22 |             '--do_eval',
23 |             '--max_input_length',
24 |             '512',
25 |             '--max_target_length',
26 |             '128',
27 |             '--per_device_train_batch_size',
28 |             '32',
29 |             '--per_device_eval_batch_size',
30 |             '32',
31 |             '--eval_accumulation_steps',
32 |             '1',
33 |             '--lora_r',
34 |             '8',
35 |             '--lora_alpha',
36 |             '16',
37 |             '--apply_lora',
38 |             '--pruner_type',
39 |             'global',
40 |             '--head_scorer_type',
41 |             'gradient_l2',
42 |             '--intermediate_scorer_type',
43 |             'gradient_l2',
44 |             '--pruning_batch_size',
45 |             '4',
46 |             '--pruning_batches',
47 |             '64',
48 |             '--pruning_scheduler',
49 |             'once',
50 |             ]
51 |     parser = HfArgumentParser(
52 |         (ModelArguments, Seq2SeqDataTrainingArguments, MinusTrainingArguments))
53 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
54 |         # If we pass only one argument to the script and it's the path to a json file,
55 |         # let's parse it to get our arguments.
56 |         model_args, data_args, training_args = parser.parse_json_file(
57 |             json_file=os.path.abspath(sys.argv[1]))
58 |     else:
59 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
60 |     
61 |     config, tokenizer, model = build_model(model_args, data_args, training_args)
62 | 
63 |     efficiency_results = efficiency_testing(model, tokenizer, training_args.device)


--------------------------------------------------------------------------------
/test/test_t5_prune_consistency.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ["WANDB_DISABLED"] = "true"
 3 | import sys
 4 | 
 5 | from transformers import HfArgumentParser
 6 | from args import Seq2SeqDataTrainingArguments
 7 | from models import build_model
 8 | from models.model_args import ModelArguments
 9 | from utils.utils import *
10 | from args import MinusTrainingArguments
11 | from utils.minus_utils import efficiency_testing, input_constructor
12 | 
13 | def main():
14 |     sys.argv = ['test_t5.py',
15 |             '--output_dir',
16 |             './output/test_t5_grafting/',
17 |             '--model_name_or_path',
18 |             'output/t5-large_lora_minus_xsum_once_global_free_inout_nodistill/mac0.05/epoch3/bz4/numprune3/parameq:0-23,ev:0-23,dq:0-23,dv:0-23,cq:0-23,cv:0-23,ei:0-23,di:0-23/lora_r8/prunestart0.01/pre_pruning_model',
19 |             '--task_name',
20 |             'xsum',
21 |             '--do_train',
22 |             '--do_eval',
23 |             '--max_input_length',
24 |             '936',
25 |             '--max_target_length',
26 |             '38',
27 |             '--per_device_train_batch_size',
28 |             '32',
29 |             '--per_device_eval_batch_size',
30 |             '32',
31 |             '--eval_accumulation_steps',
32 |             '1',
33 |             '--lora_r',
34 |             '8',
35 |             '--lora_alpha',
36 |             '16',
37 |             '--apply_lora',
38 |             '--pruner_type',
39 |             'global',
40 |             '--head_scorer_type',
41 |             'gradient_l2',
42 |             '--intermediate_scorer_type',
43 |             'gradient_l2',
44 |             '--pruning_batch_size',
45 |             '4',
46 |             '--pruning_batches',
47 |             '64',
48 |             '--pruning_scheduler',
49 |             'once',
50 |             ]
51 |     parser = HfArgumentParser(
52 |         (ModelArguments, Seq2SeqDataTrainingArguments, MinusTrainingArguments))
53 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
54 |         # If we pass only one argument to the script and it's the path to a json file,
55 |         # let's parse it to get our arguments.
56 |         model_args, data_args, training_args = parser.parse_json_file(
57 |             json_file=os.path.abspath(sys.argv[1]))
58 |     else:
59 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
60 |     
61 |     config, tokenizer, model = build_model(model_args, data_args, training_args)
62 | 
63 |     efficiency_results = efficiency_testing(model, tokenizer, training_args.device)


--------------------------------------------------------------------------------
/eval/mmlu/categories.py:
--------------------------------------------------------------------------------
 1 | subcategories = {  # subject name -> one-element list holding its subcategory label
 2 |     "abstract_algebra": ["math"],
 3 |     "anatomy": ["health"],
 4 |     "astronomy": ["physics"],
 5 |     "business_ethics": ["business"],
 6 |     "clinical_knowledge": ["health"],
 7 |     "college_biology": ["biology"],
 8 |     "college_chemistry": ["chemistry"],
 9 |     "college_computer_science": ["computer science"],
10 |     "college_mathematics": ["math"],
11 |     "college_medicine": ["health"],
12 |     "college_physics": ["physics"],
13 |     "computer_security": ["computer science"],
14 |     "conceptual_physics": ["physics"],
15 |     "econometrics": ["economics"],
16 |     "electrical_engineering": ["engineering"],
17 |     "elementary_mathematics": ["math"],
18 |     "formal_logic": ["philosophy"],
19 |     "global_facts": ["other"],
20 |     "high_school_biology": ["biology"],
21 |     "high_school_chemistry": ["chemistry"],
22 |     "high_school_computer_science": ["computer science"],
23 |     "high_school_european_history": ["history"],
24 |     "high_school_geography": ["geography"],
25 |     "high_school_government_and_politics": ["politics"],
26 |     "high_school_macroeconomics": ["economics"],
27 |     "high_school_mathematics": ["math"],
28 |     "high_school_microeconomics": ["economics"],
29 |     "high_school_physics": ["physics"],
30 |     "high_school_psychology": ["psychology"],
31 |     "high_school_statistics": ["math"],
32 |     "high_school_us_history": ["history"],
33 |     "high_school_world_history": ["history"],
34 |     "human_aging": ["health"],
35 |     "human_sexuality": ["culture"],
36 |     "international_law": ["law"],
37 |     "jurisprudence": ["law"],
38 |     "logical_fallacies": ["philosophy"],
39 |     "machine_learning": ["computer science"],
40 |     "management": ["business"],
41 |     "marketing": ["business"],
42 |     "medical_genetics": ["health"],
43 |     "miscellaneous": ["other"],
44 |     "moral_disputes": ["philosophy"],
45 |     "moral_scenarios": ["philosophy"],
46 |     "nutrition": ["health"],
47 |     "philosophy": ["philosophy"],
48 |     "prehistory": ["history"],
49 |     "professional_accounting": ["other"],
50 |     "professional_law": ["law"],
51 |     "professional_medicine": ["health"],
52 |     "professional_psychology": ["psychology"],
53 |     "public_relations": ["politics"],
54 |     "security_studies": ["politics"],
55 |     "sociology": ["culture"],
56 |     "us_foreign_policy": ["politics"],
57 |     "virology": ["health"],
58 |     "world_religions": ["philosophy"],
59 | }
60 | 
61 | categories = {  # top-level category -> the subcategory labels (values above) it groups
62 |     "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
63 |     "humanities": ["history", "philosophy", "law"],
64 |     "social sciences": ["politics", "culture", "economics", "geography", "psychology"],
65 |     "other (business, health, misc.)": ["other", "business", "health"],
66 | }
67 | 


--------------------------------------------------------------------------------
/scripts/adaptpruning_nodistill/t5_base_lm_adapt_mnli.sh:
--------------------------------------------------------------------------------
 1 | # LoRA + pruning run for google/t5-base-lm-adapt on MNLI (no distillation flags).
 2 | #
 3 | # Usage:
 4 | #   t5_base_lm_adapt_mnli.sh
 5 | #   t5_base_lm_adapt_mnli.sh MAC_CONSTRAINT LORA_R PRUNING_START PRUNING_SCHEDULER \
 6 | #                            PRUNER_TYPE PARAM_ALLOCATION_STRATEGY [GPU_ID]
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     mac_constraint=0.4
10 |     lora_r=8
11 |     pruning_start=1
12 |     pruning_scheduler=once
13 |     pruner_type=global
14 |     param_allocation_strategy=none
15 | elif [ "$#" -eq 6 ] || [ "$#" -eq 7 ]; then
16 |     mac_constraint=$1
17 |     lora_r=$2
18 |     pruning_start=$3
19 |     pruning_scheduler=$4
20 |     pruner_type=$5
21 |     param_allocation_strategy=$6
22 |     # The optional seventh argument pins the run to one GPU.
23 |     if [ "$#" -eq 7 ]; then
24 |         gpu_id=$7
25 |         export CUDA_VISIBLE_DEVICES=$gpu_id
26 |     fi
27 | else
28 |     # Any other argument count would leave the pruning settings unset.
29 |     echo "usage: t5_base_lm_adapt_mnli.sh [mac_constraint lora_r pruning_start pruning_scheduler pruner_type param_allocation_strategy [gpu_id]]" >&2
30 |     exit 1
31 | fi
32 | 
33 | model_name=google/t5-base-lm-adapt
34 | task_name=mnli
35 | adapter_type=lora
36 | # lora_alpha was referenced in output_dir without ever being assigned, so the
37 | # path ended in a bare "lora_alpha"; it is now defined once and also passed to
38 | # --lora_alpha (same value, 16, that was hard-coded there).
39 | lora_alpha=16
40 | pruning_batches=64
41 | num_prunings=5
42 | pruning_batch_size=4
43 | 
44 | learning_rate=1e-3
45 | training_batch_size=32
46 | num_train_epochs=40
47 | pruning_stop=30
48 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
49 | 
50 | 
51 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
52 | echo "$output_dir"
53 | mkdir -p "$output_dir"
54 | 
55 | python run_minus_training.py \
56 |     --output_dir "${output_dir}" \
57 |     --model_name_or_path "${model_name}" \
58 |     --do_train \
59 |     --do_eval \
60 |     --save_strategy no \
61 |     --evaluation_strategy steps \
62 |     --logging_strategy steps \
63 |     --eval_steps 5000 \
64 |     --logging_steps 5000 \
65 |     --minus_scheduler \
66 |     --task_name "${task_name}" \
67 |     --max_seq_length 128 \
68 |     --num_train_epochs "${num_train_epochs}" \
69 |     --per_device_train_batch_size "${training_batch_size}" \
70 |     --per_device_eval_batch_size "${training_batch_size}" \
71 |     --lr_scheduler_type linear \
72 |     --warmup_ratio 0.06 \
73 |     --learning_rate "${learning_rate}" \
74 |     --weight_decay 0.1 \
75 |     --apply_lora \
76 |     --lora_alpha "${lora_alpha}" \
77 |     --lora_r "${lora_r}" \
78 |     --report_to none \
79 |     --pruning_batches "${pruning_batches}" \
80 |     --pruning_batch_size "${pruning_batch_size}" \
81 |     --mac_constraint "${mac_constraint}" \
82 |     --pruning_scheduler "${pruning_scheduler}" \
83 |     --param_allocation_strategy "${param_allocation_strategy}" \
84 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
85 |     --head_scorer_type gradient_l2 \
86 |     --intermediate_scorer_type gradient_l2 \
87 |     --pruner_type "${pruner_type}" \
88 |     --pruning_start "${pruning_start}" \
89 |     --pruning_stop "${pruning_stop}" \
90 |     --num_prunings "${num_prunings}" \
91 |     --pruning_scheduler_strategy saliency


--------------------------------------------------------------------------------
/scripts/adaptpruning_nodistill/t5_base_lm_adapt_sst2.sh:
--------------------------------------------------------------------------------
 1 | # LoRA + pruning run for google/t5-base-lm-adapt on SST-2 (no distillation flags).
 2 | #
 3 | # Usage:
 4 | #   t5_base_lm_adapt_sst2.sh
 5 | #   t5_base_lm_adapt_sst2.sh MAC_CONSTRAINT LORA_R PRUNING_START PRUNING_SCHEDULER \
 6 | #                            PRUNER_TYPE PARAM_ALLOCATION_STRATEGY [GPU_ID]
 7 | 
 8 | if [ "$#" -eq 0 ]; then
 9 |     mac_constraint=0.4
10 |     lora_r=8
11 |     pruning_start=1
12 |     pruning_scheduler=once
13 |     pruner_type=global
14 |     param_allocation_strategy=none
15 | elif [ "$#" -eq 6 ] || [ "$#" -eq 7 ]; then
16 |     mac_constraint=$1
17 |     lora_r=$2
18 |     pruning_start=$3
19 |     pruning_scheduler=$4
20 |     pruner_type=$5
21 |     param_allocation_strategy=$6
22 |     # The optional seventh argument pins the run to one GPU.
23 |     if [ "$#" -eq 7 ]; then
24 |         gpu_id=$7
25 |         export CUDA_VISIBLE_DEVICES=$gpu_id
26 |     fi
27 | else
28 |     # Any other argument count would leave the pruning settings unset.
29 |     echo "usage: t5_base_lm_adapt_sst2.sh [mac_constraint lora_r pruning_start pruning_scheduler pruner_type param_allocation_strategy [gpu_id]]" >&2
30 |     exit 1
31 | fi
32 | 
33 | model_name=google/t5-base-lm-adapt
34 | task_name=sst2
35 | adapter_type=lora
36 | # lora_alpha was referenced in output_dir without ever being assigned, so the
37 | # path ended in a bare "lora_alpha"; it is now defined once and also passed to
38 | # --lora_alpha (same value, 16, that was hard-coded there).
39 | lora_alpha=16
40 | pruning_batches=64
41 | num_prunings=5
42 | pruning_batch_size=4
43 | 
44 | learning_rate=1e-3
45 | training_batch_size=32
46 | num_train_epochs=40
47 | pruning_stop=30
48 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
49 | 
50 | 
51 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
52 | echo "$output_dir"
53 | mkdir -p "$output_dir"
54 | 
55 | python run_minus_training.py \
56 |     --output_dir "${output_dir}" \
57 |     --model_name_or_path "${model_name}" \
58 |     --do_train \
59 |     --do_eval \
60 |     --save_strategy no \
61 |     --evaluation_strategy steps \
62 |     --logging_strategy steps \
63 |     --eval_steps 5000 \
64 |     --logging_steps 5000 \
65 |     --minus_scheduler \
66 |     --task_name "${task_name}" \
67 |     --max_seq_length 128 \
68 |     --num_train_epochs "${num_train_epochs}" \
69 |     --per_device_train_batch_size "${training_batch_size}" \
70 |     --per_device_eval_batch_size "${training_batch_size}" \
71 |     --lr_scheduler_type linear \
72 |     --warmup_ratio 0.06 \
73 |     --learning_rate "${learning_rate}" \
74 |     --weight_decay 0.1 \
75 |     --apply_lora \
76 |     --lora_alpha "${lora_alpha}" \
77 |     --lora_r "${lora_r}" \
78 |     --report_to none \
79 |     --pruning_batches "${pruning_batches}" \
80 |     --pruning_batch_size "${pruning_batch_size}" \
81 |     --mac_constraint "${mac_constraint}" \
82 |     --pruning_scheduler "${pruning_scheduler}" \
83 |     --param_allocation_strategy "${param_allocation_strategy}" \
84 |     --teacher_param_tuning_config "${teacher_param_tuning_config}" \
85 |     --head_scorer_type gradient_l2 \
86 |     --intermediate_scorer_type gradient_l2 \
87 |     --pruner_type "${pruner_type}" \
88 |     --pruning_start "${pruning_start}" \
89 |     --pruning_stop "${pruning_stop}" \
90 |     --num_prunings "${num_prunings}" \
91 |     --pruning_scheduler_strategy saliency


--------------------------------------------------------------------------------
/scripts/train_ft_distill_seq2seq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | if [ "$#" -eq 0 ]; then
12 |     model_name=roberta-base
13 |     teacher_path=textattack/roberta-base-SST-2
14 |     task_name=sst2
15 |     learning_rate=2e-5
16 |     training_batch_size=32
17 |     num_train_epochs=20
18 |     distill_mapping_strategy=static_teacher_static_student
19 | elif [ "$#" -eq 3 ]; then
20 |     model_name=$1
21 |     teacher_path=$2
22 |     task_name=$3
23 |     learning_rate=2e-5
24 |     training_batch_size=32
25 |     num_train_epochs=20
26 |     distill_mapping_strategy=static_teacher_static_student
27 | elif [ "$#" -eq 7 ]; then
28 |     model_name=$1
29 |     teacher_path=$2
30 |     task_name=$3
31 |     learning_rate=$4
32 |     training_batch_size=$5
33 |     num_train_epochs=$6
34 |     distill_mapping_strategy=$7
35 | fi
36 | 
37 | lora_r=8
38 | lora_alpha=16
39 | 
40 | if [ -d $model_name ]
41 | then
42 |     output_dir="${model_name}/finetune_distilled/epoch${num_train_epochs}/bz${training_batch_size}/lr${learning_rate}/${distill_mapping_strategy}"
43 | else
44 |     output_dir="output/${model_name}_minus_${task_name}_mapping_${distill_mapping_strategy}_distill_fixedteacher/epoch${num_train_epochs}/bz${training_batch_size}/"
45 | fi
46 | 
47 | 
48 | echo $output_dir
49 | mkdir -p $output_dir
50 | 
51 | python run_minus_seq2seq_training.py \
52 |     --output_dir ${output_dir}\
53 |     --task_name ${task_name} \
54 |     --model_name_or_path ${model_name} \
55 |     --do_train \
56 |     --do_eval \
57 |     --save_strategy no \
58 |     --log_level info \
59 |     --log_level_replica info \
60 |     --evaluation_strategy steps \
61 |     --logging_strategy steps \
62 |     --logging_steps 1000 \
63 |     --eval_steps 5000 \
64 |     --task_name ${task_name} \
65 |     --max_input_length 512 \
66 |     --max_target_length 128 \
67 |     --num_train_epochs ${num_train_epochs} \
68 |     --per_device_train_batch_size ${training_batch_size} \
69 |     --per_device_eval_batch_size ${training_batch_size} \
70 |     --tf32 True \
71 |     --distillation_type self_student \
72 |     --distill_mapping_strategy ${distill_mapping_strategy} \
73 |     --warmup_ratio 0.06\
74 |     --learning_rate ${learning_rate}\
75 |     --weight_decay 0.1\
76 |     --lora_alpha ${lora_alpha} \
77 |     --lora_r ${lora_r} \
78 |     --report_to none \
79 |     --do_distill \
80 |     --distill_start 0 \
81 |     --distill_epoch ${num_train_epochs} \
82 |     --teacher_path ${teacher_path} | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/adaptpruning_nodistill/bert_base_mnli.sh:
--------------------------------------------------------------------------------
 1 | # Adaptive pruning + LoRA tuning of bert-base-uncased on MNLI (no distillation flags are passed).
 2 | usage="Usage: $0 [mac_constraint lora_r lora_alpha pruning_scheduler pruner_type param_allocation_strategy [gpu_id]]"
 3 | if [ "$#" -eq 0 ]; then
 4 |     # No arguments: fall back to the default configuration.
 5 |     mac_constraint=0.4
 6 |     lora_r=8
 7 |     lora_alpha=16
 8 |     pruning_scheduler=once
 9 |     pruner_type=global
10 |     param_allocation_strategy=free_inout
11 | elif [ "$#" -eq 6 ] || [ "$#" -eq 7 ]; then
12 |     mac_constraint=$1
13 |     lora_r=$2
14 |     lora_alpha=$3
15 |     pruning_scheduler=$4
16 |     pruner_type=$5
17 |     param_allocation_strategy=$6
18 |     if [ "$#" -eq 7 ]; then
19 |         # Optional 7th argument restricts the run to the given GPU id(s).
20 |         gpu_id=$7
21 |         export CUDA_VISIBLE_DEVICES=$gpu_id
22 |     fi
23 | else
24 |     # Any other argument count used to fall through with every setting unset.
25 |     echo "$usage" >&2
26 |     exit 1
27 | fi
28 | 
29 | model_name=bert-base-uncased
30 | task_name=mnli
31 | adapter_type=lora
32 | pruning_start=1
33 | continuous_alloc_interval=1
34 | pruning_batches=256
35 | num_prunings=5
36 | pruning_batch_size=4
37 | 
38 | learning_rate=2e-5
39 | training_batch_size=32
40 | num_train_epochs=30
41 | pruning_stop=20
42 | teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
43 | 
44 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_nodistill_evensmallerlr/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
45 | echo "$output_dir"
46 | mkdir -p "$output_dir"
47 | 
48 | python run_minus_training.py \
49 |     --output_dir "${output_dir}" \
50 |     --task_name ${task_name} \
51 |     --model_name_or_path ${model_name} \
52 |     --do_train \
53 |     --do_eval \
54 |     --save_strategy no \
55 |     --evaluation_strategy epoch \
56 |     --logging_strategy epoch \
57 |     --minus_scheduler \
58 |     --max_seq_length 128 \
59 |     --num_train_epochs ${num_train_epochs} \
60 |     --per_device_train_batch_size ${training_batch_size} \
61 |     --per_device_eval_batch_size ${training_batch_size} \
62 |     --lr_scheduler_type linear \
63 |     --warmup_ratio 0.06 \
64 |     --learning_rate ${learning_rate} \
65 |     --weight_decay 0.1 \
66 |     --apply_lora \
67 |     --lora_alpha ${lora_alpha} \
68 |     --lora_r ${lora_r} \
69 |     --report_to none \
70 |     --pruning_batches ${pruning_batches} \
71 |     --mac_constraint ${mac_constraint} \
72 |     --pruning_scheduler ${pruning_scheduler} \
73 |     --param_allocation_strategy ${param_allocation_strategy} \
74 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
75 |     --continuous_allocation \
76 |     --continuous_alloc_interval ${continuous_alloc_interval} \
77 |     --pruning_start ${pruning_start} \
78 |     --pruning_stop ${pruning_stop} \
79 |     --head_scorer_type gradient_l2 \
80 |     --intermediate_scorer_type gradient_l2 \
81 |     --pruner_type ${pruner_type} \
82 |     --num_prunings ${num_prunings} \
83 |     --pruning_batch_size ${pruning_batch_size} \
84 |     --pruning_scheduler_strategy saliency


--------------------------------------------------------------------------------
/scripts/adaptpruning_nodistill/bert_base_squad.sh:
--------------------------------------------------------------------------------
 1 | # Adaptive pruning + LoRA tuning of bert-base-uncased via run_minus_squad_training.py (no distillation flags are passed).
 2 | usage="Usage: $0 [mac_constraint lora_r lora_alpha pruning_scheduler pruner_type param_allocation_strategy [gpu_id]]"
 3 | if [ "$#" -eq 0 ]; then
 4 |     # No arguments: fall back to the default configuration.
 5 |     mac_constraint=0.4
 6 |     lora_r=8
 7 |     lora_alpha=16
 8 |     pruning_scheduler=once
 9 |     pruner_type=global
10 |     param_allocation_strategy=free_inout
11 | elif [ "$#" -eq 6 ] || [ "$#" -eq 7 ]; then
12 |     mac_constraint=$1
13 |     lora_r=$2
14 |     lora_alpha=$3
15 |     pruning_scheduler=$4
16 |     pruner_type=$5
17 |     param_allocation_strategy=$6
18 |     if [ "$#" -eq 7 ]; then
19 |         # Optional 7th argument restricts the run to the given GPU id(s).
20 |         gpu_id=$7
21 |         export CUDA_VISIBLE_DEVICES=$gpu_id
22 |     fi
23 | else
24 |     # Any other argument count used to fall through with every setting unset.
25 |     echo "$usage" >&2
26 |     exit 1
27 | fi
28 | 
29 | model_name=bert-base-uncased
30 | task_name=squad
31 | adapter_type=lora
32 | pruning_start=1
33 | continuous_alloc_interval=1
34 | pruning_batches=64
35 | num_prunings=5
36 | pruning_batch_size=4
37 | 
38 | learning_rate=1e-4
39 | training_batch_size=48
40 | num_train_epochs=10
41 | pruning_stop=8
42 | teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
43 | 
44 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
45 | echo "$output_dir"
46 | mkdir -p "$output_dir"
47 | 
48 | python run_minus_squad_training.py \
49 |     --output_dir "${output_dir}" \
50 |     --model_name_or_path ${model_name} \
51 |     --do_train \
52 |     --do_eval \
53 |     --save_strategy no \
54 |     --evaluation_strategy steps \
55 |     --logging_strategy steps \
56 |     --eval_steps 1000 \
57 |     --logging_steps 1000 \
58 |     --minus_scheduler \
59 |     --max_seq_length 384 \
60 |     --doc_stride 128 \
61 |     --num_train_epochs ${num_train_epochs} \
62 |     --per_device_train_batch_size ${training_batch_size} \
63 |     --per_device_eval_batch_size ${training_batch_size} \
64 |     --lr_scheduler_type linear \
65 |     --warmup_ratio 0.06 \
66 |     --learning_rate ${learning_rate} \
67 |     --weight_decay 0.1 \
68 |     --apply_lora \
69 |     --lora_alpha ${lora_alpha} \
70 |     --lora_r ${lora_r} \
71 |     --report_to none \
72 |     --pruning_batches ${pruning_batches} \
73 |     --mac_constraint ${mac_constraint} \
74 |     --pruning_scheduler ${pruning_scheduler} \
75 |     --param_allocation_strategy ${param_allocation_strategy} \
76 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
77 |     --continuous_allocation \
78 |     --continuous_alloc_interval ${continuous_alloc_interval} \
79 |     --pruning_start ${pruning_start} \
80 |     --pruning_stop ${pruning_stop} \
81 |     --head_scorer_type gradient_l2 \
82 |     --intermediate_scorer_type gradient_l2 \
83 |     --pruner_type ${pruner_type} \
84 |     --num_prunings ${num_prunings} \
85 |     --pruning_batch_size ${pruning_batch_size} \
86 |     --pruning_scheduler_strategy saliency


--------------------------------------------------------------------------------
/test/test_param_tuning.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ["WANDB_DISABLED"] = "true"
 3 | import sys
 4 | import torch
 5 | from transformers import HfArgumentParser
 6 | from args import DataTrainingArguments
 7 | from models import build_model
 8 | from models.model_args import ModelArguments
 9 | from utils.utils import *
10 | from args import MinusTrainingArguments
11 | from utils import build_trainer
12 | 
13 | def main():
14 |     sys.argv = ['neuron_importance.py',
15 |             '--output_dir',
16 |             './output/neuron_importance/',
17 |             '--model_name_or_path',
18 |             'output/debug_output',
19 |             '--task_name',
20 |             'mnli',
21 |             '--do_eval',
22 |             '--max_seq_length',
23 |             '128',
24 |             '--per_device_train_batch_size',
25 |             '32',
26 |             '--per_device_eval_batch_size',
27 |             '32',
28 |             '--lora_r',
29 |             '64',
30 |             '--apply_lora',
31 |             '--report_to',
32 |             'none',
33 |             ]
34 |     parser = HfArgumentParser(
35 |         (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
36 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
37 |         # If we pass only one argument to the script and it's the path to a json file,
38 |         # let's parse it to get our arguments.
39 |         model_args, data_args, training_args = parser.parse_json_file(
40 |             json_file=os.path.abspath(sys.argv[1]))
41 |     else:
42 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
43 |     t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)
44 |         
45 |     config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets)
46 |     train_dataset, eval_dataset, _, is_regression = build_data(model_args, data_args, training_args, model, tokenizer, config, raw_datasets)
47 |     trainer = build_trainer(data_args, training_args, model, tokenizer, train_dataset, eval_dataset, param_controller=None)
48 |     
49 |     model_args.model_name_or_path = os.path.join(model_args.model_name_or_path, 'best_model')
50 |     # training_args.disable_tqdm = False
51 |     config, tokenizer, best_model = build_model(model_args, data_args, training_args, t_name, raw_datasets)
52 |     best_trainer = build_trainer(data_args, training_args, best_model, tokenizer, train_dataset, eval_dataset, param_controller=None)
53 |     
54 |     final_model_params = dict(model.named_parameters())
55 |     best_model_params = dict(best_model.named_parameters())
56 |     tuned_params = [
57 |         k for k in final_model_params if not torch.allclose(final_model_params[k], best_model_params[k])
58 |     ]
59 |     sum_tuned_params = sum([torch.numel(final_model_params[k]) for k in tuned_params])
60 |     sum_changed_params = sum([(final_model_params[k] != best_model_params[k]).sum() for k in tuned_params])
61 |     
62 | if __name__ == '__main__':
63 |     main()


--------------------------------------------------------------------------------
/scripts/adaptpruning_nodistill/roberta_base_squad.sh:
--------------------------------------------------------------------------------
 1 | # Adaptive pruning + LoRA tuning of roberta-base via run_minus_squad_training.py (no distillation flags are passed).
 2 | usage="Usage: $0 [mac_constraint lora_r lora_alpha pruning_scheduler pruner_type param_allocation_strategy [gpu_id]]"
 3 | if [ "$#" -eq 0 ]; then
 4 |     # No arguments: fall back to the default configuration.
 5 |     mac_constraint=0.4
 6 |     lora_r=8
 7 |     lora_alpha=16
 8 |     pruning_scheduler=once
 9 |     pruner_type=global
10 |     param_allocation_strategy=free_inout
11 | elif [ "$#" -eq 6 ] || [ "$#" -eq 7 ]; then
12 |     mac_constraint=$1
13 |     lora_r=$2
14 |     lora_alpha=$3
15 |     pruning_scheduler=$4
16 |     pruner_type=$5
17 |     param_allocation_strategy=$6
18 |     if [ "$#" -eq 7 ]; then
19 |         # Optional 7th argument restricts the run to the given GPU id(s).
20 |         gpu_id=$7
21 |         export CUDA_VISIBLE_DEVICES=$gpu_id
22 |     fi
23 | else
24 |     # Any other argument count used to fall through with every setting unset.
25 |     echo "$usage" >&2
26 |     exit 1
27 | fi
28 | 
29 | model_name=roberta-base
30 | task_name=squad
31 | adapter_type=lora
32 | pruning_start=0.1
33 | continuous_alloc_interval=1
34 | pruning_batches=64
35 | num_prunings=5
36 | pruning_batch_size=4
37 | 
38 | learning_rate=1e-4
39 | training_batch_size=48
40 | num_train_epochs=10
41 | pruning_stop=8
42 | teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
43 | 
44 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/prunestart${pruning_start}"
45 | echo "$output_dir"
46 | mkdir -p "$output_dir"
47 | 
48 | python run_minus_squad_training.py \
49 |     --output_dir "${output_dir}" \
50 |     --model_name_or_path ${model_name} \
51 |     --do_train \
52 |     --do_eval \
53 |     --save_strategy no \
54 |     --evaluation_strategy steps \
55 |     --logging_strategy steps \
56 |     --eval_steps 1000 \
57 |     --logging_steps 1000 \
58 |     --minus_scheduler \
59 |     --max_seq_length 384 \
60 |     --doc_stride 128 \
61 |     --num_train_epochs ${num_train_epochs} \
62 |     --per_device_train_batch_size ${training_batch_size} \
63 |     --per_device_eval_batch_size ${training_batch_size} \
64 |     --lr_scheduler_type linear \
65 |     --warmup_ratio 0.06 \
66 |     --learning_rate ${learning_rate} \
67 |     --weight_decay 0.1 \
68 |     --apply_lora \
69 |     --adapter_type ${adapter_type} \
70 |     --lora_alpha ${lora_alpha} \
71 |     --lora_r ${lora_r} \
72 |     --report_to none \
73 |     --pruning_batches ${pruning_batches} \
74 |     --mac_constraint ${mac_constraint} \
75 |     --pruning_scheduler ${pruning_scheduler} \
76 |     --param_allocation_strategy ${param_allocation_strategy} \
77 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
78 |     --continuous_allocation \
79 |     --continuous_alloc_interval ${continuous_alloc_interval} \
80 |     --pruning_start ${pruning_start} \
81 |     --pruning_stop ${pruning_stop} \
82 |     --head_scorer_type gradient_l2 \
83 |     --intermediate_scorer_type gradient_l2 \
84 |     --pruner_type ${pruner_type} \
85 |     --num_prunings ${num_prunings} \
86 |     --pruning_batch_size ${pruning_batch_size} \
87 |     --pruning_scheduler_strategy saliency


--------------------------------------------------------------------------------
/scripts/train_ft_distill.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # Distillation run through run_minus_training.py with a teacher checkpoint (--apply_lora is not passed).
12 | usage="Usage: $0 [model_name teacher_path task_name [learning_rate training_batch_size num_train_epochs distill_mapping_strategy]]"
13 | if [ "$#" -eq 0 ]; then
14 |     model_name=roberta-base
15 |     teacher_path=textattack/roberta-base-SST-2
16 |     task_name=sst2
17 |     learning_rate=2e-5
18 |     training_batch_size=32
19 |     num_train_epochs=20
20 |     distill_mapping_strategy=static_teacher_static_student
21 | elif [ "$#" -eq 3 ]; then
22 |     model_name=$1
23 |     teacher_path=$2
24 |     task_name=$3
25 |     learning_rate=2e-5
26 |     training_batch_size=32
27 |     num_train_epochs=20
28 |     distill_mapping_strategy=static_teacher_static_student
29 | elif [ "$#" -eq 7 ]; then
30 |     model_name=$1
31 |     teacher_path=$2
32 |     task_name=$3
33 |     learning_rate=$4
34 |     training_batch_size=$5
35 |     num_train_epochs=$6
36 |     distill_mapping_strategy=$7
37 | else
38 |     # Any other argument count used to fall through with every setting unset.
39 |     echo "$usage" >&2
40 |     exit 1
41 | fi
42 | 
43 | lora_r=8
44 | lora_alpha=16
45 | teacher_param_tuning_config=q:0-11,v:0-11
46 | student_param_tuning_config=q:0-11,v:0-11
47 | 
48 | # A local checkpoint directory gets its outputs nested inside it; quoted so an empty value cannot make the test true.
49 | if [ -d "$model_name" ]; then
50 |     output_dir="${model_name}/finetune_distilled/epoch${num_train_epochs}/bz${training_batch_size}/lr${learning_rate}/${distill_mapping_strategy}"
51 | else
52 |     output_dir="output/${model_name}_minus_${task_name}_mapping_${distill_mapping_strategy}_distill_fixedteacher/epoch${num_train_epochs}/bz${training_batch_size}/"
53 | fi
54 | 
55 | echo "$output_dir"
56 | mkdir -p "$output_dir"
57 | 
58 | python run_minus_training.py \
59 |     --output_dir "${output_dir}" \
60 |     --task_name ${task_name} \
61 |     --model_name_or_path "${model_name}" \
62 |     --do_train \
63 |     --do_eval \
64 |     --save_strategy no \
65 |     --evaluation_strategy steps \
66 |     --logging_strategy steps \
67 |     --logging_steps 1000 \
68 |     --log_level info \
69 |     --log_level_replica info \
70 |     --eval_steps 5000 \
71 |     --max_seq_length 128 \
72 |     --num_train_epochs ${num_train_epochs} \
73 |     --per_device_train_batch_size ${training_batch_size} \
74 |     --per_device_eval_batch_size ${training_batch_size} \
75 |     --distillation_type self_student \
76 |     --distill_mapping_strategy ${distill_mapping_strategy} \
77 |     --warmup_ratio 0.06 \
78 |     --learning_rate ${learning_rate} \
79 |     --weight_decay 0.1 \
80 |     --lora_alpha ${lora_alpha} \
81 |     --lora_r ${lora_r} \
82 |     --report_to none \
83 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
84 |     --student_param_tuning_config ${student_param_tuning_config} \
85 |     --do_distill \
86 |     --distill_start 0 \
87 |     --distill_epoch ${num_train_epochs} \
88 |     --teacher_path "${teacher_path}" | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/test/test_pruned_teacher_training.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import os
 3 | os.environ["WANDB_DISABLED"] = "true"
 4 | import sys
 5 | import time
 6 | 
 7 | from transformers import HfArgumentParser
 8 | from args import DataTrainingArguments
 9 | from models import build_model
10 | from models.model_args import ModelArguments
11 | from utils import build_trainer
12 | from utils.utils import *
13 | from args import MinusTrainingArguments
14 | from utils.cofi_utils import prune_model_with_z
15 | 
16 | def main():
17 |     sys.argv = ['test_pruned_teacher_training.py',
18 |             '--output_dir',
19 |             './output/test_pruned_teacher_training/',
20 |             '--model_name_or_path',
21 |             'output/roberta-base_lora_minus_mnli_once_global_co_learning_loratransform_distill/mac0.4/epoch25/bz128/numprune5/lora_r64/lora_alpha16/pre_pruning_model',
22 |             '--task_name',
23 |             'mnli',
24 |             '--evaluation_strategy',
25 |             'steps',
26 |             '--save_strategy',
27 |             'no',
28 |             '--do_train',
29 |             '--do_eval',
30 |             '--max_seq_length',
31 |             '128',
32 |             '--per_device_train_batch_size',
33 |             '32',
34 |             '--per_device_eval_batch_size',
35 |             '32',
36 |             '--apply_lora',
37 |             '--lora_r',
38 |             '8',
39 |             '--lora_alpha',
40 |             '16',
41 |             '--num_train_epochs',
42 |             '30',
43 |             '--learning_rate',
44 |             '5e-4',
45 |             '--warmup_ratio',
46 |             '0.06',
47 |             '--weight_decay',
48 |             '0.1',
49 |             ]
50 |     parser = HfArgumentParser(
51 |         (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
52 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
53 |         # If we pass only one argument to the script and it's the path to a json file,
54 |         # let's parse it to get our arguments.
55 |         model_args, data_args, training_args = parser.parse_json_file(
56 |             json_file=os.path.abspath(sys.argv[1]))
57 |     else:
58 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
59 |     os.makedirs(training_args.output_dir, exist_ok=True)
60 |     t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)
61 |     config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets)
62 |     train_dataset, eval_dataset, _, is_regression = build_data(model_args, data_args, training_args, model, tokenizer, config, raw_datasets)
63 |     training_args.disable_tqdm = True
64 |     head_mask, intermediate_mask = torch.load(os.path.join(model_args.model_name_or_path, 'head_mask.pt')), torch.load(os.path.join(model_args.model_name_or_path, 'intermediate_mask.pt'))
65 |     head_mask[-4:, :] = 1
66 |     intermediate_mask[-4:, :] = 1
67 |     zs = {
68 |         'head_z': [v.to('cpu') for v in head_mask],
69 |         'intermediate_z': [v.to('cpu') for v in intermediate_mask],
70 |     }
71 |     prune_model_with_z(zs, model)
72 | 
73 |     model.head_mask, model.intermediate_mask = None, None
74 |     trainer = build_trainer(data_args, training_args, model, tokenizer, train_dataset, eval_dataset)


--------------------------------------------------------------------------------
/merge_model_lora.py:
--------------------------------------------------------------------------------
 1 | import seaborn as sns
 2 | sns.set_theme(style="darkgrid")
 3 | import os
 4 | os.environ["WANDB_DISABLED"] = "true"
 5 | import sys
 6 | import torch
 7 | import loralib as lora
 8 | 
 9 | from transformers import (HfArgumentParser)
10 | from args import DataTrainingArguments
11 | from models.model_args import ModelArguments
12 | from utils.utils import *
13 | from utils import build_trainer
14 | from utils.minus_utils import lora_to_linear
15 | from args import MinusTrainingArguments
16 | from models import build_model
17 | from torch.utils.data import Subset
18 | from utils.fisher_utils.efficiency.param import *
19 | 
20 | def main():
21 |     parser = HfArgumentParser(
22 |         (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
23 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
24 |         # If we pass only one argument to the script and it's the path to a json file,
25 |         # let's parse it to get our arguments.
26 |         model_args, data_args, training_args = parser.parse_json_file(
27 |             json_file=os.path.abspath(sys.argv[1]))
28 |     else:
29 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
30 |     os.makedirs(training_args.output_dir, exist_ok=True)
31 |     # training_args.disable_tqdm = False
32 |     t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)
33 |     config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets, force_model_shape_deduction=True)
34 |     train_dataset, eval_dataset, _, is_regression = build_data(model_args, data_args, training_args, model, tokenizer, config, raw_datasets)
35 |     IS_SQUAD = 'squad' in data_args.task_name.lower()
36 |     model.head_mask = torch.load(os.path.join(model_args.model_name_or_path, 'head_mask.pt')).to(training_args.device)
37 |     model.intermediate_mask = torch.load(os.path.join(model_args.model_name_or_path, 'intermediate_mask.pt')).to(training_args.device)
38 |     model.hidden_mask = torch.load(os.path.join(model_args.model_name_or_path, 'hidden_mask.pt')).to(training_args.device)
39 | 
40 |     trainer = build_trainer(data_args, training_args, model, tokenizer, train_dataset, eval_dataset, param_controller=None)
41 |     fixed_scaling = True
42 |     if fixed_scaling:
43 |         for m in model.modules():
44 |             if isinstance(m, lora.Linear):
45 |                 m.scaling = model_args.lora_alpha / model_args.lora_r
46 |     model_param_num = sum(p.numel() for p in model.parameters())
47 |     print("Unmerged model's performance: ", trainer.evaluate())
48 |     for n, m in dict(model.named_modules()).items():
49 |         for child_name, child in dict(m.named_children()).items():
50 |             if isinstance(child, lora.Linear):
51 |                 print("Merging layer {}".format(n + '.' + child_name))
52 |                 delattr(m, child_name)
53 |                 merged_layer = lora_to_linear(child)
54 |                 setattr(m, child_name, merged_layer)
55 |     
56 |     model_param_num_merged = sum(p.numel() for p in model.parameters())
57 |     print("Merged model's performance: ", trainer.evaluate())
58 |     print("Parmeter number reduced from {} to {}, with {} parameters removed".format(model_param_num, model_param_num_merged, model_param_num - model_param_num_merged))
59 |     
60 |     trainer.save_model()
61 | 
62 | if __name__ == '__main__':
63 |     main()


--------------------------------------------------------------------------------
/eval/dispatch_openai_requests.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | This file is copied and modified from https://gist.github.com/neubig/80de662fb3e225c18172ec218be4917a.
 3 | Thanks to Graham Neubig for sharing the original code.
 4 | '''
 5 | 
 6 | import openai
 7 | import asyncio
 8 | from typing import Any, List, Dict
 9 | 
10 | async def dispatch_openai_chat_requesets(
11 |     messages_list: List[List[Dict[str,Any]]],
12 |     model: str,
13 |     **completion_kwargs: Any,
14 | ) -> List[str]:
15 |     """Dispatches requests to OpenAI chat completion API asynchronously.
16 |     
17 |     Args:
18 |         messages_list: List of messages to be sent to OpenAI chat completion API.
19 |         model: OpenAI model to use.
20 |         completion_kwargs: Keyword arguments to be passed to OpenAI ChatCompletion API. See https://platform.openai.com/docs/api-reference/chat for details.
21 |     Returns:
22 |         List of responses from OpenAI API.
23 |     """
24 |     async_responses = [
25 |         openai.ChatCompletion.acreate(
26 |             model=model,
27 |             messages=x,
28 |             **completion_kwargs,
29 |         )
30 |         for x in messages_list
31 |     ]
32 |     return await asyncio.gather(*async_responses)
33 | 
34 | 
35 | async def dispatch_openai_prompt_requesets(
36 |     prompt_list: List[str],
37 |     model: str,
38 |     **completion_kwargs: Any,
39 | ) -> List[str]:
40 |     """Dispatches requests to OpenAI text completion API asynchronously.
41 |     
42 |     Args:
43 |         prompt_list: List of prompts to be sent to OpenAI text completion API.
44 |         model: OpenAI model to use.
45 |         completion_kwargs: Keyword arguments to be passed to OpenAI text completion API. See https://platform.openai.com/docs/api-reference/completions for details.
46 |     Returns:
47 |         List of responses from OpenAI API.
48 |     """
49 |     async_responses = [
50 |         openai.Completion.acreate(
51 |             model=model,
52 |             prompt=x,
53 |             **completion_kwargs,
54 |         )
55 |         for x in prompt_list
56 |     ]
57 |     return await asyncio.gather(*async_responses)
58 | 
59 | 
60 | if __name__ == "__main__":
61 |     chat_completion_responses = asyncio.run(
62 |         dispatch_openai_chat_requesets(
63 |             messages_list=[
64 |                 [{"role": "user", "content": "Write a poem about asynchronous execution."}],
65 |                 [{"role": "user", "content": "Write a poem about asynchronous pirates."}],
66 |             ],
67 |             model="gpt-3.5-turbo",
68 |             temperature=0.3,
69 |             max_tokens=200,
70 |             top_p=1.0,
71 | 
72 |         )
73 |     )
74 | 
75 |     for i, x in enumerate(chat_completion_responses):
76 |         print(f"Chat completion response {i}:\n{x['choices'][0]['message']['content']}\n\n")
77 | 
78 | 
79 |     prompt_completion_responses = asyncio.run(
80 |         dispatch_openai_prompt_requesets(
81 |             prompt_list=[
82 |                 "Write a poem about asynchronous execution.\n",
83 |                 "Write a poem about asynchronous pirates.\n",
84 |             ],
85 |             model="text-davinci-003",
86 |             temperature=0.3,
87 |             max_tokens=200,
88 |             top_p=1.0,
89 |         )
90 |     )
91 | 
92 |     for i, x in enumerate(prompt_completion_responses):
93 |         print(f"Prompt completion response {i}:\n{x['choices'][0]['text']}\n\n")


--------------------------------------------------------------------------------
/scripts/train_lora_distill_squadv2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # LoRA distillation run through run_minus_squad_training.py with --version_2_with_negative.
12 | usage="Usage: $0 [model_name teacher_path [lora_r lora_alpha learning_rate training_batch_size num_train_epochs distill_mapping_strategy para_config]]"
13 | if [ "$#" -eq 0 ]; then
14 |     model_name=roberta-base
15 |     # NOTE(review): this default teacher is an SST-2 checkpoint -- confirm it is intended for this script.
16 |     teacher_path=textattack/roberta-base-SST-2
17 |     lora_r=8
18 |     lora_alpha=16
19 |     learning_rate=2e-4
20 |     training_batch_size=32
21 |     num_train_epochs=20
22 |     distill_mapping_strategy=static_teacher_static_student
23 |     para_config=q:0-11,v:0-11
24 | elif [ "$#" -eq 2 ]; then
25 |     model_name=$1
26 |     teacher_path=$2
27 |     lora_r=8
28 |     lora_alpha=16
29 |     learning_rate=2e-4
30 |     training_batch_size=32
31 |     num_train_epochs=20
32 |     para_config=q:0-11,v:0-11
33 |     distill_mapping_strategy=static_teacher_static_student
34 | elif [ "$#" -eq 9 ]; then
35 |     model_name=$1
36 |     teacher_path=$2
37 |     lora_r=$3
38 |     lora_alpha=$4
39 |     learning_rate=$5
40 |     training_batch_size=$6
41 |     num_train_epochs=$7
42 |     distill_mapping_strategy=$8
43 |     para_config=$9
44 | else
45 |     # Any other argument count used to fall through with every setting unset.
46 |     echo "$usage" >&2
47 |     exit 1
48 | fi
49 | 
50 | # task_name was never assigned, leaving an empty segment in the fallback output path; it only labels that path here.
51 | task_name=squadv2
52 | adapter_type=lora
53 | teacher_param_tuning_config=${para_config}
54 | student_param_tuning_config=${para_config}
55 | 
56 | # A local checkpoint directory gets its outputs nested inside it; quoted so an empty value cannot make the test true.
57 | if [ -d "$model_name" ]; then
58 |     output_dir="${model_name}/lora_distilled/epoch${num_train_epochs}/bz${training_batch_size}/lr${learning_rate}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}/${distill_mapping_strategy}"
59 | else
60 |     output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_mapping_${distill_mapping_strategy}_distill_fixedteacher/epoch${num_train_epochs}/bz${training_batch_size}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
61 | fi
62 | 
63 | echo "$output_dir"
64 | mkdir -p "$output_dir"
65 | 
66 | python run_minus_squad_training.py \
67 |     --output_dir "${output_dir}" \
68 |     --model_name_or_path "${model_name}" \
69 |     --do_train \
70 |     --do_eval \
71 |     --save_strategy no \
72 |     --evaluation_strategy steps \
73 |     --logging_strategy steps \
74 |     --logging_steps 1000 \
75 |     --log_level info \
76 |     --log_level_replica info \
77 |     --eval_steps 5000 \
78 |     --max_seq_length 384 \
79 |     --doc_stride 128 \
80 |     --version_2_with_negative \
81 |     --num_train_epochs ${num_train_epochs} \
82 |     --per_device_train_batch_size ${training_batch_size} \
83 |     --per_device_eval_batch_size ${training_batch_size} \
84 |     --distillation_type self_student \
85 |     --distill_mapping_strategy ${distill_mapping_strategy} \
86 |     --warmup_ratio 0.06 \
87 |     --learning_rate ${learning_rate} \
88 |     --weight_decay 0.1 \
89 |     --apply_lora \
90 |     --lora_alpha ${lora_alpha} \
91 |     --lora_r ${lora_r} \
92 |     --report_to none \
93 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
94 |     --student_param_tuning_config ${student_param_tuning_config} \
95 |     --do_distill \
96 |     --distill_start 0 \
97 |     --distill_epoch ${num_train_epochs} \
98 |     --teacher_path "${teacher_path}" | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/scripts/train_lora_distill.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | # LoRA distillation run through run_minus_training.py with a teacher checkpoint.
12 | usage="Usage: $0 [model_name teacher_path task_name [lora_r lora_alpha learning_rate training_batch_size num_train_epochs distill_mapping_strategy para_config]]"
13 | if [ "$#" -eq 0 ]; then
14 |     model_name=roberta-base
15 |     teacher_path=textattack/roberta-base-SST-2
16 |     task_name=sst2
17 |     lora_r=8
18 |     lora_alpha=16
19 |     learning_rate=2e-4
20 |     training_batch_size=32
21 |     num_train_epochs=20
22 |     distill_mapping_strategy=static_teacher_static_student
23 |     para_config=q:0-11,v:0-11
24 | elif [ "$#" -eq 3 ]; then
25 |     model_name=$1
26 |     teacher_path=$2
27 |     task_name=$3
28 |     lora_r=8
29 |     lora_alpha=16
30 |     learning_rate=2e-4
31 |     training_batch_size=32
32 |     num_train_epochs=20
33 |     para_config=q:0-11,v:0-11
34 |     distill_mapping_strategy=static_teacher_static_student
35 | elif [ "$#" -eq 10 ]; then
36 |     model_name=$1
37 |     teacher_path=$2
38 |     task_name=$3
39 |     lora_r=$4
40 |     lora_alpha=$5
41 |     learning_rate=$6
42 |     training_batch_size=$7
43 |     num_train_epochs=$8
44 |     distill_mapping_strategy=$9
45 |     para_config=${10}
46 | else
47 |     # Any other argument count used to fall through with every setting unset.
48 |     echo "$usage" >&2
49 |     exit 1
50 | fi
51 | 
52 | adapter_type=lora
53 | teacher_param_tuning_config=${para_config}
54 | student_param_tuning_config=${para_config}
55 | 
56 | # A local checkpoint directory gets its outputs nested inside it; quoted so an empty value cannot make the test true.
57 | if [ -d "$model_name" ]; then
58 |     output_dir="${model_name}/lora_distilled/epoch${num_train_epochs}/bz${training_batch_size}/lr${learning_rate}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}/${distill_mapping_strategy}"
59 | else
60 |     output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_mapping_${distill_mapping_strategy}_distill_fixedteacher/epoch${num_train_epochs}/bz${training_batch_size}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
61 | fi
62 | 
63 | echo "$output_dir"
64 | mkdir -p "$output_dir"
65 | 
66 | python run_minus_training.py \
67 |     --output_dir "${output_dir}" \
68 |     --task_name ${task_name} \
69 |     --model_name_or_path "${model_name}" \
70 |     --do_train \
71 |     --do_eval \
72 |     --save_strategy no \
73 |     --evaluation_strategy steps \
74 |     --logging_strategy steps \
75 |     --logging_steps 1000 \
76 |     --log_level info \
77 |     --log_level_replica info \
78 |     --eval_steps 5000 \
79 |     --max_seq_length 128 \
80 |     --num_train_epochs ${num_train_epochs} \
81 |     --per_device_train_batch_size ${training_batch_size} \
82 |     --per_device_eval_batch_size ${training_batch_size} \
83 |     --tf32 True \
84 |     --distillation_type self_student \
85 |     --distill_mapping_strategy ${distill_mapping_strategy} \
86 |     --warmup_ratio 0.06 \
87 |     --learning_rate ${learning_rate} \
88 |     --weight_decay 0.1 \
89 |     --apply_lora \
90 |     --lora_alpha ${lora_alpha} \
91 |     --lora_r ${lora_r} \
92 |     --report_to none \
93 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
94 |     --student_param_tuning_config ${student_param_tuning_config} \
95 |     --do_distill \
96 |     --distill_start 0 \
97 |     --distill_epoch ${num_train_epochs} \
98 |     --teacher_path "${teacher_path}" | tee "${output_dir}/log.txt"


--------------------------------------------------------------------------------
/scripts/eval/cnndm.sh:
--------------------------------------------------------------------------------
 1 | model_name=$1
 2 | 
 3 | mac_constraint=0.4
 4 | lora_r=8
 5 | pruning_start=-1
 6 | pruning_scheduler=cubic_gradual
 7 | param_allocation_strategy=running_fisher
 8 | distillation_type=self_momentum
 9 | distill_mapping_strategy=dynamic_block_teacher_dynamic_student
10 | 
11 | 
12 | task_name=cnndm
13 | adapter_type=lora
14 | param_resizing_strategy=tophalf_limited
15 | pruning_start=-1
16 | pruning_stop=3
17 | distill_start=-1 # about 60%, between 3.4 and 3.8, but after 3.6 where the teacher is updated
18 | distill_epoch=5
19 | pruning_batches=64
20 | num_prunings=10
21 | pruning_batch_size=4
22 | # pre_pruning_tuning_epochs=1
23 | pre_pruning_tuning_steps=200
24 | sparsity_warmup_epochs=1
25 | 
26 | learning_rate=1e-3
27 | training_batch_size=16
28 | num_train_epochs=10
29 | warmup_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
30 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
31 | student_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
32 | 
33 | 
34 | output_dir="${model_name}/eval"
35 | echo $output_dir
36 | mkdir -p $output_dir
37 | 
38 | python run_minus_seq2seq_training.py \
39 |     --output_dir ${output_dir}\
40 |     --task_name ${task_name} \
41 |     --model_name_or_path ${model_name} \
42 |     --do_eval \
43 |     --save_strategy no \
44 |     --evaluation_strategy steps \
45 |     --logging_strategy steps \
46 |     --eval_steps 5000 \
47 |     --logging_steps 1000 \
48 |     --log_level info \
49 |     --log_level_replica info \
50 |     --minus_scheduler \
51 |     --max_input_length 512 \
52 |     --max_target_length 128 \
53 |     --num_train_epochs ${num_train_epochs} \
54 |     --per_device_train_batch_size ${training_batch_size} \
55 |     --per_device_eval_batch_size ${training_batch_size} \
56 |     --tf32 True \
57 |     --lr_scheduler_type linear\
58 |     --distillation_type ${distillation_type} \
59 |     --distill_mapping_strategy ${distill_mapping_strategy} \
60 |     --warmup_ratio 0.06\
61 |     --learning_rate ${learning_rate}\
62 |     --weight_decay 0.1\
63 |     --seed 128 \
64 |     --apply_lora \
65 |     --lora_alpha 16 \
66 |     --lora_r ${lora_r} \
67 |     --report_to none \
68 |     --pruning_batches ${pruning_batches} \
69 |     --pruning_batch_size ${pruning_batch_size} \
70 |     --mac_constraint ${mac_constraint} \
71 |     --pruning_scheduler ${pruning_scheduler} \
72 |     --sparsity_warmup_epochs ${sparsity_warmup_epochs} \
73 |     --param_allocation_strategy ${param_allocation_strategy} \
74 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
75 |     --student_param_tuning_config ${student_param_tuning_config} \
76 |     --head_scorer_type gradient_l2 \
77 |     --intermediate_scorer_type gradient_l2 \
78 |     --pruner_type none \
79 |     --do_distill \
80 |     --do_virtual_prune \
81 |     --distill_start ${distill_start} \
82 |     --distill_epoch ${distill_epoch} \
83 |     --pruning_start ${pruning_start} \
84 |     --pruning_stop ${pruning_stop} \
85 |     --num_prunings ${num_prunings} \
86 |     --pruning_scheduler_strategy saliency \
87 |     --collect_salience \
88 |     --salience_collecting_start 200 \
89 |     --salience_collecting_end -1 \
90 |     --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \
91 |     --mask_lr 0.01 \
92 |     --grafting_top_k -1 \
93 |     --param_resizing_strategy ${param_resizing_strategy} \
94 |     --tuning_expanding_ratio 4.0 \
95 |     --max_lora_r $(($lora_r * 8)) \
96 |     | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/eval/xsum.sh:
--------------------------------------------------------------------------------
 1 | model_name=$1
 2 | 
 3 | mac_constraint=0.4
 4 | lora_r=8
 5 | pruning_start=-1
 6 | pruning_scheduler=cubic_gradual
 7 | param_allocation_strategy=running_fisher
 8 | distillation_type=self_momentum
 9 | distill_mapping_strategy=dynamic_block_teacher_dynamic_student
10 | 
11 | 
12 | task_name=xsum
13 | adapter_type=lora
14 | param_resizing_strategy=tophalf_limited
15 | pruning_start=-1
16 | pruning_stop=3
17 | distill_start=-1 # about 60%, between 3.4 and 3.8, but after 3.6 where the teacher is updated
18 | distill_epoch=5
19 | pruning_batches=64
20 | num_prunings=10
21 | pruning_batch_size=4
22 | # pre_pruning_tuning_epochs=1
23 | pre_pruning_tuning_steps=200
24 | sparsity_warmup_epochs=1
25 | 
26 | learning_rate=1e-3
27 | training_batch_size=16
28 | num_train_epochs=10
29 | warmup_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
30 | teacher_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
31 | student_param_tuning_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11,ei0:0-11,di0:0-11
32 | 
33 | 
34 | output_dir="${model_name}/eval"
35 | echo $output_dir
36 | mkdir -p $output_dir
37 | 
38 | python run_minus_seq2seq_training.py \
39 |     --output_dir ${output_dir}\
40 |     --task_name ${task_name} \
41 |     --model_name_or_path ${model_name} \
42 |     --do_eval \
43 |     --save_strategy no \
44 |     --evaluation_strategy steps \
45 |     --logging_strategy steps \
46 |     --eval_steps 5000 \
47 |     --logging_steps 1000 \
48 |     --log_level info \
49 |     --log_level_replica info \
50 |     --minus_scheduler \
51 |     --max_input_length 512 \
52 |     --max_target_length 128 \
53 |     --num_train_epochs ${num_train_epochs} \
54 |     --per_device_train_batch_size ${training_batch_size} \
55 |     --per_device_eval_batch_size ${training_batch_size} \
56 |     --tf32 True \
57 |     --lr_scheduler_type linear\
58 |     --distillation_type ${distillation_type} \
59 |     --distill_mapping_strategy ${distill_mapping_strategy} \
60 |     --warmup_ratio 0.06\
61 |     --learning_rate ${learning_rate}\
62 |     --weight_decay 0.1\
63 |     --seed 128 \
64 |     --apply_lora \
65 |     --lora_alpha 16 \
66 |     --lora_r ${lora_r} \
67 |     --report_to none \
68 |     --pruning_batches ${pruning_batches} \
69 |     --pruning_batch_size ${pruning_batch_size} \
70 |     --mac_constraint ${mac_constraint} \
71 |     --pruning_scheduler ${pruning_scheduler} \
72 |     --sparsity_warmup_epochs ${sparsity_warmup_epochs} \
73 |     --param_allocation_strategy ${param_allocation_strategy} \
74 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
75 |     --student_param_tuning_config ${student_param_tuning_config} \
76 |     --head_scorer_type gradient_l2 \
77 |     --intermediate_scorer_type gradient_l2 \
78 |     --pruner_type none \
79 |     --do_distill \
80 |     --do_virtual_prune \
81 |     --distill_start ${distill_start} \
82 |     --distill_epoch ${distill_epoch} \
83 |     --pruning_start ${pruning_start} \
84 |     --pruning_stop ${pruning_stop} \
85 |     --num_prunings ${num_prunings} \
86 |     --pruning_scheduler_strategy saliency \
87 |     --collect_salience \
88 |     --salience_collecting_start 200 \
89 |     --salience_collecting_end -1 \
90 |     --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \
91 |     --mask_lr 0.01 \
92 |     --grafting_top_k -1 \
93 |     --param_resizing_strategy ${param_resizing_strategy} \
94 |     --tuning_expanding_ratio 4.0 \
95 |     --max_lora_r $(($lora_r * 8)) \
96 |     | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ["WANDB_DISABLED"] = "true"
 3 | import sys
 4 | import torch
 5 | import json
 6 | from transformers import HfArgumentParser
 7 | from deepspeed.profiling.flops_profiler import get_model_profile
 8 | from args import DataTrainingArguments
 9 | from models import build_model
10 | from utils import build_dataloader, build_trainer
11 | from models.model_args import ModelArguments
12 | from utils.utils import *
13 | from utils.minus_utils import efficiency_testing, input_constructor, compare_parameters
14 | from utils.analysis_utils import gen_run_report
15 | from args import MinusTrainingArguments
16 | from loralib.layers import LoRALayer
17 | 
18 | def main():
19 |     parser = HfArgumentParser(
20 |         (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
21 | 
22 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
23 |         # If we pass only one argument to the script and it's the path to a json file,
24 |         # let's parse it to get our arguments.
25 |         model_args, data_args, training_args = parser.parse_json_file(
26 |             json_file=os.path.abspath(sys.argv[1]))
27 |     else:
28 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
29 |         
30 |     t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)
31 |     config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets)
32 |     MODEL_GENERATIVE = any(['decoder' in n for n, _ in model.named_parameters()])
33 |     train_dataset, eval_dataset, predict_dataset, is_regression = build_data(model_args, data_args, training_args, model, tokenizer, config, raw_datasets, generative=MODEL_GENERATIVE)
34 | 
35 |     model = model.to(training_args.device)
36 |     model.eval()
37 |     for p in model.parameters():
38 |         p.requires_grad = False
39 |     for m in model.modules():
40 |         if isinstance(m, LoRALayer):
41 |             m.eval()
42 |     
43 |     model.eval()
44 |     trainer = build_trainer(data_args, training_args, model, tokenizer, train_dataset, eval_dataset, param_controller=None)
45 |     model.clear_masks()
46 |     efficiency_results = efficiency_testing(model, tokenizer, training_args.device)
47 | 
48 |     flops, macs, params = get_model_profile(
49 |         model,
50 |         kwargs={k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer, output_seq_len=2).items()} if MODEL_GENERATIVE else {k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer).items()},        print_profile=True,
51 |         detailed=True,
52 |         output_file=os.path.join(training_args.output_dir, 'deepspeed_profile.txt'),
53 |     )
54 |     efficiency_results['model_flops'] = flops
55 |     efficiency_results['model_macs'] = macs
56 |     json.dump(efficiency_results, open(os.path.join(training_args.output_dir, 'efficiency_results.json'), 'w'), indent=4, sort_keys=True)
57 |     # run_report = gen_run_report(training_args.output_dir)
58 |     # run_report['train_runtime_per_epoch'] = run_report['train_runtime'] / training_args.num_train_epochs
59 |     # json.dump(run_report, open(os.path.join(training_args.output_dir, 'run_report.json'), 'w'), indent=4, sort_keys=True)
60 |     
61 |     result = trainer.evaluate()
62 |     json.dump(result, open(os.path.join(training_args.output_dir, 'eval_results.json'), 'w'), indent=4, sort_keys=True)    
63 | 
64 |     
65 | if __name__ == '__main__':
66 |     main()


--------------------------------------------------------------------------------
/scripts/train_lora_distill_seq2seq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -p gpu-rtx6k
 3 | #SBATCH -A h2lab
 4 | #SBATCH --nodes=1                  # Number of nodes
 5 | #SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
 6 | #SBATCH --cpus-per-task=8          # Number of CPU cores per task
 7 | #SBATCH --mem=64G                 # Memory per node (total memory)
 8 | #SBATCH --gres=gpu:1               # Number of GPUs requested
 9 | #SBATCH --time=48:00:00             # Walltime (hh:mm:ss)
10 | 
11 | if [ "$#" -eq 0 ]; then
12 |     model_name=roberta-base
13 |     teacher_path=textattack/roberta-base-SST-2
14 |     task_name=sst2
15 |     lora_r=8
16 |     lora_alpha=16
17 |     learning_rate=2e-4
18 |     training_batch_size=32
19 |     num_train_epochs=20
20 |     distill_mapping_strategy=static_teacher_static_student
21 |     para_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
22 | elif [ "$#" -eq 3 ]; then
23 |     model_name=$1
24 |     teacher_path=$2
25 |     task_name=$3
26 |     lora_r=8
27 |     lora_alpha=16
28 |     learning_rate=2e-4
29 |     training_batch_size=32
30 |     num_train_epochs=20
31 |     para_config=eq:0-11,ev:0-11,dq:0-11,dv:0-11,cq:0-11,cv:0-11
32 |     distill_mapping_strategy=static_teacher_static_student
33 | elif [ "$#" -eq 10 ]; then
34 |     model_name=$1
35 |     teacher_path=$2
36 |     task_name=$3
37 |     lora_r=$4
38 |     lora_alpha=$5
39 |     learning_rate=$6
40 |     training_batch_size=$7
41 |     num_train_epochs=$8
42 |     distill_mapping_strategy=$9
43 |     para_config=${10}
44 | fi
45 | 
46 | adapter_type=lora
47 | teacher_param_tuning_config=${para_config}
48 | student_param_tuning_config=${para_config}
49 | 
50 | if [ -d $model_name ]
51 | then
52 |     output_dir="${model_name}/lora_distilled/epoch${num_train_epochs}/bz${training_batch_size}/lr${learning_rate}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}/${distill_mapping_strategy}"
53 | else
54 |     output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_mapping_${distill_mapping_strategy}_distill_fixedteacher/epoch${num_train_epochs}/bz${training_batch_size}/param${student_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
55 | fi
56 | 
57 | 
58 | echo $output_dir
59 | mkdir -p $output_dir
60 | 
61 | python run_minus_seq2seq_training.py \
62 |     --output_dir ${output_dir}\
63 |     --task_name ${task_name} \
64 |     --model_name_or_path ${model_name} \
65 |     --do_train \
66 |     --do_eval \
67 |     --save_strategy no \
68 |     --log_level info \
69 |     --log_level_replica info \
70 |     --evaluation_strategy steps \
71 |     --logging_strategy steps \
72 |     --logging_steps 1000 \
73 |     --eval_steps 5000 \
74 |     --task_name ${task_name} \
75 |     --max_input_length 512 \
76 |     --max_target_length 128 \
77 |     --num_train_epochs ${num_train_epochs} \
78 |     --per_device_train_batch_size ${training_batch_size} \
79 |     --per_device_eval_batch_size ${training_batch_size} \
80 |     --tf32 True \
81 |     --distillation_type self_student \
82 |     --distill_mapping_strategy ${distill_mapping_strategy} \
83 |     --warmup_ratio 0.06\
84 |     --learning_rate ${learning_rate}\
85 |     --weight_decay 0.1\
86 |     --apply_lora \
87 |     --lora_alpha ${lora_alpha} \
88 |     --lora_r ${lora_r} \
89 |     --report_to none \
90 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
91 |     --student_param_tuning_config ${student_param_tuning_config} \
92 |     --do_distill \
93 |     --distill_start 0 \
94 |     --distill_epoch ${num_train_epochs} \
95 |     --teacher_path ${teacher_path} | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/scripts/adaptpruning_nodistill/bert_base_sst2.sh:
--------------------------------------------------------------------------------
 1 | if [ "$#" -eq 0 ]; then
 2 |     mac_constraint=0.4
 3 |     lora_r=8
 4 |     lora_alpha=16
 5 |     pruning_scheduler=cubic_gradual
 6 |     pruner_type=running_fisher
 7 |     param_allocation_strategy=running_fisher
 8 | elif [ "$#" -eq 6 ]; then
 9 |     mac_constraint=$1
10 |     lora_r=$2
11 |     lora_alpha=$3
12 |     pruning_scheduler=$4
13 |     pruner_type=$5
14 |     param_allocation_strategy=$6
15 | elif [ "$#" -eq 7 ]; then
16 |     mac_constraint=$1
17 |     lora_r=$2
18 |     lora_alpha=$3
19 |     pruning_scheduler=$4
20 |     pruner_type=$5
21 |     param_allocation_strategy=$6
22 |     gpu_id=$7
23 |     export CUDA_VISIBLE_DEVICES=$gpu_id
24 | fi
25 | 
26 | model_name=bert-base-uncased
27 | task_name=sst2
28 | adapter_type=lora
29 | param_resizing_strategy=tophalf_limited
30 | pruning_start=-1
31 | pruning_stop=3
32 | num_prunings=10
33 | pruning_batches=256
34 | pruning_batch_size=4
35 | 
36 | learning_rate=2e-4
37 | training_batch_size=32
38 | num_train_epochs=30
39 | warmup_param_tuning_config=q:0-11,v:0-11
40 | teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
41 | pre_pruning_tuning_epochs=0.5
42 | pre_pruning_layer_warmup_epochs=1.75
43 | suffix='_noffnstart'
44 | 
45 | output_dir="output/${model_name}_${adapter_type}_minus_${task_name}_${pruning_scheduler}_${pruner_type}_${param_allocation_strategy}_${param_resizing_strategy}_resizing_nodistill${suffix}/mac${mac_constraint}/epoch${num_train_epochs}/bz${training_batch_size}/numprune${num_prunings}/param${teacher_param_tuning_config}/lora_r${lora_r}/lora_alpha${lora_alpha}"
46 | echo $output_dir
47 | mkdir -p $output_dir
48 | 
49 | python run_minus_training.py \
50 |     --output_dir ${output_dir}\
51 |     --task_name ${task_name} \
52 |     --model_name_or_path ${model_name} \
53 |     --do_train \
54 |     --do_eval \
55 |     --save_strategy no \
56 |     --evaluation_strategy steps \
57 |     --logging_strategy steps \
58 |     --logging_steps 1000 \
59 |     --log_level info \
60 |     --log_level_replica info \
61 |     --eval_steps 5000 \
62 |     --max_seq_length 128 \
63 |     --num_train_epochs ${num_train_epochs} \
64 |     --per_device_train_batch_size ${training_batch_size} \
65 |     --per_device_eval_batch_size ${training_batch_size} \
66 |     --lr_scheduler_type linear\
67 |     --warmup_ratio 0.06\
68 |     --learning_rate ${learning_rate}\
69 |     --weight_decay 0.1\
70 |     --apply_lora \
71 |     --lora_alpha ${lora_alpha} \
72 |     --lora_r ${lora_r} \
73 |     --report_to none \
74 |     --pruning_batches ${pruning_batches} \
75 |     --mac_constraint ${mac_constraint} \
76 |     --pruning_scheduler ${pruning_scheduler} \
77 |     --param_allocation_strategy ${param_allocation_strategy} \
78 |     --warmup_param_tuning_config ${warmup_param_tuning_config} \
79 |     --teacher_param_tuning_config ${teacher_param_tuning_config} \
80 |     --pruning_start ${pruning_start} \
81 |     --pruning_stop ${pruning_stop} \
82 |     --pre_pruning_layer_warmup_epochs ${pre_pruning_layer_warmup_epochs} \
83 |     --head_scorer_type gradient_l2 \
84 |     --intermediate_scorer_type gradient_l2 \
85 |     --pruner_type ${pruner_type} \
86 |     --num_prunings ${num_prunings} \
87 |     --pruning_batch_size ${pruning_batch_size} \
88 |     --pruning_scheduler_strategy saliency \
89 |     --collect_salience \
90 |     --salience_collecting_start 200 \
91 |     --salience_collecting_end -1 \
92 |     --pre_pruning_tuning_epochs ${pre_pruning_tuning_epochs} \
93 |     --mask_lr 0.01 \
94 |     --grafting_top_k -1 \
95 |     --param_resizing_strategy ${param_resizing_strategy} \
96 |     | tee ${output_dir}/log.txt


--------------------------------------------------------------------------------
/test/test_param_controller.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ["WANDB_DISABLED"] = "true"
 3 | import sys
 4 | import torch
 5 | from transformers import HfArgumentParser, default_data_collator, DataCollatorWithPadding
 6 | from args import DataTrainingArguments
 7 | from models import build_model
 8 | from models.model_args import ModelArguments
 9 | from utils.utils import *
10 | from args import MinusTrainingArguments
11 | from torch.utils.data import DataLoader, Subset
12 | from trainer.param_control import ParamController
13 | from utils.minus_utils import count_params
14 | 
15 | def main():
16 |     sys.argv = ['neuron_importance.py',
17 |             '--output_dir',
18 |             './output/neuron_importance/',
19 |             '--model_name_or_path',
20 |             'output/roberta-base_lora_minus_mnli_once_fisher_distill_full/step1.0/batchuse64/mac0.6',
21 |             '--task_name',
22 |             'mnli',
23 |             '--do_eval',
24 |             '--max_seq_length',
25 |             '128',
26 |             '--per_device_train_batch_size',
27 |             '32',
28 |             '--per_device_eval_batch_size',
29 |             '32',
30 |             '--apply_lora',
31 |             '--do_distill'
32 |             ]
33 |     parser = HfArgumentParser(
34 |         (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
35 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
36 |         # If we pass only one argument to the script and it's the path to a json file,
37 |         # let's parse it to get our arguments.
38 |         model_args, data_args, training_args = parser.parse_json_file(
39 |             json_file=os.path.abspath(sys.argv[1]))
40 |     else:
41 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
42 |     os.makedirs(training_args.output_dir, exist_ok=True)
43 |     # training_args.disable_tqdm = False
44 |     t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)
45 |     config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets)
46 |     _, eval_dataset, _, is_regression = build_data(model_args, data_args, training_args, model, tokenizer, config, raw_datasets)
47 |     # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
48 |     # we already did the padding.
49 |     if data_args.pad_to_max_length:
50 |         data_collator = default_data_collator
51 |     elif training_args.fp16:
52 |         data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
53 |     else:
54 |         data_collator = None
55 |     dataloader = DataLoader(
56 |         Subset(eval_dataset, torch.randperm(len(eval_dataset)).tolist()[:training_args.per_device_eval_batch_size * 64]),
57 |         batch_size=training_args.per_device_eval_batch_size,
58 |         collate_fn=data_collator,
59 |     )
60 |     inputs = next(iter(dataloader))
61 |     
62 |     teacher_config = {
63 |         'key': [9,10,11],
64 |         'query': [9, 10, 11],
65 |         'value': [9, 10, 11],
66 |     }
67 |     student_config = {
68 |         'intermediate': [9,10,11],
69 |     }
70 |     controller = ParamController(model, teacher_config, student_config)
71 |     results = {}
72 |     results['original'] = count_params(model, mode='tuned')
73 |     controller.freeze()
74 |     results['freeze'] = count_params(model, mode='tuned')
75 |     controller.model_as_teacher()
76 |     results['teacher'] = count_params(model, mode='tuned')
77 |     controller.model_as_student()
78 |     results['student'] = count_params(model, mode='tuned')
79 |     
80 | if __name__ == '__main__':
81 |     main()


--------------------------------------------------------------------------------
/test/test_deepspeed_profiler.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import os
 3 | import sys
 4 | os.environ["WANDB_DISABLED"] = "true"
 5 | from deepspeed.profiling.flops_profiler import get_model_profile, get_module_duration
 6 | from transformers import HfArgumentParser
 7 | from args import DataTrainingArguments, MinusTrainingArguments
 8 | from models import build_model
 9 | from models.model_args import ModelArguments
10 | from utils.utils import *
11 | from trainer.model_arch import get_layers
12 | from utils.cofi_utils import update_params, prune_model_with_z
13 | from utils.minus_utils import input_constructor
14 | 
15 | def main():
16 |     sys.argv = ['neuron_importance.py',
17 |             '--output_dir',
18 |             './output/neuron_importance/',
19 |             '--model_name_or_path',
20 |             'roberta-base',
21 |             '--task_name',
22 |             'mnli',
23 |             '--do_train',
24 |             '--do_eval',
25 |             '--max_seq_length',
26 |             '128',
27 |             '--per_device_train_batch_size',
28 |             '128',
29 |             '--per_device_eval_batch_size',
30 |             '128',
31 |             '--apply_lora',
32 |             '--do_distill',
33 |             '--lora_r',
34 |             '8'
35 |             ]
36 |     parser = HfArgumentParser(
37 |         (ModelArguments, DataTrainingArguments, MinusTrainingArguments))
38 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
39 |         # If we pass only one argument to the script and it's the path to a json file,
40 |         # let's parse it to get our arguments.
41 |         model_args, data_args, training_args = parser.parse_json_file(
42 |             json_file=os.path.abspath(sys.argv[1]))
43 |     else:
44 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
45 |     os.makedirs(training_args.output_dir, exist_ok=True)
46 |     # training_args.disable_tqdm = False
47 |     t_name, raw_datasets = get_raw_datasets(model_args, data_args, training_args)
48 |     config, tokenizer, model = build_model(model_args, data_args, training_args, t_name, raw_datasets)
49 |     if model.head_mask is not None:
50 |         mask_prefix = 'final_' if os.path.exists(os.path.join(model_args.model_name_or_path, 'final_head_mask.pt')) else ''
51 |         zs = {
52 |             'head_z': torch.load(os.path.join(model_args.model_name_or_path, mask_prefix + 'head_mask.pt'), map_location='cpu'),
53 |             'intermediate_z': torch.load(os.path.join(model_args.model_name_or_path, mask_prefix + 'intermediate_mask.pt'), map_location='cpu'),
54 |         }
55 |         update_params(model, zs)
56 |         prune_model_with_z(zs, model)
57 |         model.head_mask, model.intermediate_mask = None, None
58 |     model.eval()
59 |     for i in range(model.config.num_hidden_layers):
60 |         module = get_layers(model)[i].intermediate.dense
61 |         module.eval()
62 |         module.weight.data += (module.lora_B @ module.lora_A)* module.scaling
63 |         module.merged=True
64 | 
65 |     with torch.cuda.device(0):
66 |         model=model.cuda()
67 |         batch_size = training_args.per_device_eval_batch_size
68 |         seq_len = 128
69 |         enable_profile = True
70 |         if enable_profile:
71 |             flops, macs, params = get_model_profile(
72 |                 model,
73 |                 kwargs={k: v.to(model.device) for k, v in input_constructor(batch_size, seq_len, tokenizer).items()},
74 |                 print_profile=True,
75 |                 detailed=True,
76 |                 output_file='roberta-base-profile.txt'
77 |             )
78 |         else:
79 |             inputs = input_constructor((batch_size, seq_len), tokenizer)
80 |             outputs = model(inputs)


--------------------------------------------------------------------------------