├── requirements.txt ├── commands ├── run_agnews_finetune.sh ├── run_sst2_finetune.sh ├── run_pubmed_finetune.sh ├── run_dbpedia_finetune.sh ├── run_pubmed.sh ├── run_sst2.sh ├── run_dbpedia.sh └── run_agnews.sh ├── README.md ├── log.txt ├── main.py ├── utils.py ├── active_sampler.py └── trainer.py /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.2.0 2 | pytorch==1.6.0 3 | tqdm 4 | scikit-learn 5 | faiss-cpu==1.6.4 -------------------------------------------------------------------------------- /commands/run_agnews_finetune.sh: -------------------------------------------------------------------------------- 1 | task=agnews 2 | ######## Experiment Setups (GPU, Random Seed etc.) 3 | seed=0 4 | gpu=0 5 | n_gpu=1 6 | method=finetune 7 | ######## Training Setups (bsz, learning rate etc.) 
8 | max_seq_len=128 9 | batch_size=8 10 | eval_batch_size=512 11 | labels=25 12 | rounds=10 13 | dev_labels=1000 14 | steps=5 15 | tsb_logging_steps=20 16 | logging_steps=100 17 | st_logging_steps=50 18 | self_training_max_steps=2000 19 | epochs=15 20 | lr=2e-5 21 | self_training_weight=0.7 22 | gce_loss=0 23 | gce_loss_q=0.8 24 | eps=0.7 25 | pool=0.4 26 | 27 | soft_label=1 28 | gamma=1 29 | prob=1 30 | 31 | model_type=roberta-base 32 | output_dir=../datasets/${task}-${labels}-${seed} 33 | 34 | distill=0 35 | for pool in 1 ; do 36 | for beta in 0.5 ; do 37 | for sample_per_group in 10 ; do 38 | for al_method in random ; do 39 | if [ ${method} == "active_selftrain" ]; then 40 | expname=${task}_${model_type}_${method}_lr${lr}_pool${pool}_smoothprob${prob}_gamma${gamma}_weight${self_training_weight}_soft${soft_label}_${al_method}_seed${seed} 41 | elif [ ${method} == "finetune" ]; then 42 | expname=${task}_${model_type}_${method}_lr${lr}_${al_method}_seed${seed} 43 | fi 44 | 45 | 46 | if [ ${al_method} == "region_entropy" ] || [ ${al_method} == "region_cal" ] ; then 47 | expname="${expname}_beta${beta}_sample${sample_per_group}" 48 | region_command="--region_beta=${beta} --sample_per_group=${sample_per_group}" 49 | elif [ ${al_method} == "region_entropy_prop" ]; then 50 | expname="${expname}_beta${beta}_rho${region_rho}_sample${sample_per_group}" 51 | region_command="--region_beta=${beta} --region_rho=${region_rho} --sample_per_group=${sample_per_group}" 52 | else 53 | region_command="" 54 | fi 55 | tsb_dir=../exp/active_self_training/tsb/${expname} 56 | rm -r ${tsb_dir} 57 | mkdir -p ${tsb_dir} 58 | mkdir -p ${output_dir} 59 | echo ${method} 60 | train_cmd="CUDA_VISIBLE_DEVICES=${gpu} python3 main.py --do_train --do_eval --task=${task} \ 61 | --train_file=train.json --dev_file=valid.json --test_file=test.json \ 62 | --unlabel_file=unlabeled.json --data_dir="../datasets/${task}-${labels}-${seed}" --seed=${seed} \ 63 | --output_dir=${output_dir} --tsb_dir=${tsb_dir} \ 64 | --logging_steps=${logging_steps} --self_train_logging_steps=${st_logging_steps} --tsb_logging_steps=${tsb_logging_steps} \ 65 | --sample_labels=${labels} --rounds=${rounds} --dev_labels=${dev_labels} \ 66 | --gpu=${gpu} --n_gpu=${n_gpu} --num_train_epochs=${epochs} --weight_decay=1e-8 \ 67 | --learning_rate=${lr} --model_type=${model_type} \ 68 | --method=${method} --batch_size=${batch_size} --eval_batch_size=${eval_batch_size} \ 69 | --max_seq_len=${max_seq_len} --auto_load=1 --pool=${pool} \ 70 | --self_training_eps=${eps} --self_training_weight=${self_training_weight} \ 71 | --self_training_max_step=${self_training_max_steps} --al_method=${al_method} \ 72 | --gce_loss=${gce_loss} --gce_loss_q=${gce_loss_q} \ 73 | --gamma=${gamma} --smooth_prob=${prob} ${region_command}" 74 | 75 | echo $train_cmd 76 | eval $train_cmd 77 | done 78 | done 79 | done 80 | # done 81 | done 82 | -------------------------------------------------------------------------------- /commands/run_sst2_finetune.sh: -------------------------------------------------------------------------------- 1 | task=SST-2 2 | ######## Experiment Setups (GPU, Random Seed etc.) 3 | seed=0 4 | gpu=1 5 | n_gpu=1 6 | method=finetune 7 | ######## Training Setups (bsz, learning rate etc.) 
8 | max_seq_len=64 9 | batch_size=8 10 | self_training_batch_size=32 11 | eval_batch_size=512 12 | labels=50 13 | rounds=10 14 | dev_labels=1000 15 | steps=5 16 | tsb_logging_steps=20 17 | logging_steps=50 18 | st_logging_steps=30 19 | self_training_update_period=50 20 | self_training_max_steps=1500 21 | epochs=20 22 | lr=2e-5 23 | self_training_weight=0.0 24 | gce_loss=1 25 | gce_loss_q=0.8 26 | gamma=1 27 | 28 | if [ $method == 'active_selftrain' ]; then 29 | eps=0.4 30 | pool=0.4 31 | else 32 | eps=0.95 33 | pool=0.1 34 | fi 35 | 36 | soft_label=1 37 | model_type=roberta-base 38 | al_method=entropy 39 | output_dir=../datasets/${task}-${labels}-${seed} 40 | prob=1 41 | pool=1 42 | beta=0.5 43 | sample_per_group=10 44 | al_method=region_entropy 45 | 46 | if [ ${method} == "active_selftrain" ]; then 47 | expname=${task}_${model_type}_${method}_lr${lr}_pool${pool}_smoothprob${prob}_gamma${gamma}_weight${self_training_weight}_soft${soft_label}_${al_method}_seed${seed} 48 | elif [ ${method} == "finetune" ]; then 49 | expname=${task}_${model_type}_${method}_lr${lr}_${al_method}_seed${seed} 50 | fi 51 | 52 | if [ ${al_method} == "region_entropy" ] || [ ${al_method} == "region_cal" ] ; then 53 | expname="${expname}_beta${beta}_sample${sample_per_group}" 54 | region_command="--region_beta=${beta} --sample_per_group=${sample_per_group}" 55 | elif [ ${al_method} == "region_entropy_prop" ]; then 56 | expname="${expname}_beta${beta}_rho${region_rho}_sample${sample_per_group}" 57 | region_command="--region_beta=${beta} --region_rho=${region_rho} --sample_per_group=${sample_per_group}" 58 | else 59 | region_command="" 60 | fi 61 | 62 | tsb_dir=../exp/active_self_training/tsb/${expname} 63 | rm -r ${tsb_dir} 64 | mkdir -p ${tsb_dir} 65 | mkdir -p ${output_dir} 66 | echo ${method} 67 | train_cmd="CUDA_VISIBLE_DEVICES=${gpu} python3 main.py --do_train --do_eval --task=${task} \ 68 | --train_file=train.json --dev_file=valid.json --test_file=test.json \ 69 | --unlabel_file=unlabeled.json --data_dir="../datasets/${task}-${labels}-${seed}" --seed=${seed} \ 70 | --output_dir=${output_dir} --tsb_dir=${tsb_dir} \ 71 | --logging_steps=${logging_steps} --self_train_logging_steps=${st_logging_steps} --tsb_logging_steps=${tsb_logging_steps} \ 72 | --sample_labels=${labels} --rounds=${rounds} --dev_labels=${dev_labels} \ 73 | --gpu=${gpu} --n_gpu=${n_gpu} --num_train_epochs=${epochs} --weight_decay=1e-8 \ 74 | --learning_rate=${lr} --model_type=${model_type} \ 75 | --method=${method} --batch_size=${batch_size} --eval_batch_size=${eval_batch_size} --self_training_batch_size=${self_training_batch_size} \ 76 | --max_seq_len=${max_seq_len} --auto_load=1 --pool=${pool} \ 77 | --self_training_eps=${eps} --max_steps=${steps} --self_training_weight=${self_training_weight} \ 78 | --self_training_update_period=${self_training_update_period} --self_training_max_step=${self_training_max_steps} --al_method=${al_method} \ 79 | --gce_loss=${gce_loss} --gce_loss_q=${gce_loss_q} \ 80 | --balance_st=${balance_st} --balance_query=${balance_query} \ 81 | --gamma=${gamma} --smooth_prob=${prob} ${region_command}" 82 | 83 | echo $train_cmd 84 | eval $train_cmd 85 | -------------------------------------------------------------------------------- /commands/run_pubmed_finetune.sh: -------------------------------------------------------------------------------- 1 | task=pubmed 2 | ######## Experiment Setups (GPU, Random Seed etc.) 3 | seed=0 4 | gpu=0 5 | n_gpu=1 6 | method=finetune 7 | ######## Training Setups (bsz, learning rate etc.) 
8 | max_seq_len=128 9 | batch_size=8 10 | self_training_batch_size=32 11 | eval_batch_size=512 12 | labels=20 13 | rounds=10 14 | dev_labels=1000 15 | steps=5 16 | tsb_logging_steps=20 17 | logging_steps=100 18 | st_logging_steps=40 19 | self_training_max_steps=2000 20 | epochs=15 21 | lr=2e-5 22 | self_training_weight=0.7 23 | gce_loss=0 24 | gce_loss_q=0.8 25 | 26 | if [ $method == 'active_selftrain' ]; then 27 | eps=0.6 28 | pool=0.4 29 | else 30 | eps=0.95 31 | pool=0.4 32 | fi 33 | soft_label=1 34 | label_smooth=0 35 | prob=0 36 | model_type=scibert 37 | gamma=1 38 | self_training_weight=0 39 | # al_method=cal 40 | # tsb_dir=../exp/active_self_training/tsb/${expname} 41 | # rm -r ${tsb_dir} 42 | distill=0 43 | pool=1 44 | beta=0.5 45 | al_method=region_entropy 46 | beta=0.5 47 | 48 | al_method=region_cal 49 | if [ ${method} == "active_selftrain" ]; then 50 | expname=${task}_${model_type}_${method}_lr${lr}_pool${pool}_smoothprob${prob}_gamma${gamma}_weight${self_training_weight}_soft${soft_label}_${al_method}_seed${seed} 51 | elif [ ${method} == "finetune" ]; then 52 | expname=${task}_${model_type}_${method}_lr${lr}_${al_method}_seed${seed} 53 | fi 54 | 55 | 56 | if [ ${al_method} == "region_entropy" ] || [ ${al_method} == "region_cal" ] ; then 57 | sample_per_group=10 58 | expname="${expname}_beta${beta}_sample${sample_per_group}" 59 | region_command="--region_beta=${beta} --sample_per_group=${sample_per_group}" 60 | elif [ ${al_method} == "region_entropy_prop" ]; then 61 | expname="${expname}_beta${beta}_rho${region_rho}_sample${sample_per_group}" 62 | region_command="--region_beta=${beta} --region_rho=${region_rho} --sample_per_group=${sample_per_group}" 63 | else 64 | region_command="" 65 | fi 66 | output_dir=../datasets/${task}-${labels}-${seed} 67 | tsb_dir=../exp/active_self_training/tsb/${expname} 68 | rm -r ${tsb_dir} 69 | mkdir -p ${tsb_dir} 70 | mkdir -p ${output_dir} 71 | echo ${method} 72 | train_cmd="CUDA_VISIBLE_DEVICES=${gpu} python3 main.py --do_train --do_eval --task=${task} \ 73 | --train_file=train.json --dev_file=valid.json --test_file=test.json \ 74 | --unlabel_file=unlabeled.json --data_dir="../datasets/${task}-${labels}-${seed}" --seed=${seed} \ 75 | --output_dir=${output_dir} --tsb_dir=${tsb_dir} \ 76 | --logging_steps=${logging_steps} --self_train_logging_steps=${st_logging_steps} --tsb_logging_steps=${tsb_logging_steps} \ 77 | --sample_labels=${labels} --rounds=${rounds} --dev_labels=${dev_labels} \ 78 | --gpu=${gpu} --n_gpu=${n_gpu} --num_train_epochs=${epochs} --weight_decay=1e-8 \ 79 | --learning_rate=${lr} --model_type=${model_type} \ 80 | --method=${method} --batch_size=${batch_size} --eval_batch_size=${eval_batch_size} --self_training_batch_size=${self_training_batch_size} \ 81 | --max_seq_len=${max_seq_len} --auto_load=1 --pool=${pool} \ 82 | --self_training_eps=${eps} --max_steps=${steps} --self_training_weight=${self_training_weight} \ 83 | --self_training_max_step=${self_training_max_steps} --al_method=${al_method} --label_smooth=${label_smooth} \ 84 | --gce_loss=${gce_loss} --gce_loss_q=${gce_loss_q} \ 85 | --gamma=${gamma} --smooth_prob=${prob} ${region_command}" 86 | 87 | echo $train_cmd 88 | eval $train_cmd 89 | -------------------------------------------------------------------------------- /commands/run_dbpedia_finetune.sh: -------------------------------------------------------------------------------- 1 | task=dbpedia 2 | ######## Experiment Setups (GPU, Random Seed etc.) 
3 | seed=0 4 | gpu=0 5 | n_gpu=1 6 | method=finetune 7 | ######## Training Setups (bsz, learning rate etc.) 8 | max_seq_len=128 9 | batch_size=8 10 | self_training_batch_size=32 11 | eval_batch_size=512 12 | labels=5 13 | rounds=40 14 | dev_labels=100 15 | steps=5 16 | tsb_logging_steps=20 17 | logging_steps=50 18 | st_logging_steps=50 19 | self_training_update_period=100 20 | self_training_max_steps=2000 21 | epochs=20 22 | lr=2e-5 23 | lr_self_training=1e-6 24 | self_training_weight=0.7 25 | gce_loss=1 26 | gce_loss_q=0.9 27 | gamma=1 28 | if [ $method == 'active_selftrain' ]; then 29 | eps=0.7 30 | pool=0.4 31 | else 32 | eps=0.9 33 | pool=0.4 34 | fi 35 | soft_label=1 36 | label_smooth=0 37 | balance_query=0 38 | balance_st=1 39 | model_type=roberta-base 40 | prob=0 41 | # al_method=cal 42 | # expname=${task}_${model_type}_${method}_lr${lr}_lrst${lr_self_training}_update${self_training_update_period}_weight${self_training_weight}_eps${eps}_soft${soft_label}_${al_method}_seed${seed} 43 | # tsb_dir=../exp/active_self_training/tsb/${expname} 44 | # rm -r ${tsb_dir} 45 | distill=0 46 | prob=1 47 | beta=0.5 48 | sample_per_group=10 49 | n_centroids=30 50 | pool=0.1 51 | al_method=region_entropy 52 | 53 | if [ ${method} == "active_selftrain" ]; then 54 | expname=${task}_${model_type}_${method}_lr${lr}_pool${pool}_smoothprob${prob}_gamma${gamma}_weight${self_training_weight}_soft${soft_label}_${al_method}_seed${seed} 55 | elif [ ${method} == "finetune" ]; then 56 | expname=${task}_${model_type}_${method}_lr${lr}_${al_method}_seed${seed} 57 | fi 58 | 59 | if [ ${al_method} == "region_entropy" ] || [ ${al_method} == "region_cal" ] ; then 60 | expname="${expname}_beta${beta}_k${n_centroids}_sample${sample_per_group}" 61 | region_command="--region_beta=${beta} --sample_per_group=${sample_per_group} --n_centroids=${n_centroids}" 62 | else 63 | region_command="" 64 | fi 65 | output_dir=../datasets/${task}-${labels}-${seed} 66 | tsb_dir=../exp/active_self_training/tsb/${expname} 67 | rm -r ${tsb_dir} 68 | mkdir -p ${tsb_dir} 69 | mkdir -p ${output_dir} 70 | echo ${method} 71 | train_cmd="CUDA_VISIBLE_DEVICES=${gpu} python3 main.py --do_train --do_eval --task=${task} \ 72 | --train_file=train.json --dev_file=valid.json --test_file=test.json \ 73 | --unlabel_file=unlabeled.json --data_dir="../datasets/${task}-${labels}-${seed}" --seed=${seed} \ 74 | --output_dir=${output_dir} --tsb_dir=${tsb_dir} \ 75 | --logging_steps=${logging_steps} --self_train_logging_steps=${st_logging_steps} --tsb_logging_steps=${tsb_logging_steps} \ 76 | --sample_labels=${labels} --rounds=${rounds} --dev_labels=${dev_labels} \ 77 | --gpu=${gpu} --n_gpu=${n_gpu} --num_train_epochs=${epochs} --weight_decay=1e-8 \ 78 | --learning_rate=${lr} --model_type=${model_type} \ 79 | --method=${method} --batch_size=${batch_size} --eval_batch_size=${eval_batch_size} --self_training_batch_size=${self_training_batch_size} \ 80 | --max_seq_len=${max_seq_len} --auto_load=1 --pool=${pool} \ 81 | --self_training_eps=${eps} --max_steps=${steps} --self_training_weight=${self_training_weight} \ 82 | --self_training_max_step=${self_training_max_steps} --al_method=${al_method} --label_smooth=${label_smooth} \ 83 | --gce_loss=${gce_loss} --gce_loss_q=${gce_loss_q} \ 84 | --gamma=${gamma} --smooth_prob=${prob} ${region_command}" 85 | 86 | echo $train_cmd 87 | eval $train_cmd -------------------------------------------------------------------------------- /commands/run_pubmed.sh: 
-------------------------------------------------------------------------------- 1 | task=pubmed 2 | ######## Experiment Setups (GPU, Random Seed etc.) 3 | seed=0 4 | gpu=0 5 | n_gpu=1 6 | method=active_selftrain 7 | ######## Training Setups (bsz, learning rate etc.) 8 | max_seq_len=128 9 | batch_size=8 10 | self_training_batch_size=32 11 | eval_batch_size=512 12 | labels=20 13 | rounds=10 14 | dev_labels=1000 15 | steps=5 16 | tsb_logging_steps=20 17 | logging_steps=100 18 | st_logging_steps=50 19 | self_training_max_steps=2000 20 | epochs=15 21 | lr=2e-5 22 | self_training_weight=0.7 23 | gce_loss=0 24 | gce_loss_q=0.7 25 | 26 | if [ $method == 'active_selftrain' ]; then 27 | eps=0.6 28 | pool=0.4 29 | else 30 | eps=0.95 31 | pool=0.4 32 | fi 33 | soft_label=1 34 | label_smooth=0 35 | prob=1 36 | 37 | model_type=scibert 38 | pool_scheduler=1 39 | gamma_scheduler=1 40 | 41 | pool=3000 42 | pool_min=3000 43 | self_training_weight=0.5 44 | eps=0.5 45 | gamma=0.9 46 | gamma_min=0.7 47 | al_method=region_entropy 48 | beta=0.5 49 | 50 | if [ ${method} == "active_selftrain" ]; then 51 | expname=${task}_${model_type}_${method}_lr${lr}_pool${pool}_smoothprob${prob}_gamma${gamma}_weight${self_training_weight}_eps${eps}_soft${soft_label}_${al_method}_seed${seed} 52 | elif [ ${method} == "finetune" ]; then 53 | expname=${task}_${model_type}_${method}_lr${lr}_${al_method}_seed${seed} 54 | fi 55 | 56 | if [ ${al_method} == "region_entropy" ] || [ ${al_method} == "region_cal" ] ; then 57 | sample_per_group=10 58 | expname="${expname}_beta${beta}_sample${sample_per_group}" 59 | region_command="--region_beta=${beta} --sample_per_group=${sample_per_group}" 60 | else 61 | region_command="" 62 | fi 63 | ################## 64 | if [ ${pool_scheduler} == 1 ]; then 65 | expname="${expname}_poolmin${pool_min}" 66 | pool_command="--pool_scheduler=1 --pool_min=${pool_min}" 67 | else 68 | pool_command="" 69 | fi 70 | if [ ${gamma_scheduler} == 1 ]; then 71 | expname="${expname}_gammamin${gamma_min}" 72 | gamma_command="--gamma_scheduler=1 --gamma_min=${gamma_min}" 73 | else 74 | gamma_command="" 75 | fi 76 | ######### path for saving the results 77 | output_dir=../datasets/${task}-${labels}-${seed} 78 | tsb_dir=../exp/active_self_training/tsb/${expname} 79 | rm -r ${tsb_dir} 80 | mkdir -p ${tsb_dir} 81 | mkdir -p ${output_dir} 82 | echo ${method} 83 | train_cmd="CUDA_VISIBLE_DEVICES=${gpu} python3 main.py --do_train --do_eval --task=${task} \ 84 | --train_file=train.json --dev_file=valid.json --test_file=test.json \ 85 | --unlabel_file=unlabeled.json --data_dir="../datasets/${task}-${labels}-${seed}" --seed=${seed} \ 86 | --output_dir=${output_dir} --tsb_dir=${tsb_dir} \ 87 | --logging_steps=${logging_steps} --self_train_logging_steps=${st_logging_steps} --tsb_logging_steps=${tsb_logging_steps} \ 88 | --sample_labels=${labels} --rounds=${rounds} --dev_labels=${dev_labels} \ 89 | --gpu=${gpu} --n_gpu=${n_gpu} --num_train_epochs=${epochs} --weight_decay=1e-8 \ 90 | --learning_rate=${lr} --model_type=${model_type} \ 91 | --method=${method} --batch_size=${batch_size} --eval_batch_size=${eval_batch_size} --self_training_batch_size=${self_training_batch_size} \ 92 | --max_seq_len=${max_seq_len} --auto_load=1 --pool=${pool} \ 93 | --self_training_eps=${eps} --max_steps=${steps} --self_training_weight=${self_training_weight} \ 94 | --self_training_max_step=${self_training_max_steps} --al_method=${al_method} --label_smooth=${label_smooth} \ 95 | --gce_loss=${gce_loss} --gce_loss_q=${gce_loss_q} \ 96 | --gamma=${gamma} 
--smooth_prob=${prob} ${region_command} ${pool_command} ${gamma_command}" 97 | 98 | echo $train_cmd 99 | eval $train_cmd -------------------------------------------------------------------------------- /commands/run_sst2.sh: -------------------------------------------------------------------------------- 1 | task=SST-2 2 | ######## Experiment Setups (GPU, Random Seed etc.) 3 | seed=0 4 | gpu=0 5 | n_gpu=1 6 | method=active_selftrain 7 | ######## Training Setups (bsz, learning rate etc.) 8 | max_seq_len=64 9 | batch_size=8 10 | self_training_batch_size=32 11 | eval_batch_size=512 12 | labels=50 13 | rounds=10 14 | dev_labels=1000 15 | steps=5 16 | tsb_logging_steps=20 17 | logging_steps=50 18 | st_logging_steps=30 19 | self_training_max_steps=1500 20 | epochs=10 21 | lr=2e-5 22 | self_training_weight=0.5 23 | gce_loss=0 24 | gce_loss_q=0.7 25 | 26 | if [ $method == 'active_selftrain' ]; then 27 | eps=0.7 28 | pool=0.4 29 | else # not used 30 | eps=0 31 | pool=0 32 | fi 33 | soft_label=1 34 | model_type=roberta-base 35 | 36 | pool_scheduler=1 37 | gamma_scheduler=1 38 | pool=3000 39 | pool_min=6000 40 | self_training_weight=0.5 41 | eps=0.6 42 | gamma=1 43 | n_centroids=25 # number of clusters for region-aware sampling 44 | beta=0.5 # weight for region-aware sampling 45 | prob=1 46 | gamma=1 47 | weight_embedding=1 48 | al_method=region_entropy 49 | if [ ${method} == "active_selftrain" ]; then 50 | expname=${task}_${model_type}_${method}_lr${lr}_pool${pool}_smoothprob${prob}_gamma${gamma}_weight${self_training_weight}_eps${eps}_soft${soft_label}_${al_method}_seed${seed} 51 | elif [ ${method} == "finetune" ]; then 52 | expname=${task}_${model_type}_${method}_lr${lr}_${al_method}_seed${seed} 53 | fi 54 | 55 | if [ ${al_method} == "region_entropy" ] || [ ${al_method} == "region_cal" ] ; then 56 | sample_per_group=15 57 | expname="${expname}_beta${beta}_w${weight_embedding}_k${n_centroids}_sample${sample_per_group}" 58 | region_command="--region_beta=${beta} --sample_per_group=${sample_per_group}" 59 | else 60 | region_command="" 61 | fi 62 | if [ ${pool_scheduler} == 1 ]; then 63 | expname="${expname}_poolmin${pool_min}" 64 | pool_command="--pool_scheduler=1 --pool_min=${pool_min}" 65 | else 66 | pool_command="" 67 | fi 68 | if [ ${gamma_scheduler} == 1 ]; then 69 | expname="${expname}_gammamin${gamma_min}" 70 | gamma_command="--gamma_scheduler=1 --gamma_min=${gamma_min}" 71 | else 72 | gamma_command="" 73 | fi 74 | ######### path for saving the results 75 | tsb_dir=../exp/active_self_training/tsb/${expname} 76 | output_dir=../datasets/${task}-${labels}-${seed} 77 | 78 | rm -r ${tsb_dir} 79 | mkdir -p ${tsb_dir} 80 | mkdir -p ${output_dir} 81 | echo ${method} 82 | train_cmd="CUDA_VISIBLE_DEVICES=${gpu} python3 main.py --do_train --do_eval --task=${task} \ 83 | --train_file=train.json --dev_file=valid.json --test_file=test.json \ 84 | --unlabel_file=unlabeled.json --data_dir="../datasets/${task}-${labels}-${seed}" --seed=${seed} \ 85 | --output_dir=${output_dir} --tsb_dir=${tsb_dir} \ 86 | --logging_steps=${logging_steps} --self_train_logging_steps=${st_logging_steps} --tsb_logging_steps=${tsb_logging_steps} \ 87 | --sample_labels=${labels} --rounds=${rounds} --dev_labels=${dev_labels} \ 88 | --gpu=${gpu} --n_gpu=${n_gpu} --num_train_epochs=${epochs} --weight_decay=1e-8 \ 89 | --learning_rate=${lr} --model_type=${model_type} \ 90 | --method=${method} --batch_size=${batch_size} --eval_batch_size=${eval_batch_size} --self_training_batch_size=${self_training_batch_size} \ 91 | 
--max_seq_len=${max_seq_len} --auto_load=1 --pool=${pool} \ 92 | --self_training_eps=${eps} --self_training_weight=${self_training_weight} \ 93 | --self_training_max_step=${self_training_max_steps} --al_method=${al_method} \ 94 | --gce_loss=${gce_loss} --gce_loss_q=${gce_loss_q} \ 95 | --gamma=${gamma} --smooth_prob=${prob} ${region_command} ${pool_command} ${gamma_command}" 96 | echo $train_cmd 97 | eval $train_cmd 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AcTune 2 | This is the code repo for our paper [AcTune: Uncertainty-Based Active Self-Training for Active Fine-Tuning of Pretrained Language Models](https://aclanthology.org/2022.naacl-main.102/) (in Proceedings of NAACL 2022 Main Conference, Oral Presentation). 3 | 4 | # Requirements 5 | ``` 6 | python 3.7 7 | transformers==4.2.0 8 | pytorch==1.6.0 9 | tqdm 10 | scikit-learn 11 | faiss-cpu==1.6.4 12 | ``` 13 | 14 | # Datasets 15 | ## Main Experiments 16 | We use the following four datasets for the main experiments. 17 | | Dataset | Task | Number of Classes | Number of Train/Test | 18 | |---------------- | -------------- |-------------- | -------------- | 19 | | [SST-2](https://nlp.stanford.edu/sentiment/) | Sentiment | 2 | 60.6k/1.8k | 20 | | [AG News](https://huggingface.co/datasets/ag_news) | News Topic | 4 | 119k/7.6k | 21 | | [Pubmed-RCT](https://github.com/Franck-Dernoncourt/pubmed-rct) | Medical Abstract | 5 | 180k/30.1k | 22 | | [DBPedia](https://huggingface.co/datasets/dbpedia_14) | Ontology Topic | 14 | 280k/70k | 23 | 24 | The processed data can be found at [this link](https://drive.google.com/drive/folders/1Yhsf1Gji-kCxPfFtNnNW8isSbXqKZcKh?usp=sharing). The folder for storing these datasets is described in the following parts. 25 | 26 | ## Weak Supervision Experiments 27 | Most of the datasets are from the [WRENCH](https://github.com/JieyuZ2/wrench) benchmark. Please check out their repo for dataset details. 28 | 29 | # Training 30 | Please use the commands in the `commands` folder for experiments. 31 | Take the AG News dataset as an example: `run_agnews_finetune.sh` runs the standard active learning experiments, and `run_agnews.sh` runs the active self-training experiments, where unlabeled data is also used during fine-tuning. 32 | 33 | Here, we assume there is a folder for storing datasets at `../datasets/` and a folder for logging the experiment results at `../exp`. 34 | 35 | # Hyperparameter Tuning 36 | The key hyperparameters for our approach include `pool`, `pool_min`, `gamma`, `gamma_min`, `self_training_weight`, `sample_per_group`, and `n_centroids`. 37 | - `pool` stands for the average number of samples selected for self-training in each round (see the example under `pool_min` and the sketch after this list). 38 | - `pool_min` stands for the initial number of samples selected for self-training. For example, if `pool_min=3000`, `pool=4000`, and there are 10 rounds in total, then 3000 samples are selected in the first round, and the number of low-uncertainty samples used for self-training increases linearly in later rounds, so that the total number of unlabeled samples used for self-training equals `4000*10=40000`. 39 | - `gamma` stands for the final weight of the momentum-based memory bank. 40 | - `gamma_min` stands for the initial weight of the momentum-based memory bank. The weight gradually approaches `gamma` in later rounds. 41 | - `n_centroids` is the number of clusters used in region-aware sampling. 42 | - `sample_per_group` is the number of samples selected from each high-uncertainty cluster.
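To make the `pool`/`pool_min` schedule above concrete, below is a minimal Python sketch of the linear schedule that `main.py` applies when `--pool_scheduler=1`, using the example numbers above (`pool_min=3000`, `pool=4000`, 10 rounds); the helper name `pool_size` is only for illustration and is not part of the codebase.

```python
# Minimal sketch of the self-training pool schedule (pool_scheduler=1),
# mirroring the computation in main.py; `pool_size` is an illustrative helper.
def pool_size(round_idx, pool=4000, pool_min=3000, rounds=10):
    """Number of low-uncertainty samples used for self-training in a given round."""
    max_sample = pool * rounds  # size reached in the final round
    return pool_min + int((max_sample - pool_min) * round_idx / (rounds - 1))

print([pool_size(i) for i in range(10)])
# [3000, 7111, 11222, 15333, 19444, 23555, 27666, 31777, 35888, 40000]
```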
43 | 44 | 45 | # Citation 46 | 47 | Please cite the following paper if you are using our datasets/tool. Thanks! 48 | 49 | ``` 50 | @inproceedings{yu2022actune, 51 | title = "{A}c{T}une: Uncertainty-Based Active Self-Training for Active Fine-Tuning of Pretrained Language Models", 52 | author = "Yu, Yue and Kong, Lingkai and Zhang, Jieyu and Zhang, Rongzhi and Zhang, Chao", 53 | booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies", 54 | year = "2022", 55 | address = "Seattle, United States", 56 | publisher = "Association for Computational Linguistics", 57 | url = "https://aclanthology.org/2022.naacl-main.102", 58 | pages = "1422--1436", 59 | } 60 | ``` 61 | -------------------------------------------------------------------------------- /commands/run_dbpedia.sh: -------------------------------------------------------------------------------- 1 | task=dbpedia 2 | ######## Experiment Setups (GPU, Random Seed etc.) 3 | seed=0 4 | gpu=0 5 | n_gpu=1 6 | method=active_selftrain 7 | ######## Training Setups (bsz, learning rate etc.) 8 | max_seq_len=128 9 | batch_size=8 10 | self_training_batch_size=32 11 | eval_batch_size=512 12 | labels=5 13 | rounds=40 14 | dev_labels=1000 15 | steps=5 16 | tsb_logging_steps=20 17 | logging_steps=100 18 | st_logging_steps=50 19 | self_training_update_period=100 20 | self_training_max_steps=2000 21 | epochs=15 22 | lr=2e-5 23 | lr_self_training=1e-6 24 | self_training_weight=0.7 25 | gce_loss=1 26 | gce_loss_q=0.8 27 | gamma=1 28 | if [ $method == 'active_selftrain' ]; then 29 | eps=0.7 30 | pool=0.4 31 | else 32 | eps=0.9 33 | pool=0.4 34 | fi 35 | soft_label=1 36 | label_smooth=0 37 | balance_query=0 38 | balance_st=1 39 | model_type=roberta-base 40 | 41 | # al_method=cal 42 | # expname=${task}_${model_type}_${method}_lr${lr}_lrst${lr_self_training}_update${self_training_update_period}_weight${self_training_weight}_eps${eps}_soft${soft_label}_${al_method}_seed${seed} 43 | # tsb_dir=../exp/active_self_training/tsb/${expname} 44 | # rm -r ${tsb_dir} 45 | pool_scheduler=1 46 | gamma_scheduler=1 47 | distill=0 48 | prob=1 49 | beta=0.5 50 | pool=4000 51 | self_training_weight=0.5 52 | gamma=1 53 | gamma_min=0.9 54 | pool_min=5000 55 | sample_per_group=10 56 | n_centroids=30 57 | 58 | al_method=region_entropy 59 | if [ ${method} == "active_selftrain" ]; then 60 | expname=${task}_${model_type}_${method}_lr${lr}_pool${pool}_smoothprob${prob}_gamma${gamma}_weight${self_training_weight}_soft${soft_label}_${al_method}_seed${seed} 61 | elif [ ${method} == "finetune" ]; then 62 | expname=${task}_${model_type}_${method}_lr${lr}_${al_method}_seed${seed} 63 | fi 64 | 65 | if [ ${al_method} == "region_entropy" ] || [ ${al_method} == "region_cal" ] ; then 66 | expname="${expname}_beta${beta}_k${n_centroids}_sample${sample_per_group}" 67 | region_command="--region_beta=${beta} --sample_per_group=${sample_per_group} --n_centroids=${n_centroids}" 68 | else 69 | region_command="" 70 | fi 71 | if [ ${pool_scheduler} == 1 ]; then 72 | expname="${expname}_poolmin${pool_min}" 73 | pool_command="--pool_scheduler=1 --pool_min=${pool_min}" 74 | else 75 | pool_command="" 76 | fi 77 | if [ ${gamma_scheduler} == 1 ]; then 78 | expname="${expname}_gammamin${gamma_min}" 79 | gamma_command="--gamma_scheduler=1 
--gamma_min=${gamma_min}" 80 | else 81 | gamma_command="" 82 | fi 83 | output_dir=../datasets/${task}-${labels}-${seed} 84 | tsb_dir=../exp/active_self_training/tsb/${expname} 85 | rm -r ${tsb_dir} 86 | mkdir -p ${tsb_dir} 87 | mkdir -p ${output_dir} 88 | echo ${method} 89 | train_cmd="CUDA_VISIBLE_DEVICES=${gpu} python3 main.py --do_train --do_eval --task=${task} \ 90 | --train_file=train.json --dev_file=valid.json --test_file=test.json \ 91 | --unlabel_file=unlabeled.json --data_dir="../datasets/${task}-${labels}-${seed}" --seed=${seed} \ 92 | --output_dir=${output_dir} --tsb_dir=${tsb_dir} \ 93 | --logging_steps=${logging_steps} --self_train_logging_steps=${st_logging_steps} --tsb_logging_steps=${tsb_logging_steps} \ 94 | --sample_labels=${labels} --rounds=${rounds} --dev_labels=${dev_labels} \ 95 | --gpu=${gpu} --n_gpu=${n_gpu} --num_train_epochs=${epochs} --weight_decay=1e-8 \ 96 | --learning_rate=${lr} --model_type=${model_type} \ 97 | --method=${method} --batch_size=${batch_size} --eval_batch_size=${eval_batch_size} --self_training_batch_size=${self_training_batch_size} \ 98 | --max_seq_len=${max_seq_len} --auto_load=1 --pool=${pool} \ 99 | --self_training_eps=${eps} --max_steps=${steps} --self_training_weight=${self_training_weight} \ 100 | --self_training_max_step=${self_training_max_steps} --al_method=${al_method} --label_smooth=${label_smooth} \ 101 | --gce_loss=${gce_loss} --gce_loss_q=${gce_loss_q} \ 102 | --gamma=${gamma} --smooth_prob=${prob} ${region_command} ${pool_command} ${gamma_command}" 103 | 104 | echo $train_cmd 105 | eval $train_cmd 106 | -------------------------------------------------------------------------------- /commands/run_agnews.sh: -------------------------------------------------------------------------------- 1 | task=agnews 2 | ######## Experiment Setups (GPU, Random Seed etc.) 3 | seed=0 4 | gpu=0 5 | n_gpu=1 6 | method=active_selftrain 7 | ######## Training Setups (bsz, learning rate etc.) 8 | max_seq_len=128 9 | batch_size=8 10 | self_training_batch_size=32 11 | eval_batch_size=512 12 | labels=25 # number of initial labels 13 | rounds=10 # number of AL rounds 14 | dev_labels=1000 15 | steps=5 # number of training epochs 16 | tsb_logging_steps=20 17 | logging_steps=50 18 | st_logging_steps=50 # frequency of evaluating on dev set during training 19 | self_training_max_steps=2000 # max steps for self-training 20 | epochs=15 21 | lr=2e-5 22 | self_training_weight=0.5 23 | gce_loss=0 24 | gce_loss_q=0.7 25 | 26 | if [ $method == 'active_selftrain' ]; then 27 | eps=0.7 28 | pool=0.4 29 | else # not used 30 | eps=0 31 | pool=0 32 | fi 33 | soft_label=1 34 | 35 | model_type=roberta-base 36 | pool_scheduler=1 37 | gamma_scheduler=1 38 | 39 | ######## AL Setups (bsz, learning rate etc.) 
40 | pool=5000 # the average number of samples used for self-training for all rounds 41 | self_training_weight=0.5 42 | gamma_min=0.9 # the initial weight for momentum based memory bank 43 | gamma=1.0 # the final weight for momentum based memory bank 44 | pool_min=6000 45 | n_centroids=25 # number of clusters for region-aware sampling 46 | beta=0.5 # weight for region-aware sampling 47 | weight_embedding=1 48 | al_method=region_entropy 49 | prob=1 50 | 51 | if [ ${method} == "active_selftrain" ]; then 52 | expname=${task}_${model_type}_${method}_lr${lr}_pool${pool}_smoothprob${prob}_gamma${gamma}_weight${self_training_weight}_soft${soft_label}_${al_method}_seed${seed} 53 | elif [ ${method} == "finetune" ]; then 54 | expname=${task}_${model_type}_${method}_lr${lr}_${al_method}_seed${seed} 55 | fi 56 | 57 | if [ ${al_method} == "region_entropy" ] || [ ${al_method} == "region_cal" ] ; then 58 | beta=${beta} 59 | sample_per_group=10 60 | expname="${expname}_beta${beta}_w${weight_embedding}_k${n_centroids}_sample${sample_per_group}" 61 | region_command="--region_beta=${beta} --sample_per_group=${sample_per_group} --n_centroids=${n_centroids} --weight_embedding=${weight_embedding}" 62 | else 63 | region_command="" 64 | fi 65 | 66 | if [ ${pool_scheduler} == 1 ]; then 67 | expname="${expname}_poolmin${pool_min}" 68 | pool_command="--pool_scheduler=1 --pool_min=${pool_min}" 69 | else 70 | pool_command="" 71 | fi 72 | 73 | if [ ${gamma_scheduler} == 1 ]; then 74 | expname="${expname}_gammamin${gamma_min}_tune" 75 | gamma_command="--gamma_scheduler=1 --gamma_min=${gamma_min}" 76 | else 77 | gamma_command="" 78 | fi 79 | ######### path for saving the results 80 | output_dir=../datasets/${task}-${labels}-${seed} 81 | tsb_dir=../exp/active_self_training/tsb/${expname} 82 | mkdir -p ${tsb_dir} 83 | mkdir -p ${output_dir} 84 | 85 | train_cmd="CUDA_VISIBLE_DEVICES=${gpu} python3 main.py --do_train --do_eval --task=${task} \ 86 | --train_file=train.json --dev_file=valid.json --test_file=test.json \ 87 | --unlabel_file=unlabeled.json --data_dir="../datasets/${task}-${labels}-${seed}" --seed=${seed} \ 88 | --output_dir=${output_dir} --tsb_dir=${tsb_dir} \ 89 | --logging_steps=${logging_steps} --self_train_logging_steps=${st_logging_steps} --tsb_logging_steps=${tsb_logging_steps} \ 90 | --sample_labels=${labels} --rounds=${rounds} --dev_labels=${dev_labels} \ 91 | --gpu=${gpu} --n_gpu=${n_gpu} --num_train_epochs=${epochs} --weight_decay=1e-8 \ 92 | --learning_rate=${lr} --model_type=${model_type} \ 93 | --method=${method} --batch_size=${batch_size} --eval_batch_size=${eval_batch_size} --self_training_batch_size=${self_training_batch_size} \ 94 | --max_seq_len=${max_seq_len} --auto_load=1 --pool=${pool} \ 95 | --self_training_eps=${eps} --self_training_weight=${self_training_weight} \ 96 | --self_training_max_step=${self_training_max_steps} --al_method=${al_method} \ 97 | --gce_loss=${gce_loss} --gce_loss_q=${gce_loss_q} \ 98 | --gamma=${gamma} --smooth_prob=${prob} ${region_command} ${pool_command} ${gamma_command}" 99 | 100 | echo $train_cmd 101 | eval $train_cmd 102 | -------------------------------------------------------------------------------- /log.txt: -------------------------------------------------------------------------------- 1 | 07/24/2022 02:34:24 - INFO - utils - Creating features from dataset file at ../datasets/SST-2-50-0 2 | 07/24/2022 02:34:24 - INFO - utils - LOOKING AT ../datasets/SST-2-50-0/valid.json 3 | 07/24/2022 02:34:24 - INFO - utils - {'_id': 1, 'text': 'there is a fabric of 
complex ideas here , and feelings that profoundly deepen them .'} 4 | 07/24/2022 02:34:24 - INFO - utils - Writing example 0 of 872 5 | 07/24/2022 02:34:24 - INFO - utils - *** Example *** 6 | 07/24/2022 02:34:24 - INFO - utils - guid: dev-0 7 | 07/24/2022 02:34:24 - INFO - utils - tokens: there Ġis Ġa Ġfabric Ġof Ġcomplex Ġideas Ġhere Ġ, Ġand Ġfeelings Ġthat Ġprofoundly Ġdeepen Ġthem Ġ. 8 | 07/24/2022 02:34:24 - INFO - utils - input_ids: 0 8585 16 10 10199 9 2632 2956 259 2156 8 6453 14 27301 23176 106 479 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 | 07/24/2022 02:34:24 - INFO - utils - attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 | 07/24/2022 02:34:24 - INFO - utils - token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 | 07/24/2022 02:34:24 - INFO - utils - label: 1 (id = 1) 12 | 07/24/2022 02:34:24 - INFO - utils - Saving features into cached file ../datasets/SST-2-50-0/cached_dev_SST-2_roberta-base_64 13 | 07/24/2022 02:34:24 - INFO - utils - Creating features from dataset file at ../datasets/SST-2-50-0 14 | 07/24/2022 02:34:24 - INFO - utils - LOOKING AT ../datasets/SST-2-50-0/test.json 15 | 07/24/2022 02:34:24 - INFO - utils - {'_id': 1, 'text': "this is one of polanski 's best films ."} 16 | 07/24/2022 02:34:24 - INFO - utils - Writing example 0 of 1821 17 | 07/24/2022 02:34:24 - INFO - utils - *** Example *** 18 | 07/24/2022 02:34:24 - INFO - utils - guid: test-0 19 | 07/24/2022 02:34:24 - INFO - utils - tokens: this Ġis Ġone Ġof Ġpol anski Ġ' s Ġbest Ġfilms Ġ. 20 | 07/24/2022 02:34:24 - INFO - utils - input_ids: 0 9226 16 65 9 8385 24345 128 29 275 3541 479 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 21 | 07/24/2022 02:34:24 - INFO - utils - attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 | 07/24/2022 02:34:24 - INFO - utils - token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 23 | 07/24/2022 02:34:24 - INFO - utils - label: 1 (id = 1) 24 | 07/24/2022 02:34:24 - INFO - utils - Saving features into cached file ../datasets/SST-2-50-0/cached_test_SST-2_roberta-base_64 25 | 07/24/2022 02:34:25 - INFO - utils - Creating features from dataset file at ../datasets/SST-2-50-0 26 | 07/24/2022 02:34:25 - INFO - utils - LOOKING AT ../datasets/SST-2-50-0/train.json 27 | 07/24/2022 02:34:25 - INFO - utils - {'_id': 1, 'text': 'has a caffeinated , sloppy brilliance , sparkling with ideas you wish had been developed with more care , but'} 28 | 07/24/2022 02:34:25 - INFO - utils - Writing example 0 of 100 29 | 07/24/2022 02:34:25 - INFO - utils - *** Example *** 30 | 07/24/2022 02:34:25 - INFO - utils - guid: train-0 31 | 07/24/2022 02:34:25 - INFO - utils - tokens: has Ġa Ġcaffe inated Ġ, Ġsloppy Ġbrilliance Ġ, Ġsparkling Ġwith Ġideas Ġyou Ġwish Ġhad Ġbeen Ġdeveloped Ġwith Ġmore Ġcare Ġ, Ġbut 32 | 07/24/2022 02:34:25 - INFO - utils - input_ids: 0 7333 10 45167 9339 2156 26654 28505 2156 21121 19 2956 47 2813 56 57 2226 19 55 575 2156 53 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 | 07/24/2022 02:34:25 - INFO - utils - attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 34 | 07/24/2022 02:34:25 - INFO - utils - token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 | 07/24/2022 02:34:25 - INFO - utils - label: 1 (id = 1) 36 | 07/24/2022 02:34:25 - INFO - utils - Saving features into cached file ../datasets/SST-2-50-0/cached_train_SST-2_roberta-base_64 37 | 07/24/2022 02:34:25 - INFO - utils - Creating features from dataset file at ../datasets/SST-2-50-0 38 | 07/24/2022 02:34:25 - INFO - utils - LOOKING AT ../datasets/SST-2-50-0/unlabeled.json 39 | 07/24/2022 02:34:25 - INFO - utils - {'_id': 1, 'text': 'as though i was in the tiny two seater plane that carried the giant camera around australia , sweeping and gliding , banking'} 40 | 07/24/2022 02:34:25 - INFO - utils - {'_id': 1, 'text': 'contemplation'} 41 | 07/24/2022 02:34:25 - INFO - utils - {'_id': 0, 'text': 'a disappointingly thin slice of lower-class london life ; despite the title ... amounts to surprisingly little'} 42 | 07/24/2022 02:34:25 - INFO - utils - {'_id': 0, 'text': 'imagine that a more confused , less interesting and more sloppily made film could possibly come down the road in 2002'} 43 | 07/24/2022 02:34:25 - INFO - utils - {'_id': 1, 'text': 'which should appeal to women'} 44 | 07/24/2022 02:34:25 - INFO - utils - {'_id': 1, 'text': 'of daring films'} 45 | 07/24/2022 02:34:25 - INFO - utils - {'_id': 1, 'text': 'has clever ways of capturing inner-city life during the reagan years'} 46 | 07/24/2022 02:34:25 - INFO - utils - {'_id': 1, 'text': "campanella 's competent direction and his excellent cast overcome the obstacles of a predictable outcome and a screenplay that glosses over rafael 's evolution ."} 47 | 07/24/2022 02:34:25 - INFO - utils - Writing example 0 of 40000 48 | 07/24/2022 02:34:25 - INFO - utils - *** Example *** 49 | 07/24/2022 02:34:25 - INFO - utils - guid: unlabeled-0 50 | 07/24/2022 02:34:25 - INFO - utils - tokens: as Ġthough Ġi Ġwas Ġin Ġthe Ġtiny Ġtwo Ġse ater Ġplane Ġthat Ġcarried Ġthe Ġgiant Ġcamera Ġaround Ġaust ral ia Ġ, Ġsweeping Ġand Ġgl iding Ġ, Ġbanking 51 | 07/24/2022 02:34:25 - INFO - utils - input_ids: 0 281 600 939 21 11 5 5262 80 842 5109 3286 14 2584 5 3065 2280 198 28410 7085 493 2156 9893 8 5921 8231 2156 3454 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 52 | 07/24/2022 02:34:25 - INFO - utils - attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 53 | 07/24/2022 02:34:25 - INFO - utils - token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 54 | 07/24/2022 02:34:25 - INFO - utils - label: 1 (id = 1) 55 | 07/24/2022 02:34:25 - INFO - utils - Writing example 5000 of 40000 56 | 07/24/2022 02:34:26 - INFO - utils - Writing example 10000 of 40000 57 | 07/24/2022 02:34:26 - INFO - utils - Writing example 15000 of 40000 58 | 07/24/2022 02:34:27 - INFO - utils - Writing example 20000 of 40000 59 | 07/24/2022 02:34:27 - INFO - utils - Writing example 25000 of 40000 60 | 07/24/2022 02:34:28 - INFO - utils - Writing example 30000 of 40000 61 | 07/24/2022 02:34:28 - INFO - utils - Writing example 35000 of 40000 62 | 07/24/2022 02:34:29 - INFO - utils - Saving features into cached file ../datasets/SST-2-50-0/cached_unlabeled_SST-2_roberta-base_64_unlabel 63 | 
07/24/2022 02:34:37 - INFO - trainer - Loading from ../datasets/SST-2-50-0/model/checkpoint-roberta-base-active_selftrain-region_entropy-train-50! 64 | 07/24/2022 02:34:39 - INFO - trainer - ***** Running evaluation on test dataset ***** 65 | 07/24/2022 02:34:39 - INFO - trainer - Num examples = 1821 66 | 07/24/2022 02:34:41 - INFO - trainer - ***** Eval results ***** 67 | 07/24/2022 02:35:34 - INFO - trainer - Finished iterating Train dataset, begin reiterate 68 | 07/24/2022 02:35:35 - INFO - trainer - ***** Running evaluation on dev dataset ***** 69 | 07/24/2022 02:35:35 - INFO - trainer - Num examples = 872 70 | 07/24/2022 02:35:36 - INFO - trainer - ***** Eval results ***** 71 | 07/24/2022 02:35:36 - INFO - trainer - Best model updated! 72 | 07/24/2022 02:35:39 - INFO - trainer - Finished iterating Train dataset, begin reiterate 73 | 07/24/2022 02:35:41 - INFO - trainer - ***** Running evaluation on dev dataset ***** 74 | 07/24/2022 02:35:41 - INFO - trainer - Num examples = 872 75 | 07/24/2022 02:35:42 - INFO - trainer - ***** Eval results ***** 76 | 07/24/2022 02:35:45 - INFO - trainer - Finished iterating Train dataset, begin reiterate 77 | 07/24/2022 02:35:48 - INFO - trainer - ***** Running evaluation on dev dataset ***** 78 | 07/24/2022 02:35:48 - INFO - trainer - Num examples = 872 79 | 07/24/2022 02:35:49 - INFO - trainer - ***** Eval results ***** 80 | 07/24/2022 02:35:49 - INFO - trainer - Best model updated! 81 | 07/24/2022 02:35:51 - INFO - trainer - Finished iterating Train dataset, begin reiterate 82 | 07/24/2022 02:35:54 - INFO - trainer - ***** Running evaluation on dev dataset ***** 83 | 07/24/2022 02:35:54 - INFO - trainer - Num examples = 872 84 | 07/24/2022 02:35:55 - INFO - trainer - ***** Eval results ***** 85 | 07/24/2022 02:35:55 - INFO - trainer - Best model updated! 86 | 07/24/2022 02:35:56 - INFO - trainer - Finished iterating Train dataset, begin reiterate 87 | 07/24/2022 02:36:01 - INFO - trainer - ***** Running evaluation on dev dataset ***** 88 | 07/24/2022 02:36:01 - INFO - trainer - Num examples = 872 89 | 07/24/2022 02:36:02 - INFO - trainer - ***** Eval results ***** 90 | 07/24/2022 02:36:02 - INFO - trainer - Best model updated! 
91 | 07/24/2022 02:36:02 - INFO - trainer - Finished iterating Train dataset, begin reiterate 92 | 07/24/2022 02:36:06 - INFO - trainer - Finished iterating Train dataset, begin reiterate 93 | 07/24/2022 02:36:07 - INFO - trainer - ***** Running evaluation on dev dataset ***** 94 | 07/24/2022 02:36:07 - INFO - trainer - Num examples = 872 95 | 07/24/2022 02:36:08 - INFO - trainer - ***** Eval results ***** 96 | 07/24/2022 02:36:10 - INFO - trainer - Finished iterating Unlabeled dataset, begin reiterate 97 | 07/24/2022 02:36:12 - INFO - trainer - Finished iterating Train dataset, begin reiterate 98 | 07/24/2022 02:36:14 - INFO - trainer - ***** Running evaluation on dev dataset ***** 99 | 07/24/2022 02:36:14 - INFO - trainer - Num examples = 872 100 | 07/24/2022 02:36:15 - INFO - trainer - ***** Eval results ***** 101 | 07/24/2022 02:36:18 - INFO - trainer - Finished iterating Train dataset, begin reiterate 102 | 07/24/2022 02:36:20 - INFO - trainer - ***** Running evaluation on dev dataset ***** 103 | 07/24/2022 02:36:20 - INFO - trainer - Num examples = 872 104 | 07/24/2022 02:36:21 - INFO - trainer - ***** Eval results ***** 105 | 07/24/2022 02:36:23 - INFO - trainer - Finished iterating Train dataset, begin reiterate 106 | 07/24/2022 02:36:27 - INFO - trainer - ***** Running evaluation on dev dataset ***** 107 | 07/24/2022 02:36:27 - INFO - trainer - Num examples = 872 108 | 07/24/2022 02:36:28 - INFO - trainer - ***** Eval results ***** 109 | 07/24/2022 02:36:28 - INFO - trainer - Best model updated! 110 | 07/24/2022 02:36:29 - INFO - trainer - Finished iterating Train dataset, begin reiterate 111 | 07/24/2022 02:36:33 - INFO - trainer - ***** Running evaluation on dev dataset ***** 112 | 07/24/2022 02:36:33 - INFO - trainer - Num examples = 872 113 | 07/24/2022 02:36:34 - INFO - trainer - ***** Eval results ***** 114 | 07/24/2022 02:36:34 - INFO - trainer - Best model updated! 115 | 07/24/2022 02:36:34 - INFO - trainer - Finished iterating Train dataset, begin reiterate 116 | 07/24/2022 02:36:39 - INFO - trainer - Finished iterating Train dataset, begin reiterate 117 | 07/24/2022 02:36:40 - INFO - trainer - ***** Running evaluation on dev dataset ***** 118 | 07/24/2022 02:36:40 - INFO - trainer - Num examples = 872 119 | 07/24/2022 02:36:41 - INFO - trainer - ***** Eval results ***** 120 | 07/24/2022 02:36:41 - INFO - trainer - Best model updated! 121 | 07/24/2022 02:36:44 - INFO - trainer - Finished iterating Train dataset, begin reiterate 122 | 07/24/2022 02:36:46 - INFO - trainer - ***** Running evaluation on dev dataset ***** 123 | 07/24/2022 02:36:46 - INFO - trainer - Num examples = 872 124 | 07/24/2022 02:36:47 - INFO - trainer - ***** Eval results ***** 125 | 07/24/2022 02:36:47 - INFO - trainer - Best model updated! 
126 | 07/24/2022 02:36:50 - INFO - trainer - Finished iterating Train dataset, begin reiterate 127 | 07/24/2022 02:36:50 - INFO - trainer - Finished iterating Unlabeled dataset, begin reiterate 128 | 07/24/2022 02:36:53 - INFO - trainer - ***** Running evaluation on dev dataset ***** 129 | 07/24/2022 02:36:53 - INFO - trainer - Num examples = 872 130 | 07/24/2022 02:36:54 - INFO - trainer - ***** Eval results ***** 131 | 07/24/2022 02:36:55 - INFO - trainer - Finished iterating Train dataset, begin reiterate 132 | 07/24/2022 02:36:59 - INFO - trainer - ***** Running evaluation on dev dataset ***** 133 | 07/24/2022 02:36:59 - INFO - trainer - Num examples = 872 134 | 07/24/2022 02:37:00 - INFO - trainer - ***** Eval results ***** 135 | 07/24/2022 02:37:01 - INFO - trainer - Finished iterating Train dataset, begin reiterate 136 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | from utils import load_and_cache_examples, load_and_cache_unlabeled_examples, init_logger, load_tokenizer 5 | from trainer import Trainer 6 | import torch 7 | import numpy as np 8 | import random 9 | import torch.nn as nn 10 | import copy 11 | from torch.utils.data import ConcatDataset, TensorDataset, Subset 12 | import json 13 | 14 | def model_dict(model_type): 15 | if model_type == 'roberta-base': 16 | return 'roberta-base' 17 | elif model_type == 'bert-base': 18 | return 'bert-base-uncased' 19 | elif model_type == 'scibert': 20 | return 'allenai/scibert_scivocab_uncased' 21 | 22 | 23 | def set_seed(args): 24 | random.seed(args.seed) 25 | np.random.seed(args.seed) 26 | torch.manual_seed(args.seed) 27 | if args.n_gpu > 0 and torch.cuda.is_available(): 28 | # print('yes') 29 | # assert 0 30 | torch.cuda.manual_seed_all(args.seed) 31 | torch.cuda.manual_seed(args.seed) 32 | torch.backends.cudnn.deterministic = True 33 | torch.backends.cudnn.benchmark = False 34 | 35 | def main(args): 36 | init_logger() 37 | set_seed(args) 38 | tokenizer = load_tokenizer(args) 39 | 40 | 41 | dev_dataset, num_labels, dev_size = load_and_cache_examples(args, tokenizer, mode="dev", size = 1000) 42 | test_dataset, num_labels, test_size = load_and_cache_examples(args, tokenizer, mode="test") 43 | try: 44 | train_dataset, num_labels, train_size = load_and_cache_examples(args, tokenizer, mode= "train") 45 | unlabeled_dataset, unlabeled_size = load_and_cache_unlabeled_examples(args, tokenizer, mode = 'unlabeled', train_size = train_size, size = num_labels * 20000) 46 | except: 47 | unlabeled_dataset, unlabeled_size = load_and_cache_unlabeled_examples(args, tokenizer, mode = 'unlabeled', train_size = 64) 48 | with open(f"../datasets/{args.task}-{args.sample_labels}-0/train_idx_roberta-base_{args.al_method}_{args.sample_labels}.json", 'r') as f: 49 | indexes = json.load(f) 50 | print("number of labeled data:", len(indexes)) 51 | train_dataset = Subset(unlabeled_dataset, indexes) 52 | train_size = len(indexes) 53 | 54 | 55 | print('number of labels:', num_labels) 56 | print('train_size:', train_size) 57 | print('dev_size:', dev_size) 58 | print('test_size:', test_size) 59 | print('unlabel_size:', unlabeled_size) 60 | trainer = Trainer(args, train_dataset=train_dataset, dev_dataset=dev_dataset,test_dataset=test_dataset, \ 61 | unlabeled = unlabeled_dataset, \ 62 | num_labels = num_labels, data_size = train_size, n_gpu = args.n_gpu 63 | ) 64 | 65 | 66 | trainer.init_model() 67 | 68 | if 
args.method == 'active_selftrain': 69 | for i in range(args.rounds): 70 | if args.task in ['dbpedia']: 71 | train_sample = 100 * (i + 1) 72 | sample_labels = 100 73 | elif args.task in ['trec', 'chemprot']: 74 | train_sample = 50 * (i + 1) 75 | sample_labels = 50 76 | # if i == 0: 77 | else: 78 | train_sample = args.sample_labels * (i + 1) 79 | sample_labels = args.sample_labels * args.n_labels 80 | if i == 0: 81 | try: 82 | if 'dbpedia' in args.output_dir: 83 | trainer.load_model(path = os.path.join(args.output_dir, 'model', f'checkpoint-{args.model_type}-finetune-random-train-100')) 84 | elif 'trec' in args.output_dir or 'chemprot' in args.output_dir: 85 | trainer.load_model(path = os.path.join(args.output_dir, 'model', f'checkpoint-{args.model_type}-finetune-random-train-100')) 86 | else: 87 | trainer.load_model(path = os.path.join(args.output_dir, 'model', f'checkpoint-{args.model_type}-active_selftrain-region_entropy-train-{args.sample_labels}')) 88 | loss_test, acc_test = trainer.evaluate('test', 0) 89 | print(f"Initial, acc={acc_test}") 90 | trainer.tb_writer.add_scalar(f"FT_Test_acc_{args.method}_seed{args.seed}", acc_test, train_sample) 91 | trainer.tb_writer.add_scalar(f"ST_Test_acc_{args.method}_seed{args.seed}", acc_test, train_sample) 92 | except: 93 | print("Loading Error! Retrain the model.") 94 | trainer.train(n_sample = train_sample) 95 | 96 | else: 97 | trainer.active_selftrain(n_sample = train_sample, soft = False) 98 | if args.smooth_prob == 1: # pool 99 | if args.pool_scheduler == 1: 100 | max_sample = int(args.pool) * args.rounds 101 | min_sample = int(args.pool_min) 102 | sample_num = min_sample + int((max_sample - min_sample) * i/(args.rounds-1)) 103 | else: 104 | sample_num = min(int(args.pool) * (i + 1), len(trainer.unlabeled) - 1) 105 | elif args.pool < 1: # 106 | sample_num = int(args.pool * (len(trainer.unlabeled)- sample_labels)) 107 | else: # 108 | sample_num = int(args.pool) 109 | if sample_num < 0: # corner case, can be ignored in most cases 110 | sample_num = 1 111 | 112 | trainer.sample(n_sample = sample_labels, n_unlabeled = sample_num, round = i) 113 | query_distribution = np.array(list(trainer.active_sampler.sample_class.values())) 114 | st_distribution = np.array(list(trainer.active_sampler.st_class.values())) 115 | trainer.tb_writer.add_histogram(f"Query_Class_Distribution", query_distribution/np.sum(query_distribution), args.sample_labels * args.n_labels * (i+1)) 116 | trainer.tb_writer.add_histogram(f"ST_Data_Class_Distribution", st_distribution/np.sum(st_distribution) , args.sample_labels * args.n_labels * (i+1)) 117 | trainer.reinit_model() 118 | 119 | elif args.method == 'finetune': 120 | for i in range(args.rounds): 121 | sample_num = 1 122 | if args.task in ['dbpedia']: 123 | train_sample = 100 * (i + 1) 124 | sample_labels = 100 125 | elif args.task in ['trec', 'chemprot']: 126 | train_sample = 50 * (i + 1) 127 | sample_labels = 50 128 | else: 129 | train_sample = args.sample_labels * (i + 1) 130 | sample_labels = args.sample_labels * args.n_labels 131 | if args.task in ['trec', 'chemprot'] and i == 0: # WL init 132 | trainer.load_model(path = os.path.join(args.output_dir, 'model', f'checkpoint-{args.model_type}-finetune-random-train-100')) 133 | loss_test, acc_test = trainer.evaluate('test', 0) 134 | else: 135 | trainer.train(n_sample = train_sample) 136 | trainer.sample(n_sample = sample_labels, n_unlabeled = sample_num) 137 | trainer.reinit_model() 138 | 139 | 140 | 141 | if __name__ == '__main__': 142 | 143 | parser = 
argparse.ArgumentParser() 144 | parser.add_argument("--method", default='clean', type=str, help="which method to use") 145 | parser.add_argument("--gpu", default='0,1,2,3', type=str, help="which gpu to use") 146 | parser.add_argument("--n_gpu", default=1, type=int, help="which gpu to use") 147 | 148 | parser.add_argument("--seed", default=0, type=int, help="which seed to use") 149 | parser.add_argument("--task", default="agnews", type=str, help="The name of the task to train") 150 | parser.add_argument("--data_dir", default="../datasets", type=str, 151 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 152 | parser.add_argument("--model_dir", default="./model", type=str, help="Path to model") 153 | parser.add_argument("--eval_dir", default="./eval", type=str, help="Evaluation script, result directory") 154 | parser.add_argument("--tsb_dir", default="./eval", type=str, help="TSB script, result directory") 155 | parser.add_argument("--train_file", default="train.tsv", type=str, help="Train file") 156 | parser.add_argument("--dev_file", default="dev.tsv", type=str, help="dev file") 157 | parser.add_argument("--test_file", default="test.tsv", type=str, help="Test file") 158 | parser.add_argument("--unlabel_file", default="unlabeled.tsv", type=str, help="Test file") 159 | parser.add_argument("--do_train", action="store_true", help="Whether to run training.") 160 | parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.") 161 | parser.add_argument("--sample_labels", default=100, type=int, help="number of labels for sampling in AL") 162 | parser.add_argument("--dev_labels", default=100, type=int, help="number of labels for dev set") 163 | parser.add_argument("--pool", default=0.1, type=float, help="number of labels for dev set") 164 | parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3",) 165 | parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.",) 166 | 167 | parser.add_argument('--rounds', type=int, default=10, help="Active Learning Rounds.") 168 | parser.add_argument('--logging_steps', type=int, default=10, help="Log every X updates steps.") 169 | parser.add_argument('--tsb_logging_steps', type=int, default=10, help="Log every X updates steps.") 170 | parser.add_argument('--self_train_logging_steps', type=int, default=20, help="Log every X updates steps.") 171 | parser.add_argument('--save_steps', type=int, default=200, help="Save checkpoint every X updates steps.") 172 | parser.add_argument("--model_type", default="bert-base-uncased", type=str) 173 | parser.add_argument("--auto_load", default=1, type=int, help="Auto loading the model or not") 174 | parser.add_argument("--add_sep_token", action="store_true", help="Add [SEP] token at the end of the sentence") 175 | 176 | parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") 177 | parser.add_argument("--max_steps", default=100, type=int, help="Training steps for initialization.") 178 | parser.add_argument("--weight_decay", default=1e-4, type=float, help="Weight decay if we apply some.") 179 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") 180 | parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") 181 | 
parser.add_argument("--warmup_steps", default=100, type=int, help="Linear warmup over warmup_steps.") 182 | parser.add_argument("--dropout_rate", default=0.1, type=float, help="Dropout for fully-connected layers") 183 | parser.add_argument("--batch_size", default=32, type=int, help="Batch size for training and evaluation.") 184 | parser.add_argument("--self_training_batch_size", default=32, type=int, help="Batch size for training and evaluation.") 185 | parser.add_argument("--eval_batch_size", default=256, type=int, help="Batch size for training and evaluation.") 186 | parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.") 187 | parser.add_argument("--max_seq_len", default=128, type=int, help="The maximum total input sequence length after tokenization.") 188 | parser.add_argument("--gradient_accumulation_steps", default=1, type=int, help="The maximum total input sequence length after tokenization.") 189 | 190 | parser.add_argument("--gce_loss", default=0, type=int, help="Whether Use GCE LOSS or not.") 191 | parser.add_argument("--gce_loss_q", default=0.8, type=float, help="Whether Use GCE LOSS or not.") 192 | 193 | parser.add_argument('--self_training_max_step', type = int, default = 10000, help = 'the maximum step (usually after the first epoch) for self training') 194 | parser.add_argument("--self_training_eps", default=0.6, type=float, help="The confidence thershold for the pseudo labels.") 195 | parser.add_argument("--self_training_power", default=2, type=float, help="The power of predictions used for self-training with soft labels.") 196 | parser.add_argument("--self_training_weight", default=0.5, type=float, help="The weight for self-training term.") 197 | 198 | parser.add_argument("--al_method", default='random', type=str, help="The initial learning rate for Adam.") 199 | 200 | parser.add_argument("--gamma", default=1, type=float, help="Balance between prev and current.") 201 | parser.add_argument("--smooth_prob", default=1, type=int, help="Balance between prev and current.") 202 | 203 | parser.add_argument("--n_centroids", default=25, type=int, help="Number of regions used in region-aware sampling.") 204 | parser.add_argument("--region_beta", default=0.1, type=float, help="The weight used in region-aware sampling.") 205 | parser.add_argument("--sample_per_group", default=10, type=int, help="Number of samples selected from each cluster.") 206 | parser.add_argument("--gamma_scheduler", default=0, type=int, help="Whether to dynamically adjust weight for momentum based memory bank.") 207 | parser.add_argument("--pool_scheduler", default=0, type=int, help="Whether to adjust number of unlabeled examples.") 208 | parser.add_argument("--gamma_min", default=0.6, type=float, help="The momentum coefficient for aggregating predictions.") 209 | parser.add_argument("--pool_min", default=5000, type=int, help="The minimum number of selected pseudo-labeled samples for self-training.") 210 | parser.add_argument("--weight_embedding", default=1, type=int, help="Whether use weighted K-means for clustering.") 211 | 212 | args = parser.parse_args() 213 | args.model_name_or_path = model_dict(args.model_type) 214 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 215 | if args.task in ["SST-2"]: 216 | args.n_labels = 2 217 | elif args.task in ["agnews"]: 218 | args.n_labels = 4 219 | elif args.task in ["pubmed"]: 220 | args.n_labels = 5 221 | elif args.task in ["trec"]: 222 | args.n_labels = 6 223 | elif args.task in ["chemprot"]: 224 | args.n_labels = 10 
225 | elif args.task in ["dbpedia"]: 226 | args.n_labels = 14 227 | main(args) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import pandas as pd 5 | from collections import Counter 6 | import numpy as np 7 | from sklearn.datasets import fetch_20newsgroups 8 | from collections import Counter, defaultdict 9 | from nltk.corpus import stopwords 10 | from sklearn.model_selection import train_test_split 11 | from transformers import AutoTokenizer 12 | 13 | import re 14 | import json 15 | import logging 16 | import copy 17 | import csv,os 18 | from torch.utils.data import TensorDataset 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | def init_logger(): 23 | logging.basicConfig(filename = 'log.txt', 24 | filemode = 'w', 25 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 26 | datefmt='%m/%d/%Y %H:%M:%S', 27 | level=logging.INFO) 28 | 29 | def load_tokenizer(args): 30 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) 31 | # tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS}) 32 | return tokenizer 33 | 34 | def clean_str(string): 35 | """ 36 | Tokenization/string cleaning for all datasets except for SST. 37 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 38 | """ 39 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 40 | string = re.sub(r"\'s", " \'s", string) 41 | string = re.sub(r"\'ve", " \'ve", string) 42 | string = re.sub(r"n\'t", " n\'t", string) 43 | string = re.sub(r"\'re", " \'re", string) 44 | string = re.sub(r"\'d", " \'d", string) 45 | string = re.sub(r"\'ll", " \'ll", string) 46 | string = re.sub(r",", " , ", string) 47 | string = re.sub(r"!", " ! ", string) 48 | string = re.sub(r"\(", " \( ", string) 49 | string = re.sub(r"\)", " \) ", string) 50 | string = re.sub(r"\?", " \? ", string) 51 | string = re.sub(r"\s{2,}", " ", string) 52 | return string.strip().lower() 53 | 54 | 55 | def clean_doc(x, word_freq): 56 | stop_words = set(stopwords.words('english')) 57 | clean_docs = [] 58 | most_commons = dict(word_freq.most_common(min(len(word_freq), 50000))) 59 | for doc_content in x: 60 | doc_words = [] 61 | cleaned = clean_str(doc_content.strip()) 62 | for word in cleaned.split(): 63 | if word not in stop_words and word_freq[word] >= 5: 64 | if word in most_commons: 65 | doc_words.append(word) 66 | else: 67 | doc_words.append("") 68 | doc_str = ' '.join(doc_words).strip() 69 | clean_docs.append(doc_str) 70 | return clean_docs 71 | 72 | class InputExample(object): 73 | """ 74 | A single training/test example for simple sequence classification. 75 | Args: 76 | guid: Unique id for the example. 77 | text_a: string. The untokenized text of the first sequence. For single 78 | sequence tasks, only this sequence must be specified. 79 | label: (Optional) string. The label of the example. This should be 80 | specified for train and dev examples, but not for test examples. 
81 | """ 82 | 83 | def __init__(self, guid, text_a, label): 84 | self.guid = guid 85 | self.text_a = text_a 86 | self.label = label 87 | 88 | def __repr__(self): 89 | return str(self.to_json_string()) 90 | 91 | def to_dict(self): 92 | """Serializes this instance to a Python dictionary.""" 93 | output = copy.deepcopy(self.__dict__) 94 | return output 95 | 96 | def to_json_string(self): 97 | """Serializes this instance to a JSON string.""" 98 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 99 | 100 | class InputFeatures(object): 101 | """ 102 | A single set of features of data. 103 | Args: 104 | input_ids: Indices of input sequence tokens in the vocabulary. 105 | attention_mask: Mask to avoid performing attention on padding token indices. 106 | Mask values selected in ``[0, 1]``: 107 | Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. 108 | token_type_ids: Segment token indices to indicate first and second portions of the inputs. 109 | """ 110 | 111 | def __init__(self, input_ids, attention_mask, token_type_ids, label_id, 112 | e1_mask = None, e2_mask = None, keys=None): 113 | self.input_ids = input_ids 114 | self.attention_mask = attention_mask 115 | self.token_type_ids = token_type_ids 116 | self.label_id = label_id 117 | self.e1_mask = e1_mask 118 | self.e2_mask = e2_mask 119 | self.keys=keys 120 | 121 | def __repr__(self): 122 | return str(self.to_json_string()) 123 | 124 | def to_dict(self): 125 | """Serializes this instance to a Python dictionary.""" 126 | output = copy.deepcopy(self.__dict__) 127 | return output 128 | 129 | def to_json_string(self): 130 | """Serializes this instance to a JSON string.""" 131 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 132 | 133 | class Processor(object): 134 | """Processor for the text data set """ 135 | def __init__(self, args): 136 | self.args = args 137 | if self.args.task in ['agnews']: 138 | self.num_label = 4 139 | elif self.args.task in ['chemprot']: 140 | self.num_label = 10 141 | elif self.args.task in ['SST-2']: 142 | self.num_label = 2 143 | elif self.args.task in ['yelp-full', 'pubmed']: 144 | self.num_label = 5 145 | elif self.args.task in ['trec']: 146 | self.num_label = 6 147 | elif self.args.task in ['dbpedia']: 148 | self.num_label = 14 149 | self.relation_labels = [x for x in range(self.num_label)] 150 | self.label2id = {x:x for x in range(self.num_label)} 151 | self.id2label = {x:x for x in range(self.num_label)} 152 | 153 | def read_data(self, filename): 154 | path = filename 155 | with open(path, 'r') as f: 156 | data = f 157 | for x in data: 158 | yield json.loads(x) 159 | # return data 160 | 161 | def _create_examples(self, data, set_type): 162 | examples = [] 163 | for i, d in enumerate(data): 164 | guid = "%s-%s" % (set_type, i) 165 | text_a = d["text"] 166 | label = d["_id"] 167 | 168 | if i % 5000 == 0: 169 | logger.info(d) 170 | examples.append(InputExample(guid=guid, text_a=text_a, label=label)) 171 | return examples 172 | 173 | def get_examples(self, mode): 174 | """ 175 | Args: 176 | mode: train, dev, test 177 | """ 178 | file_to_read = None 179 | if mode == 'train': 180 | file_to_read = self.args.train_file 181 | elif mode == 'dev': 182 | file_to_read = self.args.dev_file 183 | elif mode == 'test': 184 | file_to_read = self.args.test_file 185 | elif mode == 'unlabeled': 186 | file_to_read = self.args.unlabel_file 187 | 188 | logger.info("LOOKING AT {}".format(os.path.join(self.args.data_dir, file_to_read))) 189 | return 
self._create_examples(self.read_data(os.path.join(self.args.data_dir, file_to_read)), mode) 190 | 191 | def load_and_cache_examples(args, tokenizer, mode, size = -1): 192 | processor = Processor(args) 193 | cached_features_file = os.path.join( 194 | args.data_dir, 195 | 'cached_{}_{}_{}_{}'.format( 196 | mode, 197 | args.task, 198 | list(filter(None, args.model_name_or_path.split("/"))).pop(), 199 | args.max_seq_len, 200 | ) 201 | ) 202 | if os.path.exists(cached_features_file) and args.auto_load: 203 | logger.info("Loading features from cached file %s", cached_features_file) 204 | features = torch.load(cached_features_file) 205 | else: 206 | logger.info("Creating features from dataset file at %s", args.data_dir) 207 | if mode == "train": 208 | examples = processor.get_examples("train") 209 | elif mode == "dev": 210 | examples = processor.get_examples("dev") 211 | elif mode == "test": 212 | examples = processor.get_examples("test") 213 | else: 214 | raise Exception("For mode, Only train, dev, test is available") 215 | features = convert_examples_to_features(examples, args.max_seq_len, tokenizer, add_sep_token=args.add_sep_token) 216 | logger.info("Saving features into cached file %s", cached_features_file) 217 | torch.save(features, cached_features_file) 218 | 219 | # Convert to Tensors and build dataset 220 | if size > 0: 221 | import random 222 | random.shuffle(features) 223 | features = features[:size] 224 | else: 225 | size = len(features) 226 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 227 | all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) 228 | all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) 229 | 230 | all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) 231 | all_ids = torch.tensor([ _ for _,f in enumerate(features)], dtype=torch.long) 232 | dataset = TensorDataset(all_input_ids, all_attention_mask, 233 | all_token_type_ids, all_label_ids, all_ids) 234 | return dataset, processor.num_label, size 235 | 236 | def load_and_cache_unlabeled_examples(args, tokenizer, mode, train_size = 100, size = -1): 237 | processor = Processor(args) 238 | 239 | # Load data features from cache or dataset file 240 | cached_features_file = os.path.join( 241 | args.data_dir, 242 | 'cached_{}_{}_{}_{}_unlabel'.format( 243 | mode, 244 | args.task, 245 | list(filter(None, args.model_name_or_path.split("/"))).pop(), 246 | args.max_seq_len, 247 | ) 248 | ) 249 | 250 | if os.path.exists(cached_features_file) and args.auto_load: 251 | logger.info("Loading features from cached file %s", cached_features_file) 252 | features = torch.load(cached_features_file) 253 | else: 254 | logger.info("Creating features from dataset file at %s", args.data_dir) 255 | 256 | assert mode == "unlabeled" 257 | examples = processor.get_examples("unlabeled") 258 | 259 | features = convert_examples_to_features(examples, args.max_seq_len, tokenizer, add_sep_token=args.add_sep_token) 260 | if size > 0: 261 | import random 262 | random.shuffle(features) 263 | features = features[:size] 264 | logger.info("Saving features into cached file %s", cached_features_file) 265 | torch.save(features, cached_features_file) 266 | 267 | # Convert to Tensors and build dataset 268 | if size > 0: 269 | features = features[:size] 270 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 271 | all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) 272 
| all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) 273 | 274 | all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) 275 | all_ids = torch.tensor([_+train_size for _ ,f in enumerate(features)], dtype=torch.long) 276 | 277 | dataset = TensorDataset(all_input_ids, all_attention_mask, 278 | all_token_type_ids, all_label_ids, all_ids) 279 | 280 | return dataset, len(features) 281 | 282 | def convert_examples_to_features(examples, max_seq_len, tokenizer, 283 | cls_token_segment_id=0, 284 | pad_token=0, 285 | pad_token_segment_id=0, 286 | sequence_a_segment_id=0, 287 | add_sep_token=False, 288 | mask_padding_with_zero=True, 289 | ): 290 | features = [] 291 | for (ex_index, example) in enumerate(examples[:]): 292 | if ex_index % 5000 == 0: 293 | logger.info("Writing example %d of %d" % (ex_index, len(examples))) 294 | tokens_a = tokenizer.tokenize(example.text_a) 295 | 296 | # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. 297 | if add_sep_token: 298 | special_tokens_count = 2 299 | else: 300 | special_tokens_count = 1 301 | if len(tokens_a) > max_seq_len - special_tokens_count: 302 | tokens_a = tokens_a[:(max_seq_len - special_tokens_count)] 303 | 304 | tokens = tokens_a 305 | if add_sep_token: 306 | sep_token = tokenizer.sep_token 307 | tokens += [sep_token] 308 | 309 | token_type_ids = [sequence_a_segment_id] * len(tokens) 310 | cls_token = tokenizer.cls_token 311 | tokens = [cls_token] + tokens 312 | token_type_ids = [cls_token_segment_id] + token_type_ids 313 | #tokens[0] = "$" 314 | #tokens[1] = "" 315 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 316 | 317 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 318 | # tokens are attended to. 319 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 320 | 321 | # Zero-pad up to the sequence length. 
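# (Illustrative note) E.g., with max_seq_len = 8 and 5 real tokens, the padding below
# gives attention_mask [1, 1, 1, 1, 1, 0, 0, 0]; input_ids and token_type_ids are
# right-padded with pad_token and pad_token_segment_id to the same length.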
322 | padding_length = max_seq_len - len(input_ids) 323 | input_ids = input_ids + ([pad_token] * padding_length) 324 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 325 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) 326 | 327 | 328 | assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len) 329 | assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len) 330 | assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len) 331 | 332 | label_id = int(example.label) 333 | 334 | if ex_index < 1: 335 | logger.info("*** Example ***") 336 | logger.info("guid: %s" % example.guid) 337 | logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) 338 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 339 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) 340 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) 341 | logger.info("label: %s (id = %d)" % (example.label, label_id)) 342 | features.append( 343 | InputFeatures( 344 | input_ids=input_ids, 345 | attention_mask=attention_mask, 346 | token_type_ids=token_type_ids, 347 | label_id=label_id, 348 | ) 349 | ) 350 | 351 | return features 352 | 353 | 354 | -------------------------------------------------------------------------------- /active_sampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from collections import Counter 4 | import time 5 | from scipy import stats 6 | import pandas as pd 7 | from torch.nn import functional as F 8 | from torch.utils.data.sampler import Sampler 9 | from torch.utils.data import TensorDataset, DataLoader, ConcatDataset 10 | import faiss 11 | from tqdm import tqdm, trange 12 | from sklearn.metrics import pairwise_distances 13 | from sklearn.cluster import KMeans, MiniBatchKMeans 14 | import copy 15 | import time 16 | 17 | 18 | def calc_entropy(x): 19 | # x is the number of occurrences of each label 20 | lst = [] 21 | for y in x: 22 | lst.append(x[y]) 23 | lst = np.array(lst) / np.max(lst) 24 | return -np.sum(lst * np.log(lst + 1e-12)) 25 | 26 | class SubsetSampler(Sampler): 27 | r"""Samples elements seqentially from a given list of indices, without replacement. 
28 | 29 | Arguments: 30 | indices (sequence): a sequence of indices 31 | """ 32 | 33 | def __init__(self, indices): 34 | self.indices = indices 35 | 36 | def __iter__(self): 37 | return (self.indices[i] for i in range(len(self.indices))) 38 | 39 | def __len__(self): 40 | return len(self.indices) 41 | 42 | 43 | class Active_sampler(object): 44 | 45 | def __init__(self, args, train_dataset, unlabeled_dataset, seed=0): 46 | self.args = args 47 | self.npr = np.random.RandomState(seed) 48 | self.train_dataset = train_dataset 49 | self.unlabeled_dataset = unlabeled_dataset 50 | self.pooled_dataset = None 51 | # self.get_sample = { 52 | # 'random': self.get_random, 53 | # 'entropy': self.get_max_entropy, 54 | # 'cal': self.get_cal, 55 | # } 56 | self.sample_class = Counter() 57 | self.st_class = Counter() 58 | self.mem = [] 59 | 60 | def convert_tensor_to_dataset(self, tensor, prediction = None): 61 | if prediction is None: 62 | return TensorDataset(tensor[0],tensor[1], tensor[2],tensor[3],tensor[4],) 63 | else: 64 | prediction = torch.FloatTensor(prediction) 65 | # print(tensor[0].shape,tensor[1].shape, tensor[2].shape,tensor[3].shape,tensor[4].shape, prediction.shape) 66 | return TensorDataset(tensor[0],tensor[1], tensor[2],tensor[3],tensor[4], prediction) 67 | 68 | def sample(self, method, train_pred, train_feat, train_label, unlabeled_pred, unlabeled_feat, unlabeled_label, entropy = None, n_sample = 100, n_unlabeled = 2048, round = 1): 69 | print(f"Active sampling: {method}, Samping {n_sample} data, add {n_unlabeled} to pool in total!") 70 | self.train_pred = train_pred 71 | self.train_feat = train_feat 72 | self.train_label = train_label 73 | self.unlabeled_pred = unlabeled_pred 74 | self.unlabeled_feat = unlabeled_feat 75 | self.unlabeled_label = unlabeled_label 76 | self.unlabel_pseudo = np.argmax(unlabeled_pred, axis = -1) 77 | self.unlabel_correct = (self.unlabel_pseudo == unlabeled_label).astype(int) 78 | len_unlabel = unlabeled_pred.shape[0] 79 | if method == 'cal' and self.args.smooth_prob == 1 and len(self.mem) > 0: 80 | unlabeled_pred = (1 - self.args.gamma) * self.mem + self.args.gamma * self.unlabeled_pred 81 | 82 | if method == 'random': 83 | idx = np.random.permutation(len_unlabel) 84 | value = np.sum(-np.log(unlabeled_pred + 1e-12) * unlabeled_pred, axis = -1) 85 | elif method == 'entropy': 86 | idx, value = self.get_max_entropy(unlabeled_pred, n_sample, n_unlabeled) 87 | elif method == 'cal': 88 | idx, value = self.get_cal(train_pred, train_feat, unlabeled_pred, unlabeled_feat, n_sample, n_unlabel = n_unlabeled) 89 | elif method == 'region_cal': 90 | idx, value = self.get_region_cal(train_pred, train_feat, unlabeled_pred, unlabeled_feat, n_sample, n_unlabel = n_unlabeled, ncentroids = self.args.n_centroids, sample_per_group=self.args.sample_per_group, beta = self.args.region_beta, weight = self.args.weight_embedding) 91 | elif method == 'region_entropy': 92 | idx, value = self.get_region_entropy(train_pred, train_feat, unlabeled_pred, unlabeled_feat, n_sample, n_unlabel = n_unlabeled, ncentroids = self.args.n_centroids, sample_per_group=self.args.sample_per_group, beta = self.args.region_beta, weight = self.args.weight_embedding) 93 | 94 | if len(self.mem) == 0: 95 | if self.args.smooth_prob == 1: 96 | self.mem = self.unlabeled_pred 97 | else: 98 | self.mem = value 99 | else: 100 | if self.args.gamma_scheduler == 1: # gradually upweight gamma in AL rounds 101 | gamma = self.args.gamma_min + (self.args.gamma - self.args.gamma_min) * ((round-1) / (self.args.rounds-2)) 102 | 
else: 103 | gamma = self.args.gamma 104 | print("Gamma", gamma) 105 | if self.args.smooth_prob == 1: 106 | self.mem = (1 - gamma) * self.mem + gamma * self.unlabeled_pred 107 | else: 108 | self.mem = (1 - gamma) * self.mem + gamma * value 109 | idx = list(idx) 110 | sample_idx = idx[:n_sample] 111 | if self.args.smooth_prob == 1: 112 | save_idx = idx[n_sample:] 113 | smooth_val = np.sum(-np.log(self.mem + 1e-12) * self.mem, axis = -1) 114 | smooth_idx = list(np.argsort(smooth_val))[::-1] 115 | else: 116 | smooth_idx = list(np.argsort(self.mem))[::-1] 117 | save_idx = idx[n_sample : -n_unlabeled] 118 | pool_idx = smooth_idx[-n_unlabeled:] 119 | pool_idx = pool_idx[::-1] 120 | 121 | indexes = np.arange(len(idx)) 122 | n_class = unlabeled_pred.shape[-1] 123 | pool_idx_class = [] 124 | save_idx_class = [] 125 | sample_idx_class = [] 126 | for i in range(n_class): 127 | label_idx = (self.unlabel_pseudo == i) 128 | if self.args.smooth_prob == 1: 129 | value_class = value[label_idx] 130 | else: 131 | value_class = self.mem[label_idx] 132 | index_class = indexes[label_idx] 133 | 134 | class_idx = np.argsort(value_class)[::-1] 135 | sorted_index = index_class[class_idx] 136 | 137 | pool_idx_tmp = list(sorted_index[-n_unlabeled//n_class:]) 138 | sample_idx_tmp = list(sorted_index[:n_sample//n_class]) 139 | save_idx_tmp = list(sorted_index[n_sample//n_class:-n_unlabeled//n_class]) 140 | pool_idx_class += pool_idx_tmp 141 | save_idx_class += save_idx_tmp 142 | sample_idx_class += sample_idx_tmp 143 | 144 | # if self.args.balance_st: 145 | # pool_idx = pool_idx_class 146 | # if self.args.balance_query: 147 | # sample_idx = sample_idx_class 148 | items = {} 149 | for x in sample_idx: 150 | items[x] = 1 151 | 152 | pool_idx = list( set(pool_idx) - (set(pool_idx) & set(sample_idx)) ) 153 | if self.args.smooth_prob == 1: 154 | for x in sample_idx: 155 | items[x] = 1 156 | else: 157 | for x in pool_idx: 158 | items[x] = 1 159 | # if self.args.balance_st or self.args.balance_query: 160 | # save_idx = [i for i in range(len(idx)) if i not in items] 161 | self.mem = self.mem[save_idx] 162 | print(self.mem.shape) 163 | sample_dataset = self.convert_tensor_to_dataset(self.unlabeled_dataset[sample_idx]) 164 | unlabeled_dataset = self.convert_tensor_to_dataset(self.unlabeled_dataset[save_idx]) 165 | pooled_dataset = self.convert_tensor_to_dataset(self.unlabeled_dataset[pool_idx], unlabeled_pred[pool_idx]) 166 | 167 | train_dataset = ConcatDataset([self.train_dataset, sample_dataset]) 168 | self.train_dataset = train_dataset 169 | 170 | if self.args.smooth_prob == 1: 171 | self.pooled_dataset = pooled_dataset 172 | self.unlabeled_dataset = unlabeled_dataset 173 | else: 174 | self.unlabeled_dataset = unlabeled_dataset 175 | if self.pooled_dataset: 176 | self.pooled_dataset = ConcatDataset([self.pooled_dataset, pooled_dataset]) 177 | else: 178 | self.pooled_dataset = pooled_dataset 179 | self.sample_class.update(unlabeled_label[sample_idx]) 180 | self.st_class.update(np.argmax(unlabeled_pred[pool_idx], axis = -1)) 181 | return self.train_dataset, self.unlabeled_dataset, self.pooled_dataset 182 | 183 | def get_random(self, unlabeled_pred, n_sample): 184 | entropy = np.sum(-np.log(unlabeled_pred + 1e-12) * unlabeled_pred, axis = -1) 185 | 186 | len_unlabel = unlabeled_pred.shape[0] 187 | rand_idx = np.random.permutation(len_unlabel) 188 | return rand_idx, entropy 189 | 190 | def get_max_entropy(self, unlabeled_pred, n_sample, n_unlabel = 2048): 191 | entropy = np.sum(-np.log(unlabeled_pred + 1e-12) * unlabeled_pred, 
axis = -1) 192 | idx = np.argsort(entropy)[::-1] 193 | return idx, entropy 194 | 195 | def get_cal(self, train_pred, train_feat, unlabeled_pred, unlabeled_feat, n_sample, n_unlabel, k = 10): 196 | d = train_feat.shape[-1] 197 | index = faiss.IndexFlatL2(d) 198 | index.add(train_feat) 199 | D, I = index.search(unlabeled_feat, k) 200 | # print(I.shape) 201 | # print(train_pred[I].shape) 202 | # print(unlabeled_pred.shape) 203 | unlabeled_pred = np.expand_dims(unlabeled_pred, axis = 1) 204 | # print(unlabeled_pred.shape) 205 | score = np.log((1e-10+train_pred[I])/ (1e-10+unlabeled_pred)) * train_pred[I] 206 | # print(score.shape) 207 | mean_kl = np.mean(np.sum(score, axis = -1), axis = -1) 208 | idx = np.argsort(mean_kl)[::-1] 209 | sample_idx = list(idx[:n_sample]) 210 | save_idx = list(idx[n_sample:]) 211 | sample_dataset = self.convert_tensor_to_dataset(self.unlabeled_dataset[sample_idx]) 212 | unlabeled_dataset = self.convert_tensor_to_dataset(self.unlabeled_dataset[save_idx]) 213 | train_dataset = ConcatDataset([self.train_dataset, sample_dataset]) 214 | # self.train_dataset = train_dataset 215 | # self.unlabeled_dataset = unlabeled_dataset 216 | return idx, mean_kl 217 | 218 | 219 | def get_region_cal(self, train_pred, train_feat, unlabeled_pred, unlabeled_feat, n_sample, n_unlabel, ncentroids = 25, sample_per_group=10, beta = 1, k = 10, weight = True): 220 | d = train_feat.shape[-1] 221 | index = faiss.IndexFlatL2(d) 222 | index.add(train_feat) 223 | D, I = index.search(unlabeled_feat, k) 224 | unlabeled_pred_expand = np.expand_dims(unlabeled_pred, axis = 1) 225 | score = np.log((1e-10+train_pred[I])/ (1e-10+unlabeled_pred_expand)) * train_pred[I] 226 | entropy = np.mean(np.sum(score, axis = -1), axis = -1) 227 | 228 | d = unlabeled_feat.shape[-1] 229 | if weight: 230 | kmeans = MiniBatchKMeans(n_clusters = ncentroids, random_state=0, batch_size=256, n_init=3, max_iter=100) 231 | kmeans.fit(unlabeled_feat, sample_weight = copy.deepcopy(entropy)) 232 | index = faiss.IndexFlatL2(d) 233 | index.add(kmeans.cluster_centers_) 234 | D, I = index.search(unlabeled_feat, 1) 235 | else: 236 | kmeans = faiss.Clustering(int(d), ncentroids) 237 | index = faiss.IndexFlatL2(d) 238 | kmeans.train(unlabeled_feat, index) 239 | centroid = faiss.vector_to_array(kmeans.centroids).reshape(ncentroids, -1) 240 | index.add(centroid) 241 | D, I = index.search(unlabeled_feat, 1) 242 | I = I.flatten() 243 | unlabeled_pseudo = np.argmax(unlabeled_pred, axis = 1) 244 | scores = [] 245 | indexes = [] 246 | for i in range(ncentroids): 247 | idx = (I == i) 248 | cnt = Counter() 249 | mean_entropy = np.mean(entropy[idx]) 250 | for z in unlabeled_pseudo[idx]: 251 | cnt[z] += 1 252 | class_entropy = calc_entropy(cnt) 253 | value = mean_entropy + beta * class_entropy 254 | scores.append(value) 255 | sorted_idx = np.argsort(entropy[idx]) 256 | idxs = np.arange(len(I))[idx][sorted_idx] 257 | indexes.append(list(idxs)) 258 | sample_idx = [] 259 | remains = n_sample 260 | for i in np.argsort(scores)[::-1]: 261 | if self.args.task == "SST-2": 262 | topK = 10 263 | else: 264 | topK = 20 265 | sample_idx += indexes[i][-min(remains, sample_per_group, len(indexes[i])//topK):] 266 | indexes[i] = indexes[i][:-min(remains, sample_per_group, len(indexes[i])//topK)] 267 | remains -= len( indexes[i][-min(remains, sample_per_group, len(indexes[i])//topK):]) 268 | if remains <= 0: 269 | break 270 | for y in indexes: 271 | sample_idx += y 272 | return sample_idx, entropy 273 | 274 | def get_region_entropy(self, train_pred, train_feat, 
unlabeled_pred, unlabeled_feat, n_sample, n_unlabel, ncentroids = 25, sample_per_group=10, beta = 1, weight = True): 275 | entropy = np.sum(-np.log(unlabeled_pred + 1e-12) * unlabeled_pred, axis = -1) 276 | d = unlabeled_feat.shape[-1] 277 | if weight: # use weighted K-Means Clustering 278 | kmeans = MiniBatchKMeans(n_clusters = ncentroids, random_state=0, batch_size=256, n_init=3, max_iter=100) 279 | kmeans.fit(unlabeled_feat, sample_weight = copy.deepcopy(entropy)) 280 | index = faiss.IndexFlatL2(d) 281 | index.add(kmeans.cluster_centers_) 282 | D, I = index.search(unlabeled_feat, 1) 283 | else: 284 | kmeans = faiss.Clustering(int(d), ncentroids) 285 | index = faiss.IndexFlatL2(d) 286 | kmeans.train(unlabeled_feat, index) 287 | centroid = faiss.vector_to_array(kmeans.centroids).reshape(ncentroids, -1) 288 | index.add(centroid) 289 | D, I = index.search(unlabeled_feat, 1) 290 | I = I.flatten() 291 | unlabeled_pseudo = np.argmax(unlabeled_pred, axis = 1) 292 | scores = [] 293 | indexes = [] 294 | for i in range(ncentroids): 295 | idx = (I == i) 296 | cnt = Counter() 297 | # calculate the mean entropy of samples 298 | mean_entropy = np.mean(entropy[idx]) 299 | for z in unlabeled_pseudo[idx]: 300 | cnt[z] += 1 301 | # calculate the mean entropy of pseudo labels 302 | class_entropy = calc_entropy(cnt) 303 | value = mean_entropy + beta * class_entropy 304 | scores.append(value) 305 | sorted_idx = np.argsort(entropy[idx]) 306 | idxs = np.arange(len(I))[idx][sorted_idx] 307 | indexes.append(list(idxs)) 308 | sample_idx = [] 309 | remains = n_sample 310 | for i in np.argsort(scores)[::-1]: 311 | if self.args.task == "SST-2": 312 | topK = 10 313 | else: 314 | topK = 20 315 | sample_idx += indexes[i][-min(remains, sample_per_group, len(indexes[i])//topK):] 316 | indexes[i] = indexes[i][:-min(remains, sample_per_group, len(indexes[i])//topK)] 317 | remains -= len( indexes[i][-min(remains, sample_per_group, len(indexes[i])//topK):]) 318 | if remains <= 0: 319 | break 320 | for y in indexes: 321 | sample_idx += y 322 | return sample_idx, entropy 323 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from tqdm import tqdm, trange 4 | from collections import Counter 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, ConcatDataset, TensorDataset 10 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Subset 11 | from torch.utils.data.sampler import SubsetRandomSampler 12 | from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup 13 | from tqdm import tqdm, trange 14 | from transformers import BertConfig, AdamW, get_linear_schedule_with_warmup, AutoConfig, AutoModelForSequenceClassification 15 | import copy 16 | import math 17 | import os 18 | import random 19 | from active_sampler import Active_sampler 20 | import matplotlib 21 | matplotlib.use("Agg") 22 | import matplotlib.pyplot as plt 23 | import json 24 | try: 25 | from torch.utils.tensorboard import SummaryWriter 26 | except ImportError: 27 | from tensorboardX import SummaryWriter 28 | 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | def set_seed(args): 33 | random.seed(args.seed) 34 | np.random.seed(args.seed) 35 | torch.manual_seed(args.seed) 36 | if args.n_gpu > 0 and 
torch.cuda.is_available(): 37 | # print('yes') 38 | # assert 0 39 | torch.cuda.manual_seed_all(args.seed) 40 | torch.cuda.manual_seed(args.seed) 41 | torch.backends.cudnn.deterministic = True 42 | torch.backends.cudnn.benchmark = False 43 | 44 | def compute_metrics(preds, labels): 45 | assert len(preds) == len(labels) 46 | return acc_and_f1(preds, labels) 47 | 48 | def acc_and_f1(preds, labels, average='macro'): 49 | acc = (preds == labels).mean() 50 | 51 | return { 52 | "acc": acc, 53 | } 54 | 55 | 56 | class Trainer(object): 57 | def __init__(self, args, train_dataset = None, dev_dataset = None, test_dataset = None, unlabeled = None, \ 58 | num_labels = 10, data_size = 100, n_gpu = 1): 59 | self.args = args 60 | self.train_dataset = train_dataset 61 | self.dev_dataset = dev_dataset 62 | self.test_dataset = test_dataset 63 | self.unlabeled = unlabeled 64 | self.data_size = data_size 65 | 66 | self.num_labels = num_labels 67 | self.config_class = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=self.num_labels) 68 | self.model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path, num_labels=self.num_labels) 69 | self.n_gpu = n_gpu 70 | self.tb_writer = SummaryWriter(self.args.tsb_dir) 71 | self.active_sampler = Active_sampler(args = self.args, train_dataset = self.train_dataset, unlabeled_dataset = self.unlabeled) 72 | 73 | def soft_frequency(self, logits, soft = True): 74 | """ 75 | Unsupervised Deep Embedding for Clustering Analysis 76 | https://arxiv.org/abs/1511.06335 77 | """ 78 | power = self.args.self_training_power 79 | y = logits 80 | f = torch.sum(y, dim=0) 81 | t = y**power / f 82 | t = t + 1e-10 83 | p = t/torch.sum(t, dim=-1, keepdim=True) 84 | return p if soft else torch.argmax(p, dim=1) 85 | 86 | def calc_loss(self, input, target, loss, thresh = 0.5, soft = True, conf = None, is_prob = False): 87 | softmax = nn.Softmax(dim=1) 88 | if not is_prob: 89 | target = softmax(target.view(-1, target.shape[-1])).view(target.shape) 90 | 91 | if conf == 'max': 92 | weight = torch.max(target, axis = 1).values 93 | w = torch.FloatTensor([1 if x == True else 0 for x in weight>thresh]).to(target.device) 94 | elif conf == 'entropy': 95 | weight = torch.sum(-torch.log(target+1e-6) * target, dim = 1) 96 | weight = 1 - weight / np.log(weight.size(-1)) 97 | w = torch.FloatTensor([1 if x == True else 0 for x in weight>thresh]).to(target.device) 98 | elif conf is None: 99 | weight = torch.ones(target.shape[0]).to(target.device) 100 | w = torch.ones(target.shape[0]).to(target.device) 101 | 102 | target = self.soft_frequency(target, soft = soft) 103 | loss_batch = loss(input, target) 104 | l = loss_batch * w.unsqueeze(1) * weight.unsqueeze(1) 105 | return l, weight, w 106 | 107 | def gce_loss(self, input, target, thresh = 0.5, soft = True, conf = None, is_prob = False): 108 | softmax = nn.Softmax(dim=1) 109 | if not is_prob: 110 | target = softmax(target.view(-1, target.shape[-1])).view(target.shape) 111 | weight = torch.max(target, axis = 1).values 112 | target = torch.argmax(target, dim = -1) 113 | if self.args.gce_loss_q == 0: 114 | if input.size(-1) == 1: 115 | ce_loss = nn.BCEWithLogitsLoss(reduction='none') 116 | loss = ce_loss(input.view(-1), input.float()) 117 | else: 118 | ce_loss = nn.CrossEntropyLoss(reduction='none') 119 | loss = ce_loss(input, target) 120 | else: 121 | if input.size(-1) == 1: 122 | pred = torch.sigmoid(input) 123 | pred = torch.cat((1-pred, pred), dim=-1) 124 | else: 125 | pred = F.softmax(input, dim=-1) 126 | pred_ = 
torch.gather(pred, dim=-1, index=torch.unsqueeze(target, -1)) 127 | w = pred_ > thresh 128 | loss = (1 - pred_ ** self.args.gce_loss_q) / self.args.gce_loss_q 129 | loss = (loss * w) 130 | return loss, weight, w 131 | 132 | def init_model(self): 133 | # GPU or CPU 134 | self.device = "cuda" if torch.cuda.is_available() and self.n_gpu > 0 else "cpu" 135 | if self.n_gpu > 1: 136 | self.model = nn.DataParallel(self.model) 137 | self.model = self.model.to(self.device) 138 | 139 | def load_model(self, path = None): 140 | if path is None: 141 | logger.info("No ckpt path, load from original ckpt!") 142 | self.model = AutoModelForSequenceClassification.from_pretrained( 143 | self.args.model_name_or_path, 144 | config=self.config_class, 145 | cache_dir=self.args.cache_dir if self.args.cache_dir else None, 146 | ).to(self.device) 147 | else: 148 | logger.info(f"Loading from {path}!") 149 | self.model = AutoModelForSequenceClassification.from_pretrained( 150 | path, 151 | config=self.config_class, 152 | cache_dir=self.args.cache_dir if self.args.cache_dir else None, 153 | ).to(self.device) 154 | self.init_model() 155 | 156 | def reinit_model(self): 157 | self.model = AutoModelForSequenceClassification.from_pretrained( 158 | self.args.model_name_or_path, 159 | config=self.config_class, 160 | cache_dir=self.args.cache_dir if self.args.cache_dir else None, 161 | ).to(self.device) 162 | self.init_model() 163 | 164 | def save_dataset(self, stage = 0): 165 | output_dir = os.path.join( 166 | self.args.output_dir, "dataset", "dataset-{}-{}-{}-{}".format(self.args.model_type, self.args.method, self.args.al_method, stage)) 167 | if not os.path.exists(output_dir): 168 | os.makedirs(output_dir) 169 | torch.save(self.train_dataset, os.path.join(output_dir, 'train')) 170 | torch.save(self.dev_dataset, os.path.join(output_dir, 'dev')) 171 | torch.save(self.test_dataset, os.path.join(output_dir, 'test')) 172 | torch.save(self.unlabeled, os.path.join(output_dir, 'unlabeled')) 173 | if self.pooled: 174 | torch.save(self.unlabeled, os.path.join(output_dir, 'pooled')) 175 | 176 | 177 | def load_dataset(self, stage = 0): 178 | load_dir = os.path.join( 179 | self.args.output_dir, "dataset", "dataset-{}-{}-{}-{}".format(self.args.model_type, self.args.method, self.args.al_method, stage)) 180 | if not os.path.exists(load_dir): 181 | # except: 182 | load_dir = os.path.join( 183 | self.args.output_dir, "dataset", "dataset-{}-{}-{}".format(self.args.model_type, self.args.al_method, stage)) 184 | self.train_dataset = torch.load(os.path.join(load_dir, 'train')) 185 | self.dev_dataset = torch.load(os.path.join(load_dir, 'dev')) 186 | self.test_dataset = torch.load(os.path.join(load_dir, 'test')) 187 | self.unlabeled = torch.load(os.path.join(load_dir, 'unlabeled')) 188 | 189 | 190 | def save_result(self, stage = 0, acc = 0, self_training = False): 191 | if self_training: 192 | setup = 'self_training' 193 | else: 194 | setup = 'train' 195 | output_dir = os.path.join( 196 | self.args.output_dir, "result", "result-{}-{}-{}-{}-{}".format(self.args.model_type,self.args.method, self.args.al_method, setup, stage)) 197 | if not os.path.exists(output_dir): 198 | os.makedirs(output_dir) 199 | with open(os.path.join(output_dir, 'acc.json') , 'w') as f: 200 | json.dump({"acc": acc, "stage": stage, "method": self.args.method, "model_type":self.args.model_type, "al_method": self.args.al_method}, f) 201 | 202 | def save_model(self, stage = 0, self_training = False): 203 | if self_training: 204 | setup = 'self_training' 205 | else: 206 | setup = 
'train' 207 | output_dir = os.path.join( 208 | self.args.output_dir, "model", "checkpoint-{}-{}-{}-{}-{}".format(self.args.model_type,self.args.method, self.args.al_method, setup, stage)) 209 | if not os.path.exists(output_dir): 210 | os.makedirs(output_dir) 211 | model_to_save = ( 212 | self.model.module if hasattr(self.model, "module") else self.model 213 | ) # Take care of distributed/parallel training 214 | model_to_save.save_pretrained(output_dir) 215 | torch.save(self.args, os.path.join(output_dir, "training_args.bin")) 216 | # torch.save(self.model.state_dict(), os.path.join(output_dir, "model.pt")) 217 | logger.info("Saving model checkpoint to %s", output_dir) 218 | 219 | 220 | def train(self, n_sample = 20): 221 | train_sampler = RandomSampler(self.train_dataset) 222 | train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.batch_size) 223 | 224 | no_decay = ['bias', 'LayerNorm.weight'] 225 | optimizer_grouped_parameters = [ 226 | {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 227 | 'weight_decay': self.args.weight_decay}, 228 | {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 229 | ] 230 | optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) 231 | training_steps = int(self.args.num_train_epochs) * len(train_dataloader) 232 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = int(training_steps * 0.05), num_training_steps = training_steps) 233 | 234 | # Train! 235 | logger.info("***** Running training *****") 236 | logger.info(" Num examples = %d", len(self.train_dataset)) 237 | logger.info(" Num Epochs = %d", self.args.num_train_epochs) 238 | logger.info(" Total train batch size = %d", self.args.batch_size) 239 | logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) 240 | logger.info(" Total optimization steps = %d", training_steps) 241 | global_step = 0 242 | tr_loss = 0.0 243 | self.model.zero_grad() 244 | 245 | train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch") 246 | set_seed(self.args) 247 | criterion = nn.CrossEntropyLoss(reduction = 'mean') 248 | best_model = None 249 | best_dev = -np.float('inf') 250 | for _ in train_iterator: 251 | epoch_iterator = tqdm(train_dataloader, desc="Iteration") 252 | for step, batch in enumerate(epoch_iterator): 253 | self.model.train() 254 | batch = tuple(t.to(self.device) for t in batch) # GPU or CPU 255 | inputs = { 256 | 'input_ids': batch[0], 257 | 'attention_mask': batch[1], 258 | 'token_type_ids': batch[2], 259 | 'labels': batch[3], 260 | } 261 | outputs = self.model(**inputs) 262 | loss = outputs[0] 263 | logits = outputs[1] 264 | if self.args.gradient_accumulation_steps > 1: 265 | loss = loss / self.args.gradient_accumulation_steps 266 | if torch.cuda.device_count() > 1: 267 | loss = loss.mean() 268 | loss.backward() 269 | tr_loss += loss.item() 270 | if (step + 1) % self.args.gradient_accumulation_steps == 0: 271 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm) 272 | optimizer.step() 273 | scheduler.step() # Update learning rate schedule 274 | self.model.zero_grad() 275 | global_step += 1 276 | epoch_iterator.set_description("iteration:%d, Loss:%.3f, best dev:%.3f" % (_, tr_loss/global_step, 100*best_dev)) 277 | if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or (step in 
[len(train_dataloader)//2, len(train_dataloader)//4]): 278 | loss_dev, acc_dev = self.evaluate('dev', global_step) 279 | self.tb_writer.add_scalar(f"FT_Dev_acc_sample{n_sample}", acc_dev, global_step) 280 | if acc_dev > best_dev: 281 | logger.info("Best model updated!") 282 | self.best_model = copy.deepcopy(self.model.state_dict()) 283 | best_dev = acc_dev 284 | if self.args.save_steps > 0 and global_step % self.args.save_steps == 0: 285 | self.save_model(stage = n_sample) 286 | 287 | if 0 < training_steps < global_step: 288 | epoch_iterator.close() 289 | break 290 | loss_dev, acc_dev = self.evaluate('dev', global_step) 291 | print(f'Dev: Loss: {loss_dev}, Acc: {acc_dev}') 292 | self.tb_writer.add_scalar(f"FT_Dev_acc_sample{n_sample}", acc_dev, global_step) 293 | if acc_dev > best_dev: 294 | logger.info("Best model updated!") 295 | self.best_model = copy.deepcopy(self.model.state_dict()) 296 | best_dev = acc_dev 297 | self.model.load_state_dict(self.best_model) 298 | loss_test, acc_test = self.evaluate('test', global_step) 299 | print(f'Test: Loss: {loss_test}, Acc: {acc_test}') 300 | self.tb_writer.add_scalar(f"FT_Test_acc_{self.args.method}_seed{self.args.seed}", acc_test, n_sample) 301 | self.save_model(stage = n_sample) 302 | self.save_result(stage = n_sample, acc = acc_test, self_training = False) 303 | return global_step, tr_loss / global_step 304 | 305 | def active_selftrain(self, soft = True, n_sample = 50): 306 | train_sampler = RandomSampler(self.train_dataset) 307 | train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.batch_size) 308 | train_dataloader_iter = iter(train_dataloader) 309 | unlabeled_sampler = RandomSampler(self.pooled) 310 | unlabeled_dataloader = DataLoader(self.pooled, sampler=unlabeled_sampler, batch_size=self.args.self_training_batch_size) 311 | unlabeled_dataloader_iter = iter(unlabeled_dataloader) 312 | if self.args.self_training_max_step > 0: 313 | t_total = self.args.self_training_max_step 314 | self.args.num_train_epochs = self.args.self_training_max_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 315 | else: 316 | t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs 317 | 318 | # Prepare optimizer and schedule (linear warmup and decay) 319 | no_decay = ['bias', 'LayerNorm.weight'] 320 | optimizer_grouped_parameters = [ 321 | {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 322 | 'weight_decay': self.args.weight_decay}, 323 | {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 324 | ] 325 | optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate * 0.25, eps=self.args.adam_epsilon) 326 | self_training_loss = nn.KLDivLoss(reduction = 'none') if soft else nn.CrossEntropyLoss(reduction = 'none') 327 | softmax = nn.Softmax(dim=1) 328 | update_step = 0 329 | self_training_steps = self.args.self_training_max_step 330 | global_step = 0 331 | selftrain_loss = 0 332 | best_model = None 333 | best_dev = -np.float('inf') 334 | set_seed(self.args) 335 | step_iterator = trange(int(self_training_steps * self.args.gradient_accumulation_steps)) 336 | for step in step_iterator: 337 | # epoch_iterator = tqdm(train_dataloader, desc="SelfTrain, Iteration") 338 | try: 339 | batch = next(train_dataloader_iter) 340 | except StopIteration: 341 | logger.info("Finished iterating Train dataset, begin reiterate") 342 | train_dataloader_iter 
= iter(train_dataloader) 343 | batch = next(train_dataloader_iter) 344 | try: 345 | batch_unlabeled = next(unlabeled_dataloader_iter) 346 | except StopIteration: 347 | logger.info("Finished iterating Unlabeled dataset, begin reiterate") 348 | unlabeled_dataloader_iter = iter(unlabeled_dataloader) 349 | batch_unlabeled = next(unlabeled_dataloader_iter) 350 | self.model.train() 351 | batch = tuple(t.to(self.device) for t in batch) # GPU or CPU 352 | inputs_train = { 353 | 'input_ids': batch[0], 354 | 'attention_mask': batch[1], 355 | 'token_type_ids': batch[2], 356 | 'labels': batch[3], 357 | 'output_hidden_states':True 358 | } 359 | 360 | batch_unlabeled = tuple(t.to(self.device) for t in batch_unlabeled) # GPU or CPU 361 | inputs_unlabeled = { 362 | 'input_ids': batch_unlabeled[0], 363 | 'attention_mask': batch_unlabeled[1], 364 | 'token_type_ids': batch_unlabeled[2], 365 | 'labels': batch_unlabeled[3], # Never use this! 366 | "output_hidden_states": True 367 | } 368 | outputs_train = self.model(**inputs_train) 369 | outputs = self.model(**inputs_unlabeled) 370 | outputs_pseudo = batch_unlabeled[-1] 371 | logits = outputs[1] 372 | if self.args.gce_loss: # an alternative for denoising function, that can further boost the performance :) We do not use it in our main experiments. 373 | loss_st, weight, w = self.gce_loss(input = logits, \ 374 | target= outputs_pseudo, \ 375 | thresh = self.args.self_training_eps, \ 376 | soft = soft, \ 377 | conf = 'max', \ 378 | is_prob = True) 379 | else: 380 | loss_st, weight, w = self.calc_loss(input = torch.log(softmax(logits)), \ 381 | target= outputs_pseudo, \ 382 | loss = self_training_loss, \ 383 | thresh = self.args.self_training_eps, \ 384 | soft = False, \ 385 | conf = 'max', \ 386 | is_prob = True) 387 | weight = weight.unsqueeze(1).detach().cpu().numpy() 388 | w = w.flatten().bool().detach().cpu().numpy() 389 | 390 | train_loss = outputs_train[0] 391 | if torch.cuda.device_count() > 1: 392 | train_loss = train_loss.mean() 393 | loss_st = loss_st.mean() 394 | loss = (1 - self.args.self_training_weight) * train_loss + self.args.self_training_weight * loss_st 395 | clean_loss = train_loss.item() 396 | selftrain_loss = loss_st.item() 397 | all_loss = loss.item() 398 | loss.backward() 399 | if (step + 1) % self.args.gradient_accumulation_steps == 0: 400 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm) 401 | optimizer.step() 402 | self.model.zero_grad() 403 | global_step += 1 404 | step_iterator.set_description("Active SelfTrain iter:%d Loss:%.3f, weight: %.2f, Clean loss: %.3f, selftrain loss: %.3f" % (step, all_loss, self.args.self_training_weight, clean_loss, selftrain_loss)) 405 | 406 | if global_step % self.args.self_train_logging_steps == 0: 407 | loss_dev, acc_dev = self.evaluate('dev', global_step) 408 | self.tb_writer.add_scalar(f"ST_Acc_Dev_sample{n_sample}", acc_dev, step) 409 | print(f'Stage 1, Dev: Loss: {loss_dev}, Acc: {acc_dev}') 410 | if acc_dev > best_dev: 411 | logger.info("Best model updated!") 412 | best_model = copy.deepcopy(self.model.state_dict()) 413 | best_dev = acc_dev 414 | 415 | self.model.load_state_dict(best_model) 416 | loss_test, acc_test = self.evaluate('test', global_step) 417 | print(f'Test: Loss: {loss_test}, Acc: {acc_test}') 418 | self.tb_writer.add_scalar(f"ST_Test_acc_{self.args.method}_seed{self.args.seed}", acc_test, n_sample) 419 | self.save_model(stage = n_sample, self_training = True) 420 | self.save_result(stage = n_sample, acc = acc_test, self_training = True) 421 | 422 | 
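# Note on the objective optimized in active_selftrain above: each step mixes the
# supervised loss on the labeled batch with a pseudo-label loss on the pooled
# unlabeled batch, i.e. loss = (1 - self_training_weight) * L_labeled
# + self_training_weight * L_selftrain. The pseudo targets come from the predictions
# cached in the pooled dataset at sampling time (batch_unlabeled[-1]), not from the
# current model, and L_selftrain is either the confidence-thresholded KL/CE term in
# calc_loss or the generalized cross-entropy variant in gce_loss (controlled by
# --gce_loss and --gce_loss_q).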
def sample(self, n_sample = 20, n_unlabeled = 2048, round = 1): 423 | train_pred, train_feat, train_label, unlabeled_pred, unlabeled_feat, unlabeled_label, unlabeled_logits = self.inference(layer = -1) 424 | 425 | new_train, new_unlabeled, pooled = self.active_sampler.sample(self.args.al_method, train_pred, train_feat, train_label, unlabeled_pred, \ 426 | unlabeled_feat, unlabeled_label, n_sample= n_sample, n_unlabeled = n_unlabeled, round = round) 427 | self.train_dataset = new_train 428 | self.unlabeled = new_unlabeled 429 | self.pooled = pooled 430 | print(f"======= train {len(new_train)}, unlabel {len(new_unlabeled)} pool {len(pooled)} =========") 431 | self.save_dataset(stage = n_sample) 432 | return new_train, new_unlabeled 433 | 434 | 435 | def inference(self, layer = -1): 436 | ## Inference the embeddings/predictions for unlabeled data 437 | train_dataloader = DataLoader(self.train_dataset, shuffle=False, batch_size=self.args.eval_batch_size) 438 | train_pred = [] 439 | 440 | train_feat = [] 441 | train_label = [] 442 | self.model.eval() 443 | softmax = nn.Softmax(dim = 1) 444 | for batch in tqdm(train_dataloader, desc="Evaluating Labeled Set"): 445 | batch = tuple(t.to(self.device) for t in batch) 446 | with torch.no_grad(): 447 | inputs = { 448 | 'input_ids': batch[0], 449 | 'attention_mask': batch[1], 450 | 'token_type_ids': batch[2], 451 | 'labels': batch[3], 452 | 'output_hidden_states': True 453 | } 454 | outputs = self.model(**inputs) 455 | tmp_eval_loss, logits, feats = outputs[0], outputs[1], outputs[2] 456 | # print(outputs) 457 | logits = softmax(logits).detach().cpu().numpy() 458 | train_pred.append(logits) 459 | train_feat.append(feats[layer][:, 0, :].detach().cpu().numpy()) 460 | train_label.append(batch[3].detach().cpu().numpy()) 461 | train_pred = np.concatenate(train_pred, axis = 0) 462 | train_feat = np.concatenate(train_feat, axis = 0) 463 | train_label = np.concatenate(train_label, axis = 0) 464 | train_conf = np.amax(train_pred, axis = 1) 465 | print("train size:", train_pred.shape, train_feat.shape, train_label.shape, train_conf.shape) 466 | unlabeled_dataloader = DataLoader(self.unlabeled, shuffle=False, batch_size=self.args.eval_batch_size) 467 | unlabeled_pred = [] 468 | unlabeled_logits = [] 469 | unlabeled_feat = [] 470 | unlabeled_label = [] 471 | self.model.eval() 472 | for batch in tqdm(unlabeled_dataloader, desc="Evaluating Unlabeled Set"): 473 | batch = tuple(t.to(self.device) for t in batch) 474 | with torch.no_grad(): 475 | inputs = { 476 | 'input_ids': batch[0], 477 | 'attention_mask': batch[1], 478 | 'token_type_ids': batch[2], 479 | 'labels': batch[3], 480 | 'output_hidden_states': True 481 | } 482 | outputs = self.model(**inputs) 483 | tmp_eval_loss, logits, feats = outputs[0], outputs[1], outputs[2] 484 | unlabeled_logits.append(logits.detach().cpu().numpy()) 485 | logits = softmax(logits).detach().cpu().numpy() 486 | unlabeled_pred.append(logits) 487 | unlabeled_feat.append(feats[layer][:, 0, :].detach().cpu().numpy()) 488 | unlabeled_label.append(batch[3].detach().cpu().numpy()) 489 | unlabeled_feat = np.concatenate(unlabeled_feat, axis = 0) 490 | unlabeled_label = np.concatenate(unlabeled_label, axis = 0) 491 | unlabeled_pred = np.concatenate(unlabeled_pred, axis = 0) 492 | unlabeled_logits = np.concatenate(unlabeled_logits, axis = 0) 493 | unlabeled_conf = np.amax(unlabeled_pred, axis = 1) 494 | unlabeled_pseudo = np.argmax(unlabeled_pred, axis = 1) 495 | 496 | print("unlabeled size:", unlabeled_pred.shape, unlabeled_feat.shape, 
unlabeled_label.shape, unlabeled_conf.shape) 497 | return train_pred, train_feat, train_label, unlabeled_pred, unlabeled_feat, unlabeled_label, unlabeled_logits 498 | 499 | 500 | 501 | def evaluate(self, mode, global_step=-1): 502 | # Evaluate on the dev or test split 503 | if mode == 'test': 504 | dataset = self.test_dataset 505 | elif mode == 'dev': 506 | dataset = self.dev_dataset 507 | else: 508 | raise Exception("Only dev and test dataset available") 509 | 510 | eval_sampler = SequentialSampler(dataset) 511 | eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size) 512 | 513 | # Eval! 514 | logger.info("***** Running evaluation on %s dataset *****", mode) 515 | logger.info(" Num examples = %d", len(dataset)) 516 | # logger.info(" Batch size = %d", self.args.batch_size) 517 | eval_loss = 0.0 518 | nb_eval_steps = 0 519 | preds = None 520 | out_label_ids = None 521 | self.model.eval() 522 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 523 | batch = tuple(t.to(self.device) for t in batch) 524 | with torch.no_grad(): 525 | inputs = { 526 | 'input_ids': batch[0], 527 | 'attention_mask': batch[1], 528 | 'token_type_ids': batch[2], 529 | 'labels': batch[3], 530 | } 531 | outputs = self.model(**inputs) 532 | tmp_eval_loss, logits = outputs[:2] 533 | 534 | eval_loss += tmp_eval_loss.mean().item() 535 | nb_eval_steps += 1 536 | 537 | if preds is None: 538 | preds = logits.detach().cpu().numpy() 539 | out_label_ids = inputs['labels'].detach().cpu().numpy() 540 | else: 541 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 542 | out_label_ids = np.append( 543 | out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) 544 | 545 | eval_loss = eval_loss / nb_eval_steps 546 | results = { 547 | "loss": eval_loss 548 | } 549 | preds = np.argmax(preds, axis=1) 550 | result = compute_metrics(preds, out_label_ids) 551 | results.update(result) 552 | 553 | logger.info("***** Eval results: %s *****", results) 554 | 555 | return results["loss"], result["acc"] 556 | 557 | --------------------------------------------------------------------------------
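For readers who only want the gist of the region-aware query implemented in Active_sampler.get_region_entropy, the following standalone sketch reproduces the core idea with plain NumPy and scikit-learn. It is illustrative, not a drop-in replacement: it uses an unweighted sklearn KMeans instead of the weighted MiniBatchKMeans/faiss search above, a standard normalized class entropy instead of the max-normalized calc_entropy, and it omits the per-cluster topK cap; the function names (region_entropy_query, predictive_entropy) are ours.

# region_entropy_sketch.py -- simplified, illustrative version of the region-aware entropy query
import numpy as np
from sklearn.cluster import KMeans

def predictive_entropy(probs, eps=1e-12):
    # per-example entropy of an (n, n_class) matrix of predicted probabilities
    return -np.sum(probs * np.log(probs + eps), axis=-1)

def region_entropy_query(probs, feats, n_sample, n_clusters=25, sample_per_group=10, beta=0.1, seed=0):
    """Cluster the unlabeled pool, score each cluster by
    mean predictive entropy + beta * entropy of its pseudo-label histogram,
    then take the most uncertain points from the highest-scoring clusters."""
    ent = predictive_entropy(probs)
    pseudo = probs.argmax(axis=-1)
    cluster_id = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10).fit_predict(feats)

    scores, members = [], []
    for c in range(n_clusters):
        idx = np.where(cluster_id == c)[0]
        if len(idx) == 0:
            scores.append(-np.inf)
            members.append(idx)
            continue
        counts = np.bincount(pseudo[idx], minlength=probs.shape[1]).astype(float)
        dist = counts[counts > 0] / counts.sum()
        class_ent = -np.sum(dist * np.log(dist))   # diversity of pseudo labels inside the cluster
        scores.append(ent[idx].mean() + beta * class_ent)
        members.append(idx[np.argsort(ent[idx])])  # cluster members, sorted by ascending entropy

    picked, remaining = [], n_sample
    for c in np.argsort(scores)[::-1]:             # visit highest-scoring clusters first
        take = min(remaining, sample_per_group, len(members[c]))
        if take > 0:
            picked.extend(members[c][-take:])      # most uncertain points of this cluster
            remaining -= take
        if remaining <= 0:
            break
    return np.array(picked[:n_sample])

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    logits = rng.normal(size=(1000, 4))
    probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
    feats = rng.normal(size=(1000, 32))
    print(region_entropy_query(probs, feats, n_sample=50))

The same scoring idea (mean uncertainty plus a beta-weighted class-diversity term) is what get_region_cal applies as well, except that it replaces plain predictive entropy with the contrastive KL score against nearest labeled neighbors.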