├── CV
├── MAE
│ ├── README.md
│ ├── engine_finetune.py
│ ├── engine_pretrain.py
│ ├── exp_results
│ │ └── MAE
│ │ │ ├── base
│ │ │ ├── log_base_ft.txt
│ │ │ └── log_base_pretrain.txt
│ │ │ └── large
│ │ │ ├── log_large_ft.txt
│ │ │ └── log_large_pretrain.txt
│ ├── main_finetune.py
│ ├── main_linprobe.py
│ ├── main_pretrain.py
│ ├── models_mae.py
│ ├── models_vit.py
│ └── util
│ │ ├── crop.py
│ │ ├── datasets.py
│ │ ├── lars.py
│ │ ├── lr_decay.py
│ │ ├── lr_sched.py
│ │ ├── misc.py
│ │ └── pos_embed.py
└── timm
│ ├── README.md
│ ├── exp_results
│ ├── ConvNext
│ │ └── small
│ │ │ ├── args_cvnext_150.yaml
│ │ │ ├── args_cvnext_300.yaml
│ │ │ ├── summary_cvnext_150.csv
│ │ │ └── summary_cvnext_300.csv
│ ├── ResNet
│ │ ├── Res101
│ │ │ ├── args_res101_100.yaml
│ │ │ ├── args_res101_200.yaml
│ │ │ ├── args_res101_300.yaml
│ │ │ ├── summary_res101_100.csv
│ │ │ ├── summary_res101_200.csv
│ │ │ ├── summary_res101_300.csv
│ │ │ └── summary_res50_200.csv
│ │ └── Res50
│ │ │ ├── args_res50_100.yaml
│ │ │ ├── args_res50_200.yaml
│ │ │ ├── args_res50_300.yaml
│ │ │ ├── summary_res50_100.csv
│ │ │ ├── summary_res50_200.csv
│ │ │ └── summary_res50_300.csv
│ └── ViT
│ │ ├── base
│ │ ├── args_vit-B_150.yaml
│ │ ├── args_vit-B_300.yaml
│ │ ├── args_vit-B_300_T.yaml
│ │ ├── summary_vit-B_150.csv
│ │ ├── summary_vit-B_300.csv
│ │ └── summary_vit-B_300_T.csv
│ │ └── small
│ │ ├── args_vit-s_150-I.yaml
│ │ ├── args_vit-s_150.yaml
│ │ ├── args_vit-s_300-I.yaml
│ │ ├── args_vit-s_300.yaml
│ │ ├── summary_vit-s_150-I.csv
│ │ ├── summary_vit-s_150.csv
│ │ ├── summary_vit-s_300-I.csv
│ │ └── summary_vit-s_300.csv
│ ├── optim_factory.py
│ ├── sam.py
│ ├── supervised.md
│ └── train.py
├── LICENSE
├── NLP
├── BERT
│ ├── README.md
│ ├── adan.py
│ ├── config
│ │ ├── finetuning
│ │ │ ├── acc_test.py
│ │ │ ├── cola-adan.yaml
│ │ │ ├── cola.yaml
│ │ │ ├── mnli-adan.yaml
│ │ │ ├── mnli.yaml
│ │ │ ├── qnli-adan.yaml
│ │ │ ├── qnli.yaml
│ │ │ ├── qqp-adan.yaml
│ │ │ ├── qqp.yaml
│ │ │ ├── rte-adan.yaml
│ │ │ ├── rte.yaml
│ │ │ ├── sst_2-adan.yaml
│ │ │ ├── sst_2.yaml
│ │ │ ├── sts_b-adan.yaml
│ │ │ └── sts_b.yaml
│ │ └── pretraining
│ │ │ ├── base.yaml
│ │ │ ├── bert-adan.yaml
│ │ │ └── bert-base.yaml
│ └── exp_results
│ │ └── pretrain
│ │ ├── full_config-adam.yaml
│ │ ├── full_config-adan.yaml
│ │ ├── hydra_train-adam.log
│ │ ├── hydra_train-adan-2.log
│ │ └── hydra_train-adan.log
└── Transformer-XL
│ ├── README.md
│ ├── data_utils.py
│ ├── eval.py
│ ├── exp_results
│ ├── log-100k.txt
│ ├── log-200k.txt
│ ├── log-50k.txt
│ └── log-adam.txt
│ ├── mem_transformer.py
│ ├── run_wt103_adan.sh
│ ├── train.py
│ └── utils
│ ├── adaptive_softmax.py
│ ├── data_parallel.py
│ ├── exp_utils.py
│ ├── log_uniform_sampler.py
│ ├── proj_adaptive_softmax.py
│ └── vocabulary.py
├── README.md
├── adan.py
├── dreamfusion
└── README.md
├── fused_adan
├── README.md
├── fused_adan_kernel.cu
├── include
│ ├── fused_adan_kernel.cuh
│ ├── multi_tensor_apply.cuh
│ └── type_shim.h
├── multi_tensor_adan_kernel.cu
└── pybind_adan.cpp
├── gpt2
├── README.md
├── checkpoints
│ └── gpt2-adan
│ │ ├── gpt_args
│ │ └── tokenizer
│ │ ├── merges.txt
│ │ └── vocab.json
├── download_dataset.py
└── pretrain.sh
└── setup.py

/CV/MAE/README.md:
--------------------------------------------------------------------------------
1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
2 | 
3 | We provide instructions for modifying the official training and fine-tuning files used in [MAE](https://github.com/facebookresearch/mae) so that you can use
Adan to train MAE. **Please follow the MAE instructions to install the necessary packages.**
4 | 
5 | 
6 | 
7 | ## Environment
8 | 
9 | Our experiments for this task are based on the following package versions.
10 | 
11 | ```python
12 | torch.__version__ = '1.7.1+cu110'
13 | torchvision.__version__ = '0.8.2+cu110'
14 | timm.__version__ = '0.4.5'
15 | torchaudio.__version__ = '0.7.2'
16 | ```
17 | If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:mae](https://hub.docker.com/repository/docker/xyxie/adan-image).
18 | 
19 | 
20 | 
21 | ## Usage of Adan for MAE
22 | 
23 | ### Two steps to use Adan
24 | 
25 | **Step 1.** Add the following arguments to `main_pretrain.py` and `main_finetune.py`.
26 | 
27 | ```python
28 | parser.add_argument('--use-adan', action='store_true', default=False, help='whether to use Adan')
29 | parser.add_argument('--max-grad-norm', type=float, default=0.0, help='if the l2 norm of the gradient is larger than this hyper-parameter, then we clip the gradient (default: 0.0, no gradient clipping)')
30 | parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', help='optimizer epsilon to avoid the bad case where the second-order moment is zero (default: None, use the opt default 1e-8 in Adan)')
31 | parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='optimizer betas in Adan (default: None, use the opt default [0.98, 0.92, 0.99] in Adan)')
32 | ```
33 | 
34 | * `use-adan`: whether to use Adan. The default optimizer is AdamW.
35 | 
36 | * `max-grad-norm`: the threshold for gradient clipping; the default `0.0` disables clipping.
37 | 
38 | * `opt-eps`: optimizer epsilon to avoid the bad case where the second-order moment is zero.
39 | 
40 | * `opt-betas`: optimizer betas for Adan.
41 | 
42 | 
43 | 
44 | **Step 2.** Create the Adan optimizer as follows. In this step, you can directly replace the vanilla optimizer creation:
45 | 
46 | ```python
47 | # following timm: set wd as 0 for bias and norm layers
48 | param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay)
49 | if args.use_adan:
50 |     if args.bias_decay:  # assumes a --bias-decay flag is also added (see the timm README)
51 |         param = model_without_ddp.parameters()
52 |     else:
53 |         param = param_groups
54 |         args.weight_decay = 0.0
55 |     optimizer = Adan(param, weight_decay=args.weight_decay,
56 |                      lr=args.lr, betas=args.opt_betas,
57 |                      eps=args.opt_eps, max_grad_norm=args.max_grad_norm)
58 | else:
59 |     optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
60 | ```
61 | 
62 | 
63 | 
64 | ## MAE Pre-training
65 | 
66 | ```shell
67 | python main_pretrain.py \
68 |     --batch_size ${BS} --accum_iter 1 \
69 |     --model ${MODEL_NAME} --norm_pix_loss --mask_ratio 0.75 \
70 |     --epochs 800 \
71 |     --lr ${LR} --weight_decay 0.02 --warmup_epochs ${WR_EPOCH} \
72 |     --min_lr ${MIN_LR} \
73 |     --opt-betas 0.98 0.92 0.90 --opt-eps 1e-8 --max-grad-norm 10.0 \
74 |     --use-adan \
75 |     --data_path ${IMAGENET_DIR} \
76 |     --output_dir ${OUT_DIR}
77 | ```
78 | 
79 | - The pre-training file `main_pretrain.py` comes from [MAE](https://github.com/facebookresearch/mae).
80 | - We use **16** A100 GPUs for MAE-Base and **32** A100 GPUs for MAE-Large.
81 | - There are some differences in the hyper-parameters for MAE-Base and MAE-Large; a filled-in MAE-Base example follows the table:
82 | 
83 | |           | MODEL_NAME            | LR     | BS   | MIN_LR | WR_EPOCH |
84 | | :-------: | :-------------------: | :----: | :--: | :----: | :------: |
85 | | MAE-Base  | mae_vit_base_patch16  | 2.0e-3 | 256  | 1e-8   | 40       |
86 | | MAE-Large | mae_vit_large_patch16 | 2.2e-3 | 128  | 1e-4   | 80       |
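For concreteness, here is the same command with the MAE-Base row of the table substituted in; the dataset and output paths are placeholders you should adapt:

```shell
python main_pretrain.py \
    --batch_size 256 --accum_iter 1 \
    --model mae_vit_base_patch16 --norm_pix_loss --mask_ratio 0.75 \
    --epochs 800 \
    --lr 2.0e-3 --weight_decay 0.02 --warmup_epochs 40 \
    --min_lr 1e-8 \
    --opt-betas 0.98 0.92 0.90 --opt-eps 1e-8 --max-grad-norm 10.0 \
    --use-adan \
    --data_path /path/to/imagenet \
    --output_dir ./output/mae_base_pretrain
```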
87 | 
88 | 
89 | 
90 | ## MAE Fine-tuning
91 | 
92 | ```shell
93 | python main_finetune.py \
94 |     --accum_iter 1 \
95 |     --batch_size ${BS} \
96 |     --model ${MODEL_NAME} \
97 |     --finetune ${PATH_TO_PRETRAINED_MODEL} \
98 |     --epochs ${EPOCH} \
99 |     --lr 1.5e-2 --layer_decay ${LAYER_DECAY} \
100 |     --min_lr ${MIN_LR} \
101 |     --opt-betas 0.98 0.92 0.99 \
102 |     --opt-eps 1e-8 --max-grad-norm 0 \
103 |     --use-adan --warmup_epochs ${WR_EPOCH} \
104 |     --weight_decay ${WD} --drop_path ${DROP_PATH} \
105 |     --mixup 0.8 --cutmix 1.0 --reprob 0.25 \
106 |     --dist_eval --data_path ${IMAGENET_DIR}
107 | ```
108 | 
109 | - The fine-tuning file `main_finetune.py` comes from [MAE](https://github.com/facebookresearch/mae).
110 | - We use **16** A100 GPUs for MAE-Base and **32** A100 GPUs for MAE-Large.
111 | - There are some differences in the hyper-parameters for MAE-Base and MAE-Large:
112 | 
113 | |           | MODEL_NAME        | EPOCH | MIN_LR | BS   | LAYER_DECAY | WR_EPOCH | WD   | DROP_PATH |
114 | | :-------: | :---------------: | :---: | :----: | :--: | :---------: | :------: | :--: | :-------: |
115 | | MAE-Base  | vit_base_patch16  | 100   | 1e-6   | 128  | 0.65        | 40       | 5e-3 | 0.1       |
116 | | MAE-Large | vit_large_patch16 | 50    | 1e-5   | 64   | 0.75        | 10       | 1e-3 | 0.2       |
117 | 
118 | 
119 | 
120 | ## Results and Logs
121 | 
122 | |                | MAE-Base | MAE-Large |
123 | | :------------: | :------: | :-------: |
124 | | Top-1 Acc. (%) | 83.8     | 85.9      |
125 | | download       | [log-pretrain](./exp_results/MAE/base/log_base_pretrain.txt)/[log-finetune](./exp_results/MAE/base/log_base_ft.txt)/model | [log-pretrain](./exp_results/MAE/large/log_large_pretrain.txt)/[log-finetune](./exp_results/MAE/large/log_large_ft.txt)/model |
126 | 
--------------------------------------------------------------------------------
/CV/MAE/engine_finetune.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | # -------------------------------------------------------- 7 | # References: 8 | # DeiT: https://github.com/facebookresearch/deit 9 | # BEiT: https://github.com/microsoft/unilm/tree/master/beit 10 | # -------------------------------------------------------- 11 | 12 | import math 13 | import sys 14 | from typing import Iterable, Optional 15 | 16 | import torch 17 | 18 | from timm.data import Mixup 19 | from timm.utils import accuracy 20 | 21 | import util.misc as misc 22 | import util.lr_sched as lr_sched 23 | 24 | 25 | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, 26 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 27 | device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, 28 | mixup_fn: Optional[Mixup] = None, log_writer=None, 29 | args=None): 30 | model.train(True) 31 | metric_logger = misc.MetricLogger(delimiter=" ") 32 | metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) 33 | header = 'Epoch: [{}]'.format(epoch) 34 | print_freq = 20 35 | 36 | accum_iter = args.accum_iter 37 | 38 | optimizer.zero_grad() 39 | 40 | if log_writer is not None: 41 | print('log_dir: {}'.format(log_writer.log_dir)) 42 | 43 | for data_iter_step, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): 44 | 45 | # we use a per iteration (instead of per epoch) lr scheduler 46 | if data_iter_step % accum_iter == 0: 47 | lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) 48 | 49 | samples = samples.to(device, non_blocking=True) 50 | targets = targets.to(device, non_blocking=True) 51 | 52 | if mixup_fn is not None: 53 | samples, targets = mixup_fn(samples, targets) 54 | 55 | with torch.cuda.amp.autocast(): 56 | outputs = model(samples) 57 | loss = criterion(outputs, targets) 58 | 59 | loss_value = loss.item() 60 | 61 | 62 | 63 | loss /= accum_iter 64 | loss_scaler(loss, optimizer, clip_grad=max_norm, 65 | parameters=model.parameters(), create_graph=False, 66 | update_grad=(data_iter_step + 1) % accum_iter == 0) 67 | if (data_iter_step + 1) % accum_iter == 0: 68 | optimizer.zero_grad() 69 | 70 | torch.cuda.synchronize() 71 | 72 | metric_logger.update(loss=loss_value) 73 | min_lr = 10. 74 | max_lr = 0. 75 | for group in optimizer.param_groups: 76 | min_lr = min(min_lr, group["lr"]) 77 | max_lr = max(max_lr, group["lr"]) 78 | 79 | metric_logger.update(lr=max_lr) 80 | 81 | loss_value_reduce = misc.all_reduce_mean(loss_value) 82 | if not math.isfinite(loss_value_reduce): 83 | print("Loss is {}, stopping training".format(loss_value_reduce)) 84 | sys.exit(1) 85 | if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: 86 | """ We use epoch_1000x as the x-axis in tensorboard. 87 | This calibrates different curves when batch size changes. 
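For example, one epoch always spans 1,000 units on this axis, regardless of how many iterations it contains.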
88 | """ 89 | epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) 90 | log_writer.add_scalar('loss', loss_value_reduce, epoch_1000x) 91 | log_writer.add_scalar('lr', max_lr, epoch_1000x) 92 | 93 | # gather the stats from all processes 94 | metric_logger.synchronize_between_processes() 95 | print("Averaged stats:", metric_logger) 96 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 97 | 98 | 99 | @torch.no_grad() 100 | def evaluate(data_loader, model, device): 101 | criterion = torch.nn.CrossEntropyLoss() 102 | 103 | metric_logger = misc.MetricLogger(delimiter=" ") 104 | header = 'Test:' 105 | 106 | # switch to evaluation mode 107 | model.eval() 108 | 109 | for batch in metric_logger.log_every(data_loader, 10, header): 110 | images = batch[0] 111 | target = batch[-1] 112 | images = images.to(device, non_blocking=True) 113 | target = target.to(device, non_blocking=True) 114 | 115 | # compute output 116 | with torch.cuda.amp.autocast(): 117 | output = model(images) 118 | loss = criterion(output, target) 119 | 120 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 121 | 122 | batch_size = images.shape[0] 123 | metric_logger.update(loss=loss.item()) 124 | metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) 125 | metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) 126 | # gather the stats from all processes 127 | metric_logger.synchronize_between_processes() 128 | print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' 129 | .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) 130 | 131 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} -------------------------------------------------------------------------------- /CV/MAE/engine_pretrain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # -------------------------------------------------------- 7 | # References: 8 | # DeiT: https://github.com/facebookresearch/deit 9 | # BEiT: https://github.com/microsoft/unilm/tree/master/beit 10 | # -------------------------------------------------------- 11 | import math 12 | import sys 13 | from typing import Iterable 14 | 15 | import torch 16 | 17 | import util.misc as misc 18 | import util.lr_sched as lr_sched 19 | 20 | 21 | def train_one_epoch(model: torch.nn.Module, 22 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 23 | device: torch.device, epoch: int, loss_scaler, 24 | log_writer=None, 25 | args=None): 26 | model.train(True) 27 | metric_logger = misc.MetricLogger(delimiter=" ") 28 | metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) 29 | header = 'Epoch: [{}]'.format(epoch) 30 | print_freq = 20 31 | 32 | accum_iter = args.accum_iter 33 | 34 | optimizer.zero_grad() 35 | 36 | if log_writer is not None: 37 | print('log_dir: {}'.format(log_writer.log_dir)) 38 | 39 | for data_iter_step, (samples, _) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): 40 | 41 | # we use a per iteration (instead of per epoch) lr scheduler 42 | if data_iter_step % accum_iter == 0: 43 | lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) 44 | 45 | samples = samples.to(device, non_blocking=True) 46 | 47 | with torch.cuda.amp.autocast(): 48 | loss, _, _ = model(samples, mask_ratio=args.mask_ratio) 49 | 50 | loss_value = loss.item() 51 | 52 | 53 | 54 | loss /= accum_iter 55 | loss_scaler(loss, optimizer, parameters=model.parameters(), 56 | update_grad=(data_iter_step + 1) % accum_iter == 0) 57 | if (data_iter_step + 1) % accum_iter == 0: 58 | optimizer.zero_grad() 59 | 60 | torch.cuda.synchronize() 61 | 62 | metric_logger.update(loss=loss_value) 63 | 64 | lr = optimizer.param_groups[0]["lr"] 65 | metric_logger.update(lr=lr) 66 | 67 | loss_value_reduce = misc.all_reduce_mean(loss_value) 68 | if not math.isfinite(loss_value_reduce): 69 | print("Loss is {}, stopping training".format(loss_value_reduce)) 70 | sys.exit(1) 71 | if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: 72 | """ We use epoch_1000x as the x-axis in tensorboard. 73 | This calibrates different curves when batch size changes. 
74 | """ 75 | epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) 76 | log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) 77 | log_writer.add_scalar('lr', lr, epoch_1000x) 78 | 79 | 80 | # gather the stats from all processes 81 | metric_logger.synchronize_between_processes() 82 | print("Averaged stats:", metric_logger) 83 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} -------------------------------------------------------------------------------- /CV/MAE/exp_results/MAE/large/log_large_ft.txt: -------------------------------------------------------------------------------- 1 | {"train_lr": 0.0007476019200000001, "train_loss": 5.9094133159518245, "test_loss": 1.7714076134562493, "test_acc1": 61.33637235611582, "test_acc5": 84.77687142609177, "epoch": 0, "n_parameters": 304326632} 2 | {"train_lr": 0.0022476019200000003, "train_loss": 4.501337738275528, "test_loss": 1.1959131537377834, "test_acc1": 72.27087332465598, "test_acc5": 91.66066860084875, "epoch": 1, "n_parameters": 304326632} 3 | {"train_lr": 0.0037476019200000004, "train_loss": 4.119643689954281, "test_loss": 1.0854404755681752, "test_acc1": 75.52783110144804, "test_acc5": 93.39011516131733, "epoch": 2, "n_parameters": 304326632} 4 | {"train_lr": 0.005247601920000002, "train_loss": 3.9008864871740343, "test_loss": 1.0289268112555146, "test_acc1": 76.92938261289896, "test_acc5": 94.09788868386092, "epoch": 3, "n_parameters": 304326632} 5 | {"train_lr": 0.006747601919999998, "train_loss": 3.76051225707531, "test_loss": 0.9720380315184594, "test_acc1": 78.21497122713639, "test_acc5": 94.63371721293326, "epoch": 4, "n_parameters": 304326632} 6 | {"train_lr": 0.00824760192, "train_loss": 3.651956864875555, "test_loss": 0.9415295435115695, "test_acc1": 78.97672746285214, "test_acc5": 95.09756876746584, "epoch": 5, "n_parameters": 304326632} 7 | {"train_lr": 0.009747601920000001, "train_loss": 3.5677191224038602, "test_loss": 0.9388785093277693, "test_acc1": 79.57453616627957, "test_acc5": 95.29950415058465, "epoch": 6, "n_parameters": 304326632} 8 | {"train_lr": 0.011247601919999997, "train_loss": 3.507449230492115, "test_loss": 0.9052619117870927, "test_acc1": 80.08437302847818, "test_acc5": 95.49944016815986, "epoch": 7, "n_parameters": 304326632} 9 | {"train_lr": 0.012747601919999994, "train_loss": 3.4423172294437885, "test_loss": 0.8388488055765628, "test_acc1": 80.4342610673575, "test_acc5": 95.76935380052772, "epoch": 8, "n_parameters": 304326632} 10 | {"train_lr": 0.014247601920000002, "train_loss": 3.3948125799477102, "test_loss": 0.8529021150618792, "test_acc1": 80.73616445743343, "test_acc5": 95.86732244598355, "epoch": 9, "n_parameters": 304326632} 11 | {"train_lr": 0.01499233375709719, "train_loss": 3.342990658354759, "test_loss": 0.8151264287903905, "test_acc1": 81.03206976010719, "test_acc5": 95.96529109723585, "epoch": 10, "n_parameters": 304326632} 12 | {"train_lr": 0.014946245730243689, "train_loss": 3.288912183743715, "test_loss": 0.8095201044529676, "test_acc1": 81.51191621381963, "test_acc5": 96.16522712243801, "epoch": 11, "n_parameters": 304326632} 13 | {"train_lr": 0.01485427994899793, "train_loss": 3.238141927015781, "test_loss": 0.7871933653950691, "test_acc1": 82.07973450799821, "test_acc5": 96.36716250067556, "epoch": 12, "n_parameters": 304326632} 14 | {"train_lr": 0.014717003412983015, "train_loss": 3.1956452232837678, "test_loss": 0.7688306730240584, "test_acc1": 82.2496801315022, "test_acc5": 96.52111323888074, "epoch": 13, "n_parameters": 
304326632} 15 | {"train_lr": 0.014535262477692571, "train_loss": 3.1652532088041307, "test_loss": 0.7522821754962206, "test_acc1": 82.66154833756725, "test_acc5": 96.58309339943104, "epoch": 14, "n_parameters": 304326632} 16 | {"train_lr": 0.014310177636427614, "train_loss": 3.121457608240843, "test_loss": 0.7477796772867441, "test_acc1": 82.73952337029799, "test_acc5": 96.67906269169892, "epoch": 15, "n_parameters": 304326632} 17 | {"train_lr": 0.014043136612082945, "train_loss": 3.0966577651739122, "test_loss": 0.753467806391418, "test_acc1": 82.9974408353359, "test_acc5": 96.78502878132953, "epoch": 16, "n_parameters": 304326632} 18 | {"train_lr": 0.013735785801373714, "train_loss": 3.0689808761537076, "test_loss": 0.7341048694401979, "test_acc1": 83.14339413813727, "test_acc5": 96.79302621802991, "epoch": 17, "n_parameters": 304326632} 19 | {"train_lr": 0.01339002012425247, "train_loss": 3.029768516147137, "test_loss": 0.725501059666276, "test_acc1": 83.34532951271389, "test_acc5": 96.81301982526358, "epoch": 18, "n_parameters": 304326632} 20 | {"train_lr": 0.01300797134109743, "train_loss": 3.0120413874208927, "test_loss": 0.7309531949833036, "test_acc1": 83.50927706414586, "test_acc5": 97.00095968694924, "epoch": 19, "n_parameters": 304326632} 21 | {"train_lr": 0.012591994909700855, "train_loss": 2.9821670488238334, "test_loss": 0.7118158831447363, "test_acc1": 83.61924186945724, "test_acc5": 97.01895393230026, "epoch": 20, "n_parameters": 304326632} 22 | {"train_lr": 0.012144655463088535, "train_loss": 2.962305991309881, "test_loss": 0.7047568802535534, "test_acc1": 83.74520156128774, "test_acc5": 97.07493601513458, "epoch": 21, "n_parameters": 304326632} 23 | {"train_lr": 0.011668710997704269, "train_loss": 2.938569626682997, "test_loss": 0.7103257965296507, "test_acc1": 83.9051503784292, "test_acc5": 97.10092768666078, "epoch": 22, "n_parameters": 304326632} 24 | {"train_lr": 0.01116709586944475, "train_loss": 2.91352473244071, "test_loss": 0.7010805677436293, "test_acc1": 84.26103648877037, "test_acc5": 97.12492002376135, "epoch": 23, "n_parameters": 304326632} 25 | {"train_lr": 0.010642902702379645, "train_loss": 2.8938853970646856, "test_loss": 0.692104572802782, "test_acc1": 84.34101090580701, "test_acc5": 97.2508797091852, "epoch": 24, "n_parameters": 304326632} 26 | {"train_lr": 0.010099363321695844, "train_loss": 2.874984144228697, "test_loss": 0.6802691061235965, "test_acc1": 84.30902114603967, "test_acc5": 97.22488802759142, "epoch": 25, "n_parameters": 304326632} 27 | {"train_lr": 0.009539828828420426, "train_loss": 2.852267661267519, "test_loss": 0.6850866706669331, "test_acc1": 84.41898594143599, "test_acc5": 97.29486562941827, "epoch": 26, "n_parameters": 304326632} 28 | {"train_lr": 0.00896774893876856, "train_loss": 2.837763201504946, "test_loss": 0.6828102863952518, "test_acc1": 84.65091173876118, "test_acc5": 97.3268554027616, "epoch": 27, "n_parameters": 304326632} 29 | {"train_lr": 0.008386650715495802, "train_loss": 2.81947190862298, "test_loss": 0.6762189302407206, "test_acc1": 84.7188899800782, "test_acc5": 97.34884836501368, "epoch": 28, "n_parameters": 304326632} 30 | {"train_lr": 0.00780011682238341, "train_loss": 2.8003201848089696, "test_loss": 0.6725861196033657, "test_acc1": 84.82285671179217, "test_acc5": 97.32285668235212, "epoch": 29, "n_parameters": 304326632} 31 | {"train_lr": 0.007211763435924688, "train_loss": 2.7866385659873485, "test_loss": 0.671936163790524, "test_acc1": 84.95481448881304, "test_acc5": 97.38883556682028, "epoch": 30, 
"n_parameters": 304326632} 32 | {"train_lr": 0.006625217950394574, "train_loss": 2.7746526652514936, "test_loss": 0.6678782022558153, "test_acc1": 84.89283432917799, "test_acc5": 97.4168266078561, "epoch": 31, "n_parameters": 304326632} 33 | {"train_lr": 0.006044096613757472, "train_loss": 2.7576689450562, "test_loss": 0.6610171441733838, "test_acc1": 85.12675947130145, "test_acc5": 97.46681061770316, "epoch": 32, "n_parameters": 304326632} 34 | {"train_lr": 0.00547198223229625, "train_loss": 2.7347684874773024, "test_loss": 0.6683760618418455, "test_acc1": 85.13675626942688, "test_acc5": 97.39683300702906, "epoch": 33, "n_parameters": 304326632} 35 | {"train_lr": 0.004912402081419917, "train_loss": 2.723790532976389, "test_loss": 0.6556776543706655, "test_acc1": 85.26271595713884, "test_acc5": 97.47680741643875, "epoch": 34, "n_parameters": 304326632} 36 | {"train_lr": 0.004368806158837928, "train_loss": 2.7088236126720906, "test_loss": 0.654360967874527, "test_acc1": 85.24072299839516, "test_acc5": 97.47280869541913, "epoch": 35, "n_parameters": 304326632} 37 | {"train_lr": 0.003844545914176986, "train_loss": 2.694744017738104, "test_loss": 0.6538684133067727, "test_acc1": 85.33869164430858, "test_acc5": 97.53278950156115, "epoch": 36, "n_parameters": 304326632} 38 | {"train_lr": 0.0033428535861796433, "train_loss": 2.6908254801392557, "test_loss": 0.6542927216365934, "test_acc1": 85.39467373049877, "test_acc5": 97.53878758217536, "epoch": 37, "n_parameters": 304326632} 39 | {"train_lr": 0.002866822274877639, "train_loss": 2.671278304463625, "test_loss": 0.6524978142604232, "test_acc1": 85.49464174439643, "test_acc5": 97.49280229456822, "epoch": 38, "n_parameters": 304326632} 40 | {"train_lr": 0.0024193868716016085, "train_loss": 2.657200170958042, "test_loss": 0.650126696806401, "test_acc1": 85.59660910492285, "test_acc5": 97.52879077825345, "epoch": 39, "n_parameters": 304326632} 41 | {"train_lr": 0.0020033059644001382, "train_loss": 2.652334677708149, "test_loss": 0.6520910476334393, "test_acc1": 85.52263278619495, "test_acc5": 97.50279909849014, "epoch": 40, "n_parameters": 304326632} 42 | {"train_lr": 0.001621144830427048, "train_loss": 2.6431411161601543, "test_loss": 0.647436778191477, "test_acc1": 85.6365963131361, "test_acc5": 97.54478566339972, "epoch": 41, "n_parameters": 304326632} 43 | {"train_lr": 0.0012752596201547688, "train_loss": 2.637372990643978, "test_loss": 0.6462450991012156, "test_acc1": 85.61260398945898, "test_acc5": 97.54678502360446, "epoch": 42, "n_parameters": 304326632} 44 | {"train_lr": 0.0009677828309231273, "train_loss": 2.6305615900933743, "test_loss": 0.6458461854793132, "test_acc1": 85.75455856750352, "test_acc5": 97.53878758278552, "epoch": 43, "n_parameters": 304326632} 45 | {"train_lr": 0.0007006101593841485, "train_loss": 2.627352162593603, "test_loss": 0.6431183713674545, "test_acc1": 85.75455856231719, "test_acc5": 97.5627799058525, "epoch": 44, "n_parameters": 304326632} 46 | {"train_lr": 0.0004753888139017931, "train_loss": 2.6245033386409284, "test_loss": 0.6450332224182784, "test_acc1": 85.80654192580981, "test_acc5": 97.56877798524638, "epoch": 45, "n_parameters": 304326632} 47 | {"train_lr": 0.0002935073589646598, "train_loss": 2.6220774190187455, "test_loss": 0.6432638500258326, "test_acc1": 85.85252721585758, "test_acc5": 97.56078054442744, "epoch": 46, "n_parameters": 304326632} 48 | {"train_lr": 0.00015608715422415792, "train_loss": 2.611486408829689, "test_loss": 0.6422065225988627, "test_acc1": 85.82453617009305, "test_acc5": 
97.57077734545112, "epoch": 47, "n_parameters": 304326632} 49 | {"train_lr": 6.397544093936805e-05, "train_loss": 2.6108330062150955, "test_loss": 0.6433782994002104, "test_acc1": 85.822536808668, "test_acc5": 97.57677542606532, "epoch": 48, "n_parameters": 304326632} 50 | {"train_lr": 1.7740118452942777e-05, "train_loss": 2.6155946560740473, "test_loss": 0.6427758732996881, "test_acc1": 85.822536808668, "test_acc5": 97.5807741464748, "epoch": 49, "n_parameters": 304326632} 51 | -------------------------------------------------------------------------------- /CV/MAE/models_vit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # References: 8 | # timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm 9 | # DeiT: https://github.com/facebookresearch/deit 10 | # -------------------------------------------------------- 11 | 12 | from functools import partial 13 | 14 | import torch 15 | import torch.nn as nn 16 | 17 | import timm.models.vision_transformer 18 | 19 | 20 | class VisionTransformer(timm.models.vision_transformer.VisionTransformer): 21 | """ Vision Transformer with support for global average pooling 22 | """ 23 | def __init__(self, global_pool=False, **kwargs): 24 | super(VisionTransformer, self).__init__(**kwargs) 25 | 26 | self.global_pool = global_pool 27 | if self.global_pool: 28 | norm_layer = kwargs['norm_layer'] 29 | embed_dim = kwargs['embed_dim'] 30 | self.fc_norm = norm_layer(embed_dim) 31 | 32 | del self.norm # remove the original norm 33 | 34 | def forward_features(self, x): 35 | B = x.shape[0] 36 | x = self.patch_embed(x) 37 | 38 | cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks 39 | x = torch.cat((cls_tokens, x), dim=1) 40 | x = x + self.pos_embed 41 | x = self.pos_drop(x) 42 | 43 | for blk in self.blocks: 44 | x = blk(x) 45 | 46 | if self.global_pool: 47 | x = x[:, 1:, :].mean(dim=1) # global pool without cls token 48 | outcome = self.fc_norm(x) 49 | else: 50 | x = self.norm(x) 51 | outcome = x[:, 0] 52 | 53 | return outcome 54 | 55 | 56 | def vit_base_patch16(**kwargs): 57 | model = VisionTransformer( 58 | patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 59 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 60 | return model 61 | 62 | 63 | def vit_large_patch16(**kwargs): 64 | model = VisionTransformer( 65 | patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, 66 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 67 | return model 68 | 69 | 70 | def vit_huge_patch14(**kwargs): 71 | model = VisionTransformer( 72 | patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, 73 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 74 | return model -------------------------------------------------------------------------------- /CV/MAE/util/crop.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import math 8 | 9 | import torch 10 | 11 | from torchvision import transforms 12 | from torchvision.transforms import functional as F 13 | 14 | 15 | class RandomResizedCrop(transforms.RandomResizedCrop): 16 | """ 17 | RandomResizedCrop for matching TF/TPU implementation: no for-loop is used. 18 | This may lead to results different with torchvision's version. 19 | Following BYOL's TF code: 20 | https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206 21 | """ 22 | @staticmethod 23 | def get_params(img, scale, ratio): 24 | width, height = F._get_image_size(img) 25 | area = height * width 26 | 27 | target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item() 28 | log_ratio = torch.log(torch.tensor(ratio)) 29 | aspect_ratio = torch.exp( 30 | torch.empty(1).uniform_(log_ratio[0], log_ratio[1]) 31 | ).item() 32 | 33 | w = int(round(math.sqrt(target_area * aspect_ratio))) 34 | h = int(round(math.sqrt(target_area / aspect_ratio))) 35 | 36 | w = min(w, width) 37 | h = min(h, height) 38 | 39 | i = torch.randint(0, height - h + 1, size=(1,)).item() 40 | j = torch.randint(0, width - w + 1, size=(1,)).item() 41 | 42 | return i, j, h, w -------------------------------------------------------------------------------- /CV/MAE/util/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # References: 8 | # DeiT: https://github.com/facebookresearch/deit 9 | # -------------------------------------------------------- 10 | 11 | import os 12 | import PIL 13 | 14 | from torchvision import datasets, transforms 15 | 16 | from timm.data import create_transform 17 | from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD 18 | 19 | 20 | def build_dataset(is_train, args): 21 | transform = build_transform(is_train, args) 22 | 23 | root = os.path.join(args.data_path, 'train' if is_train else 'val') 24 | dataset = datasets.ImageFolder(root, transform=transform) 25 | 26 | print(dataset) 27 | 28 | return dataset 29 | 30 | 31 | def build_transform(is_train, args): 32 | mean = IMAGENET_DEFAULT_MEAN 33 | std = IMAGENET_DEFAULT_STD 34 | # train transform 35 | if is_train: 36 | # this should always dispatch to transforms_imagenet_train 37 | transform = create_transform( 38 | input_size=args.input_size, 39 | is_training=True, 40 | color_jitter=args.color_jitter, 41 | auto_augment=args.aa, 42 | interpolation='bicubic', 43 | re_prob=args.reprob, 44 | re_mode=args.remode, 45 | re_count=args.recount, 46 | mean=mean, 47 | std=std, 48 | ) 49 | return transform 50 | 51 | # eval transform 52 | t = [] 53 | if args.input_size <= 224: 54 | crop_pct = 224 / 256 55 | else: 56 | crop_pct = 1.0 57 | size = int(args.input_size / crop_pct) 58 | t.append( 59 | transforms.Resize(size, interpolation=PIL.Image.BICUBIC), # to maintain same ratio w.r.t. 224 images 60 | ) 61 | t.append(transforms.CenterCrop(args.input_size)) 62 | 63 | t.append(transforms.ToTensor()) 64 | t.append(transforms.Normalize(mean, std)) 65 | return transforms.Compose(t) 66 | -------------------------------------------------------------------------------- /CV/MAE/util/lars.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # LARS optimizer, implementation from MoCo v3: 8 | # https://github.com/facebookresearch/moco-v3 9 | # -------------------------------------------------------- 10 | 11 | import torch 12 | 13 | 14 | class LARS(torch.optim.Optimizer): 15 | """ 16 | LARS optimizer, no rate scaling or weight decay for parameters <= 1D. 17 | """ 18 | def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, trust_coefficient=0.001): 19 | defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, trust_coefficient=trust_coefficient) 20 | super().__init__(params, defaults) 21 | 22 | @torch.no_grad() 23 | def step(self): 24 | for g in self.param_groups: 25 | for p in g['params']: 26 | dp = p.grad 27 | 28 | if dp is None: 29 | continue 30 | 31 | if p.ndim > 1: # if not normalization gamma/beta or bias 32 | dp = dp.add(p, alpha=g['weight_decay']) 33 | param_norm = torch.norm(p) 34 | update_norm = torch.norm(dp) 35 | one = torch.ones_like(param_norm) 36 | q = torch.where(param_norm > 0., 37 | torch.where(update_norm > 0, 38 | (g['trust_coefficient'] * param_norm / update_norm), one), 39 | one) 40 | dp = dp.mul(q) 41 | 42 | param_state = self.state[p] 43 | if 'mu' not in param_state: 44 | param_state['mu'] = torch.zeros_like(p) 45 | mu = param_state['mu'] 46 | mu.mul_(g['momentum']).add_(dp) 47 | p.add_(mu, alpha=-g['lr']) -------------------------------------------------------------------------------- /CV/MAE/util/lr_decay.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # References: 8 | # ELECTRA https://github.com/google-research/electra 9 | # BEiT: https://github.com/microsoft/unilm/tree/master/beit 10 | # -------------------------------------------------------- 11 | 12 | import json 13 | 14 | 15 | def param_groups_lrd(model, weight_decay=0.05, no_weight_decay_list=[], layer_decay=.75): 16 | """ 17 | Parameter groups for layer-wise lr decay 18 | Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58 19 | """ 20 | param_group_names = {} 21 | param_groups = {} 22 | 23 | num_layers = len(model.blocks) + 1 24 | 25 | layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1)) 26 | 27 | for n, p in model.named_parameters(): 28 | if not p.requires_grad: 29 | continue 30 | 31 | # no decay: all 1D parameters and model specific ones 32 | if p.ndim == 1 or n in no_weight_decay_list: 33 | g_decay = "no_decay" 34 | this_decay = 0. 
35 | else: 36 | g_decay = "decay" 37 | this_decay = weight_decay 38 | 39 | layer_id = get_layer_id_for_vit(n, num_layers) 40 | group_name = "layer_%d_%s" % (layer_id, g_decay) 41 | 42 | if group_name not in param_group_names: 43 | this_scale = layer_scales[layer_id] 44 | 45 | param_group_names[group_name] = { 46 | "lr_scale": this_scale, 47 | "weight_decay": this_decay, 48 | "params": [], 49 | } 50 | param_groups[group_name] = { 51 | "lr_scale": this_scale, 52 | "weight_decay": this_decay, 53 | "params": [], 54 | } 55 | 56 | param_group_names[group_name]["params"].append(n) 57 | param_groups[group_name]["params"].append(p) 58 | 59 | # print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2)) 60 | 61 | return list(param_groups.values()) 62 | 63 | 64 | def get_layer_id_for_vit(name, num_layers): 65 | """ 66 | Assign a parameter with its layer id 67 | Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33 68 | """ 69 | if name in ['cls_token', 'pos_embed']: 70 | return 0 71 | elif name.startswith('patch_embed'): 72 | return 0 73 | elif name.startswith('blocks'): 74 | return int(name.split('.')[1]) + 1 75 | else: 76 | return num_layers -------------------------------------------------------------------------------- /CV/MAE/util/lr_sched.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | 9 | def adjust_learning_rate(optimizer, epoch, args): 10 | """Decay the learning rate with half-cycle cosine after warmup""" 11 | if epoch < args.warmup_epochs: 12 | lr = args.lr * epoch / args.warmup_epochs 13 | else: 14 | lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \ 15 | (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs))) 16 | for param_group in optimizer.param_groups: 17 | if "lr_scale" in param_group: 18 | param_group["lr"] = lr * param_group["lr_scale"] 19 | else: 20 | param_group["lr"] = lr 21 | return lr 22 | -------------------------------------------------------------------------------- /CV/MAE/util/pos_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # --------------------------------------------------------
7 | # Position embedding utils
8 | # --------------------------------------------------------
9 | 
10 | import numpy as np
11 | 
12 | import torch
13 | 
14 | # --------------------------------------------------------
15 | # 2D sine-cosine position embedding
16 | # References:
17 | # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
18 | # MoCo v3: https://github.com/facebookresearch/moco-v3
19 | # --------------------------------------------------------
20 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
21 |     """
22 |     grid_size: int of the grid height and width
23 |     return:
24 |     pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
25 |     """
26 |     grid_h = np.arange(grid_size, dtype=np.float32)
27 |     grid_w = np.arange(grid_size, dtype=np.float32)
28 |     grid = np.meshgrid(grid_w, grid_h)  # here w goes first
29 |     grid = np.stack(grid, axis=0)
30 | 
31 |     grid = grid.reshape([2, 1, grid_size, grid_size])
32 |     pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
33 |     if cls_token:
34 |         pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
35 |     return pos_embed
36 | 
37 | 
38 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
39 |     assert embed_dim % 2 == 0
40 | 
41 |     # use half of dimensions to encode grid_h
42 |     emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
43 |     emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
44 | 
45 |     emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
46 |     return emb
47 | 
48 | 
49 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
50 |     """
51 |     embed_dim: output dimension for each position
52 |     pos: a list of positions to be encoded: size (M,)
53 |     out: (M, D)
54 |     """
55 |     assert embed_dim % 2 == 0
56 |     omega = np.arange(embed_dim // 2, dtype=float)  # np.float was removed in NumPy 1.24; plain float keeps the original float64 behavior
57 |     omega /= embed_dim / 2.
58 |     omega = 1. / 10000**omega  # (D/2,)
59 | 
60 |     pos = pos.reshape(-1)  # (M,)
61 |     out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
62 | 
63 |     emb_sin = np.sin(out)  # (M, D/2)
64 |     emb_cos = np.cos(out)  # (M, D/2)
65 | 
66 |     emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
67 |     return emb
68 | 
69 | 
70 | # --------------------------------------------------------
71 | # Interpolate position embeddings for high-resolution
72 | # References:
73 | # DeiT: https://github.com/facebookresearch/deit
74 | # --------------------------------------------------------
75 | def interpolate_pos_embed(model, checkpoint_model):
76 |     if 'pos_embed' in checkpoint_model:
77 |         pos_embed_checkpoint = checkpoint_model['pos_embed']
78 |         embedding_size = pos_embed_checkpoint.shape[-1]
79 |         num_patches = model.patch_embed.num_patches
80 |         num_extra_tokens = model.pos_embed.shape[-2] - num_patches
81 |         # height (== width) for the checkpoint position embedding
82 |         orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
83 |         # height (== width) for the new position embedding
84 |         new_size = int(num_patches ** 0.5)
85 |         # class_token and dist_token are kept unchanged
86 |         if orig_size != new_size:
87 |             print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
88 |             extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
89 |             # only the position tokens are interpolated
90 |             pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
91 |             pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
92 |             pos_tokens = torch.nn.functional.interpolate(
93 |                 pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
94 |             pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
95 |             new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
96 |             checkpoint_model['pos_embed'] = new_pos_embed
97 | 
--------------------------------------------------------------------------------
/CV/timm/README.md:
--------------------------------------------------------------------------------
1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
2 | 
3 | For vision tasks, our implementation is based on the official [`timm`](https://github.com/rwightman/pytorch-image-models) repository. To reproduce our results, please first refer to [`timm`](https://github.com/rwightman/pytorch-image-models) and install it, then follow the two steps below to reproduce the experiments in the paper.
4 | 
5 | 
6 | 
7 | ## Environment
8 | 
9 | Our experiments for this task are based on the following package versions.
10 | 
11 | ```python
12 | torch.__version__ = '1.10.0+cu113'
13 | torchvision.__version__ = '0.11.1+cu113'
14 | timm.__version__ = '0.6.1'
15 | torchaudio.__version__ = '0.10.0+cu113'
16 | ```
17 | 
18 | Note that our timm is a development version. If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:timm](https://hub.docker.com/repository/docker/xyxie/adan-image).
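If you want to check your local environment against these versions directly, the small snippet below (ours, not part of the repo) compares the installed packages with the ones listed above:

```python
# Minimal version check for the environment listed above.
expected = {
    'torch': '1.10.0+cu113',
    'torchvision': '0.11.1+cu113',
    'timm': '0.6.1',
    'torchaudio': '0.10.0+cu113',
}
for name, want in expected.items():
    got = __import__(name).__version__
    status = 'OK' if got == want else f'MISMATCH (expected {want})'
    print(f'{name} {got}: {status}')
```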
19 | 
20 | 
21 | 
22 | ## Usage of Adan in timm
23 | 
24 | ### Two steps to use Adan
25 | 
26 | **Step 1.** Add the Adan-dependent hyper-parameters to `train.py`:
27 | 
28 | ```python
29 | parser.add_argument('--max-grad-norm', type=float, default=0.0, help='if the l2 norm of the gradient is larger than this hyper-parameter, then we clip the gradient (default: 0.0, no gradient clipping)')
30 | parser.add_argument('--weight-decay', type=float, default=0.02, help='weight decay, similar to the one used in AdamW (default: 0.02)')
31 | parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', help='optimizer epsilon to avoid the bad case where the second-order moment is zero (default: None, use the opt default 1e-8 in Adan)')
32 | parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='optimizer betas in Adan (default: None, use the opt default [0.98, 0.92, 0.99] in Adan)')
33 | parser.add_argument('--no-prox', action='store_true', default=False, help='whether to perform weight decay like AdamW (default: False)')
34 | parser.add_argument('--bias-decay', action='store_true', default=False, help='whether to perform weight decay on the bias terms (default: False)')
35 | 
36 | ```
37 | 
38 | * `bias-decay`: It decides whether to perform weight decay on 1) the bias terms, 2) BN layers, and 3) other 1D parameters, all of which are filtered out by the default setting in timm.
39 | 
40 | * `no-prox`: It determines the update rule for parameters with weight decay. By default, Adan updates these parameters as presented in Algorithm 1 of the paper:
41 | 
42 | $$\boldsymbol{\theta}_{k+1} = (1+\lambda \eta)^{-1}\left[\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k)\right],$$
43 | 
44 | but one can also update the parameters as in AdamW:
45 | 
46 | $$\boldsymbol{\theta}_{k+1} = (1-\lambda \eta)\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k).$$
47 | **In all experiments, we set `no-prox=False` in our paper.** (A code sketch contrasting the two update rules appears at the end of this README.)
48 | 
49 | 
50 | 
51 | **Step 2.** Create the Adan optimizer as follows. In this step, we directly replace the vanilla optimizer creation using the following three sub-steps.
52 | 
53 | i) Add Adan to `optim_factory.py`:
54 | ```python
55 | elif opt_lower == 'adan':
56 |     optimizer = Adan(parameters, **opt_args)
57 | ```
58 | 
59 | ii) Import the optimizer creator from `optim_factory` into your training file `train.py`:
60 | 
61 | ```python
62 | from optim_factory import create_optimizer
63 | ```
64 | 
65 | iii) Replace the vanilla creator (`optimizer = create_optimizer(args, model)`) in the training file `train.py` with Adan:
66 | 
67 | ```python
68 | opt_lower = args.opt.lower()
69 | if opt_lower == 'adan':
70 |     args.opt_args = {'max_grad_norm': args.max_grad_norm, 'no_prox': args.no_prox}
71 | optimizer = create_optimizer(args, model, filter_bias_and_bn=not args.bias_decay)
72 | ```
73 | 
74 | 
75 | 
76 | ## ImageNet-1K Training Recipe
77 | 
78 | We provide the specific commands and hyper-parameters for ViTs, ResNets, and ConvNeXts in this [recipe](./supervised.md).
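To make the `no-prox` distinction concrete, here is a minimal sketch (ours, not the repo's `adan.py` implementation; the helper name is hypothetical) of how the two update rules above change one parameter tensor, assuming the combined update $\mathbf{m}_k+(1-\beta_2)\mathbf{v}_k$ has already been formed:

```python
import torch

def adan_decay_step(theta: torch.Tensor, update: torch.Tensor,
                    eta: float, lam: float, no_prox: bool) -> torch.Tensor:
    """Apply one Adan step to `theta`, given the combined moment `update`.

    `eta` is the step size and `lam` the weight-decay coefficient; forming
    m_k and v_k themselves is omitted in this sketch.
    """
    if no_prox:
        # AdamW-style decoupled decay: theta <- (1 - lam*eta) * theta - eta * update
        return (1 - lam * eta) * theta - eta * update
    # default proximal update: theta <- (theta - eta * update) / (1 + lam*eta)
    return (theta - eta * update) / (1 + lam * eta)
```

With `lam = 0` the two branches coincide; they differ only in how the weight-decay term enters the update.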
79 | 80 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m9-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: false 8 | bias_decay: false 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: null 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.1 29 | epoch_repeats: 0.0 30 | epochs: 150 31 | eval_metric: top1 32 | experiment: 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 0.0 52 | mean: null 53 | min_lr: 0.0001 54 | mixup: 0.8 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: convnext_tiny_hnf 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/cvnext 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.25 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.1 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: bicubic 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-08 110 | weight_decay: 0.04 111 | workers: 8 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m9-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: false 8 | bias_decay: false 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: null 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.1 29 | epoch_repeats: 0.0 30 | epochs: 300 31 | eval_metric: top1 32 | experiment: 33 | gp: null 34 | hflip: 0.5 35 | 
img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.016 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 0.0 52 | mean: null 53 | min_lr: 0.0001 54 | mixup: 0.8 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: convnext_tiny_hnf 60 | model_ema: true 61 | model_ema_decay: 0.9999 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.9 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/cvnext 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.25 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.1 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 150 109 | warmup_lr: 1.0e-08 110 | weight_decay: 0.02 111 | workers: 8 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res101/args_res101_100.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | configure: job_101_adan2.yaml 17 | cooldown_epochs: 10 18 | crop_pct: 0.95 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/imagenet 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.1 30 | epoch_repeats: 0.0 31 | epochs: 100 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.01 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 5.0 53 | mean: null 54 | min_lr: 0.0001 55 | mixup: 0.1 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: resnet101 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_prox: false 69 | no_resume_opt: false 70 | num_classes: null 71 | opt: adan 72 | opt_betas: 73 | - 0.98 74 | - 0.92 75 | - 0.99 76 | opt_eps: 1.0e-08 77 | output: ./exp_results/res101-100- 78 | patience_epochs: 10 79 | 
pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.0 88 | resplit: false 89 | resume: model_best.pth.tar 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 3407 96 | smoothing: 0.0 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: random 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 40 110 | warmup_lr: 1.0e-09 111 | weight_decay: 0.02 112 | workers: 8 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res101/args_res101_200.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: 0.95 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.1 29 | epoch_repeats: 0.0 30 | epochs: 200 31 | eval_metric: top1 32 | experiment: '' 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 5.0 52 | mean: null 53 | min_lr: 0.0001 54 | mixup: 0.1 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: resnet101 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/res101-epoch- 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.0 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.0 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-09 110 | weight_decay: 0.02 111 | workers: 8 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res101/args_res101_300.yaml: -------------------------------------------------------------------------------- 1 | aa: 
rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | configure: job_101_adan.yaml 17 | cooldown_epochs: 10 18 | crop_pct: 0.95 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/imagenet 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.2 30 | epoch_repeats: 0.0 31 | epochs: 300 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.015 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 5.0 53 | mean: null 54 | min_lr: 1.0e-05 55 | mixup: 0.1 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: resnet101 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_prox: false 69 | no_resume_opt: false 70 | num_classes: null 71 | opt: adan 72 | opt_betas: 73 | - 0.98 74 | - 0.92 75 | - 0.99 76 | opt_eps: 1.0e-08 77 | output: ./exp_results/res101-300- 78 | patience_epochs: 10 79 | pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.0 88 | resplit: false 89 | resume: model_best.pth.tar 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 3407 96 | smoothing: 0.0 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: random 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 90 110 | warmup_lr: 1.0e-09 111 | weight_decay: 0.02 112 | workers: 8 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res101/summary_res101_100.csv: -------------------------------------------------------------------------------- 1 | epoch,train_loss,eval_loss,eval_top1,eval_top5 2 | 0,0.6999703517981938,6.9381575,0.102,0.4979999999666214 3 | 1,0.05811868847480842,6.87584375,0.3939999999332428,1.7039999963378907 4 | 2,0.007747739586713058,6.76303875,1.6039999908447267,5.338000015869141 5 | 3,0.0073236161510327035,5.3613025,6.767999984130859,18.46600002319336 6 | 4,0.006898723675736359,4.6371725,13.537999970092773,31.59400007080078 7 | 5,0.006467533497405904,4.0580925,20.71000001831055,43.00000006103516 8 | 6,0.0060907453298568726,3.52676375,27.20999994140625,52.32400007324219 9 | 7,0.005859104982976403,3.19898375,33.092000048828126,58.69000002685547 10 | 8,0.005392835708335042,2.749453125,39.679999995117186,66.31800005371093 11 | 9,0.005332435946911573,2.4970771875,44.67999998046875,70.7900000415039 12 | 10,0.004835383068504078,2.29522875,48.786000073242185,74.15399998535156 13 | 
11,0.004691791022196412,2.0821503125,52.47399998535156,77.29200000732422 14 | 12,0.004713796977219837,2.0056796875,54.215999931640624,78.56200026367188 15 | 13,0.0046068779192864895,1.965403125,55.21000003173828,79.31600004638672 16 | 14,0.004495224349999002,1.8351496875,57.713999921875,81.02600004638671 17 | 15,0.004356943453396005,1.8534640625,57.819999921875,81.29599999023438 18 | 16,0.004283945841182556,1.72975828125,59.72999997314453,82.90000000732422 19 | 17,0.0041249994620946905,1.6525403125,61.77600001220703,84.05799998779297 20 | 18,0.004143890575505793,1.61844859375,61.91799998779297,84.07199997802735 21 | 19,0.0041115750126274565,1.57048875,63.07800004394531,85.23199998291015 22 | 20,0.004011271621233651,1.53420265625,64.04999995849609,85.62200005859376 23 | 21,0.0038510982579152498,1.55004109375,63.43200009765625,85.19999993652344 24 | 22,0.0038439546312604633,1.5229715625,64.74600000976562,86.03400003173829 25 | 23,0.0038987634304378715,1.51144328125,65.10999988525391,86.39199997558593 26 | 24,0.0038393755676224828,1.46528921875,65.5220000024414,86.72600003173828 27 | 25,0.0039864167171929565,1.43677015625,65.95400012939453,86.9820000048828 28 | 26,0.003912502034966435,1.44618640625,65.75600003662109,86.69800005859375 29 | 27,0.003806904241043542,1.41935078125,66.46599997070312,87.32600012939453 30 | 28,0.0037424968489046606,1.427661875,66.50000010986328,87.29399997802734 31 | 29,0.003874790804859783,1.367300625,67.45599989746094,87.88599997558593 32 | 30,0.0038485014478542973,1.38268578125,67.30000010498047,87.80599994873047 33 | 31,0.00382271282640951,1.34994140625,67.810000078125,88.08400021240234 34 | 32,0.00378438870289496,1.394751875,66.83399997314453,87.6119999975586 35 | 33,0.0037495847625125733,1.35988171875,67.53400002197266,88.06600005371094 36 | 34,0.00387607079132327,1.39839203125,66.71800006103516,87.71599990234375 37 | 35,0.0036058901376756175,1.39179953125,66.84600000976563,87.66000002929688 38 | 36,0.0035983206471428275,1.40048484375,66.83400002929687,87.65799994873046 39 | 37,0.0035912682402080725,1.4286309375,66.94400002929687,87.706 40 | 38,0.00369265178285007,1.3891053125,67.31400002685547,87.800000078125 41 | 39,0.003823710572240608,1.369221875,67.4979999975586,87.82800002197266 42 | 40,0.003413957726609494,1.246991875,69.91399991210938,89.59400009521484 43 | 41,0.0034744960388966967,1.2547934375,70.14600015136719,89.61400020263672 44 | 42,0.00342402803445501,1.26230765625,69.94400004638672,89.19399997314453 45 | 43,0.0034545807700072017,1.2105721875,71.06400001953125,89.91600007080078 46 | 44,0.0033984247129410505,1.225816875,70.70999999267578,89.90400004638671 47 | 45,0.0035708143675167647,1.2220984375,71.30400004882813,90.18400009521484 48 | 46,0.0034103883330577184,1.21230796875,71.87999998779297,90.40800010009765 49 | 47,0.0033485869644209743,1.19272125,71.76399999267578,90.49800009765625 50 | 48,0.0034036182332783937,1.15048734375,72.35600006591797,90.75400014892578 51 | 49,0.0034680215176194906,1.163283125,72.40999996337891,90.73800007080078 52 | 50,0.003379080627512719,1.1675125,72.2219999633789,90.87000004394531 53 | 51,0.003313158971390554,1.1659403125,72.33799996826171,90.63599999267578 54 | 52,0.003398957579130573,1.12288484375,72.96600006591797,91.19399994140625 55 | 53,0.003229137593215065,1.10655890625,73.06600004150391,91.44800012451172 56 | 54,0.003297736668693168,1.12947734375,73.59000001708985,91.52800009521485 57 | 55,0.003102192488898124,1.13240875,72.63600001464843,91.07600004882812 58 | 
56,0.003310556390455791,1.07455796875,73.82199999023437,91.73399988525391 59 | 57,0.003254913375712931,1.070454375,74.12600001220703,91.82000006591797 60 | 58,0.003150251860331212,1.0846184375,73.90800006835937,91.64800017333984 61 | 59,0.0032559275361044066,1.0725778125,74.06800004150391,91.95000004394531 62 | 60,0.0031484989948304637,1.059663125,74.60599998291016,91.99600006835938 63 | 61,0.003113441352200295,1.05973453125,74.53799998535156,92.01800006835937 64 | 62,0.003175405037057187,1.04974859375,74.59600006347657,92.17199991455078 65 | 63,0.0030697412855390993,1.02565875,75.1760000390625,92.41599991455078 66 | 64,0.002913903827512903,1.0269609375,75.22199998535156,92.38000001464843 67 | 65,0.003126694039175553,1.025984375,75.27800006347657,92.49400011962891 68 | 66,0.0031302267452701926,1.01711203125,75.39400001220703,92.56000001708985 69 | 67,0.0030674594454467297,1.015896875,75.63200011962891,92.67800009277343 70 | 68,0.003019252243185682,0.9928059375,75.96400008544921,92.88200011474609 71 | 69,0.003121067354056452,1.00143140625,76.01400003173828,92.64000004150391 72 | 70,0.0028890574384214623,0.9874996875,76.20200000976563,92.8920000415039 73 | 71,0.0029056143913684146,0.98568921875,76.20599990478516,93.06000011474609 74 | 72,0.0029515030827107175,0.9696134375,76.52800013427735,93.0959999633789 75 | 73,0.0029652343031817247,0.9613771875,76.79200016357422,93.05400011962891 76 | 74,0.002942567242176405,0.9542409375,76.91399998291016,93.22800014160157 77 | 75,0.002948857095491673,0.9467534375,77.12999998291015,93.40400014160156 78 | 76,0.002977430753942047,0.9387584375,77.26999992675782,93.44200017089844 79 | 77,0.002892624304097678,0.9361546875,77.32999990234374,93.5340000366211 80 | 78,0.0028546288875596865,0.9220828125,77.7280000024414,93.61599998535156 81 | 79,0.0029401361742722137,0.92840078125,77.70400003173827,93.65800003662109 82 | 80,0.0027510516312239425,0.90395640625,78.03200008789062,93.8580001171875 83 | 81,0.0027062457354207125,0.9050746875,78.17399998291016,93.9360000390625 84 | 82,0.002744901110418141,0.8973440625,78.25999998046875,93.95200006347656 85 | 83,0.002718568014513169,0.8877671875,78.62400013427734,94.14600001220703 86 | 84,0.0028618485001581056,0.88632234375,78.51400000732421,94.10800006347657 87 | 85,0.0027493445834677133,0.876911015625,78.77400000976563,94.19799998535156 88 | 86,0.002734048092471702,0.8764725,78.82800005859374,94.22000011230469 89 | 87,0.0025374879062707934,0.86807703125,79.05199995361328,94.27600016601562 90 | 88,0.002690665889531374,0.8684296875,79.06000008300781,94.2960000366211 91 | 89,0.002699888893403113,0.8860525,79.10200010986328,94.32600001220703 92 | 90,0.002796581364236772,0.8614496875,79.20399995361328,94.4160000366211 93 | 91,0.002500709379091859,0.854776171875,79.29600013427735,94.47400006103516 94 | 92,0.0024909183183418854,0.852921328125,79.42600000488281,94.52999993164063 95 | 93,0.0024433880296003607,0.84558203125,79.54000005615234,94.55600016601562 96 | 94,0.002702345955185592,0.8445765625,79.57800015869141,94.6319999584961 97 | 95,0.002313682609902961,0.846581875,79.72599995361328,94.66400000976563 98 | 96,0.0025091321939336403,0.844203828125,79.76800008056641,94.64400016357422 99 | 97,0.0026143502577074935,0.842131484375,79.7340000805664,94.61999998535157 100 | 98,0.002516709908377379,0.841174921875,79.65399995361328,94.66400003662109 101 | 99,0.00261627570060747,0.838111484375,79.86200008300781,94.69399998291016 102 | 100,0.002597623238606112,0.8395615625,79.81399995361328,94.69400003662109 103 | 
101,0.002467484595919294,0.845680859375,79.71800000488281,94.59600008789063 104 | 102,0.002401946045990501,0.839708046875,79.89400000488281,94.76199993164063 105 | 103,0.0026869618200830053,0.835429453125,79.88600010986328,94.72400000976563 106 | 104,0.0025204311407703373,0.83972015625,79.74200003173829,94.67600011230469 107 | 105,0.0026361714283536586,0.84152625,79.95200013671875,94.73999993164063 108 | 106,0.0027149992362995234,0.839165859375,79.91000003173828,94.69000011230469 109 | 107,0.002398747401977224,0.836501796875,80.00600005615235,94.75400006103516 110 | 108,0.00262980888198529,0.852977421875,79.71200008300781,94.63800000976562 111 | 109,0.002572902212185519,0.838661171875,79.85999997802735,94.63199998291016 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: 0.95 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.05 29 | epoch_repeats: 0.0 30 | epochs: 100 31 | eval_metric: top1 32 | experiment: e100-aug0-w60-minlr1e6-wrlr1e9-initRdm-bias-lr3e2 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.03 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 5.0 52 | mean: null 53 | min_lr: 1.0e-06 54 | mixup: 0.1 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: resnet50 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/res50-epoch- 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.0 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.0 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-09 110 | weight_decay: 0.02 111 | workers: 8 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml: 
-------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: 0.95 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.05 29 | epoch_repeats: 0.0 30 | epochs: 200 31 | eval_metric: top1 32 | experiment: e200-aug0-w60-minlr1e4-wrlr1e9-initRdm-bias 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 5.0 52 | mean: null 53 | min_lr: 0.0001 54 | mixup: 0.1 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: resnet50 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/res50-epoch- 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.0 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.0 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-09 110 | weight_decay: 0.02 111 | workers: 8 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: 0.95 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.05 29 | epoch_repeats: 0.0 30 | epochs: 300 31 | eval_metric: top1 32 | experiment: res50-aug0-retrain 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | 
jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 5.0 52 | mean: null 53 | min_lr: 1.0e-05 54 | mixup: 0.1 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: resnet50 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_debug: 5 76 | opt_eps: 1.0e-08 77 | output: ./exp_results/res50-epoch- 78 | patience_epochs: 10 79 | pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.0 88 | resplit: false 89 | resume: null 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 42 96 | smoothing: 0.0 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: bicubic 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 60 110 | warmup_lr: 1.0e-06 111 | weight_decay: 0.02 112 | workers: 8 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv: -------------------------------------------------------------------------------- 1 | epoch,train_loss,eval_loss,eval_top1,eval_top5 2 | 0,0.7045409509113857,6.9416,0.064,0.418 3 | 1,0.058665430905031304,6.89746625,0.3079999999332428,1.2019999998664856 4 | 2,0.007796582133908357,6.2966525,1.7179999993896484,5.899999989013672 5 | 3,0.007212148014722126,5.116435,8.078000043945313,21.984000035400392 6 | 4,0.006597742538100907,4.30874625,16.604000009765624,37.32800003540039 7 | 5,0.006309278409129807,3.7494875,24.503999986572264,48.297999992675784 8 | 6,0.00587210977183921,3.23308,31.903999926757812,57.66999989746094 9 | 7,0.005444032173337681,2.87593875,38.16399994140625,63.99200002685547 10 | 8,0.0054282506462186575,2.59584875,43.517999924316406,69.46200001464844 11 | 9,0.005179691860186202,2.359841875,47.206000029296874,72.58200003417969 12 | 10,0.004889545729383826,2.1719675,50.609999997558596,75.55400000244141 13 | 11,0.00470197234036667,2.1567584375,51.69199992919922,76.44600010253906 14 | 12,0.004586202425083944,1.98930375,54.606000112304685,78.83600004394532 15 | 13,0.004271666053682566,1.8706825,56.328000031738284,80.30800010009766 16 | 14,0.004447908040934375,1.806950625,58.472000075683596,81.5399999633789 17 | 15,0.0041762767692229575,1.7647315625,58.741999968261716,82.09000006103516 18 | 16,0.004471837143812861,1.708065625,60.30200004394531,82.98200011230469 19 | 17,0.004270398956058281,1.67571921875,61.048000041503904,83.32200005859374 20 | 18,0.004100026030625615,1.65201375,61.26000004150391,83.77400021972656 21 | 19,0.0041242205105455855,1.63376078125,61.504000068359375,84.07800001220703 22 | 20,0.004059118734273527,1.67590984375,60.91800009765625,83.5019999584961 23 | 21,0.0041561292850279385,1.63649734375,61.82800004882812,84.22399995361329 24 | 
22,0.004249815163867814,1.5946559375,62.68000001220703,84.70800006347656 25 | 23,0.0039470667751239875,1.64520578125,61.93799990234375,84.07400013427734 26 | 24,0.003988273092545569,1.671076875,61.05600004394531,83.42199993164063 27 | 25,0.004096939311628895,1.7034496875,61.12399993652344,83.56399995605469 28 | 26,0.004087086118358586,1.60285265625,62.73200006347656,84.75999995605468 29 | 27,0.00399751916328179,1.61492046875,62.43800003662109,84.32400010742188 30 | 28,0.003949649166315794,1.701069375,60.77399994628906,83.2460001147461 31 | 29,0.004051400797574648,1.6202353125,62.64599990722656,84.67000006103515 32 | 30,0.004139024115699742,1.6344540625,62.20200006591797,84.12200026855469 33 | 31,0.003921386137205575,1.62690984375,62.05000011474609,84.17200008544921 34 | 32,0.00411509963617261,1.68366421875,61.46400011474609,83.86600005859376 35 | 33,0.003911659786743777,1.67565765625,60.84800007324219,83.32999993408202 36 | 34,0.00395727701418634,1.62554953125,62.0080000390625,84.16199998291016 37 | 35,0.004033969731868378,1.71603296875,60.70599999267578,83.0460000390625 38 | 36,0.004010531336202153,1.6436690625,62.05400001953125,84.18800013916015 39 | 37,0.0039575622982478565,1.67731390625,61.35800016845703,83.65600013671875 40 | 38,0.0039316649615232435,1.61552953125,62.22400010986328,84.39000005126952 41 | 39,0.003873389430477151,1.63947921875,61.81200003662109,84.1440000366211 42 | 40,0.004065845494291612,1.653141875,61.8460001147461,83.82200008789063 43 | 41,0.004109910373309893,1.714169375,60.308000017089846,83.24199985595703 44 | 42,0.003946930452782128,1.94490875,56.48200006103516,79.57000004638672 45 | 43,0.0041138056798705035,1.6267740625,61.803999931640625,84.29799997802735 46 | 44,0.004048073315061629,1.62808609375,62.09799998291015,84.28800000976563 47 | 45,0.0039734537546922055,1.784985625,59.12400004882812,82.1780000390625 48 | 46,0.0038987650768831372,1.713120625,60.78800010498047,83.26599994628906 49 | 47,0.0040997504090358105,1.88673,57.57800005615234,80.4180000415039 50 | 48,0.003935285162047616,1.6685634375,61.34400001220703,83.64799995605469 51 | 49,0.004107319034769067,1.7783765625,59.22000000244141,82.05199999023438 52 | 50,0.00387493397907487,1.6779953125,61.276,83.92200001464843 53 | 51,0.004015801890221026,1.847471875,58.37599998046875,81.37399998779297 54 | 52,0.003935897473378905,1.859410625,58.18199997802734,81.15000001708984 55 | 53,0.004190738429315388,1.821818125,58.34600005615234,81.56200009277343 56 | 54,0.004043174558319151,1.823231875,58.122000075683594,81.2140000390625 57 | 55,0.004158310043359441,1.86400625,57.84399987792969,81.45800022460938 58 | 56,0.003960915591700801,1.7923175,58.804000024414066,81.96200001220703 59 | 57,0.004142970977617162,1.7743928125,59.36600004394531,82.41600017333984 60 | 58,0.004029840646710779,1.7658021875,59.30400007080078,82.15200016845704 61 | 59,0.004218896684635963,1.88195375,56.881999975585934,80.56000011474609 62 | 60,0.0036925061971747448,1.3517940625,67.70000002197266,88.15399987304687 63 | 61,0.0035992927150800824,1.34404765625,68.08800004882812,88.23600020751954 64 | 62,0.003520481986925006,1.283674375,69.1300000805664,88.94400007568359 65 | 63,0.003616590718073504,1.3082865625,68.802000078125,88.65599994384766 66 | 64,0.0036838793894276023,1.27181484375,69.44200001953125,89.27800005126953 67 | 65,0.003572586092299649,1.29942640625,69.78399999267579,89.41400007324219 68 | 66,0.0036129531716661794,1.2370415625,70.27599992431641,89.516 69 | 67,0.0032376082381233573,1.2114928125,70.86000002197265,90.03600010009765 
70 | 68,0.0035054978714989765,1.224236875,70.44400004394531,89.89400004394531 71 | 69,0.0034192517466310945,1.23175109375,70.51399994628906,89.67800012451171 72 | 70,0.00328368427498,1.19048328125,71.48400014648438,90.30600015136719 73 | 71,0.00327613196402256,1.16209390625,71.9240000366211,90.69200007080079 74 | 72,0.0030484608806935804,1.16013578125,71.9080000390625,90.63800004394531 75 | 73,0.0034537422138133217,1.1457075,72.4540000390625,90.89400011962891 76 | 74,0.003460384572723082,1.13635015625,72.41000006835938,90.91400004638672 77 | 75,0.0033204310374068363,1.12647875,72.77400001464844,91.23800009521484 78 | 76,0.0032639388061527696,1.113355625,72.89800006347656,91.27400009521484 79 | 77,0.0032552302914804648,1.1143825,72.92800009033203,91.40200001708985 80 | 78,0.003150941720897598,1.0993584375,73.49799998779297,91.53000014892578 81 | 79,0.0031130987585389186,1.0650625,74.15800011474609,92.03200009521484 82 | 80,0.0032726521603763103,1.0721525,74.11400006591796,91.98600006591796 83 | 81,0.00320629304873624,1.0649465625,74.26599995605469,92.1619999633789 84 | 82,0.0029540062449606402,1.0372840625,74.79800008789063,92.30600001708984 85 | 83,0.003026906833318727,1.0280375,75.05400014160156,92.53800022460938 86 | 84,0.0029979831805186613,1.017864375,75.4720000366211,92.63999999267578 87 | 85,0.00299135923186051,0.99109765625,75.92600000732422,92.9679999633789 88 | 86,0.003011097732399191,0.99155703125,75.93799998291016,92.82800001464844 89 | 87,0.003033405419306031,0.970643125,76.38000008789062,93.0640001171875 90 | 88,0.0028323159287018435,0.9561534375,76.69000000976563,93.1739999633789 91 | 89,0.0030302958163831916,0.9529859375,76.86200008544922,93.20600006591796 92 | 90,0.0030514331634289454,0.9512065625,77.03400000976562,93.26600009033203 93 | 91,0.002754983675133969,0.9374346875,77.17200000488282,93.41999996337891 94 | 92,0.002925087830850056,0.92438484375,77.47400018554687,93.53399991210938 95 | 93,0.002743347780779004,0.9260734375,77.55600011230469,93.63999993652344 96 | 94,0.0028534684097394347,0.95646546875,77.48199992675781,93.63399998779298 97 | 95,0.0028282569421987447,0.91486703125,77.77999995361328,93.66999996337891 98 | 96,0.0026793425869462745,0.90815390625,77.85000003173828,93.7900001171875 99 | 97,0.002686592417636088,0.909225625,78.00000013427734,93.78399993652344 100 | 98,0.002937979913050575,0.90744421875,77.98200003173828,93.79199998779296 101 | 99,0.002853604283050767,0.90461453125,78.05800000488281,93.84199993652344 102 | 100,0.002864615060389042,0.9053496875,78.0300000830078,93.79799998779296 103 | 101,0.002886664870727275,0.9070628125,78.00999995361327,93.78799993652343 104 | 102,0.002906581253877708,0.91363046875,77.93799995361329,93.71799998779296 105 | 103,0.0030246374164042728,0.90368484375,78.14200010986328,93.83399998779296 106 | 104,0.0028219220860462102,0.906053125,78.03600005615235,93.78400006591797 107 | 105,0.002867467302296843,0.90486140625,78.06800013427734,93.80799998779297 108 | 106,0.002776414771298213,0.90622484375,78.1760000830078,93.84400001464844 109 | 107,0.0027404509518029435,0.90221796875,78.09400000488282,93.82399998779297 110 | 108,0.002886704235736813,0.90330140625,78.10999998046876,93.80999993652344 111 | 109,0.0028225835911663516,0.9019365625,78.07000000488281,93.81399993652344 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/base/args_vit-B_150.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m9-mstd0.5-inc1 2 | amp: 
true 3 | apex_amp: false 4 | aug_repeats: 3 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: false 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | bn_tf: false 12 | channels_last: false 13 | checkpoint_hist: 2 14 | clip_grad: null 15 | clip_mode: norm 16 | color_jitter: 0.4 17 | cooldown_epochs: 10 18 | crop_pct: null 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/common/imagenet-raw 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.1 30 | epoch_repeats: 0.0 31 | epochs: 150 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.015 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 5.0 53 | mean: null 54 | min_lr: 1.0e-08 55 | mixup: 0.8 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: deit_base_patch16_224 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_debug: 5 76 | opt_eps: 1.0e-08 77 | output: ./exp_results/deit-base-ori- 78 | patience_epochs: 10 79 | pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.25 88 | resplit: false 89 | resume: '' 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 42 96 | smoothing: 0.1 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: random 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 60 110 | warmup_lr: 1.0e-06 111 | weight_decay: 0.02 112 | workers: 10 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/base/args_vit-B_300.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m9-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 3 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: false 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | bn_tf: false 12 | channels_last: false 13 | checkpoint_hist: 2 14 | clip_grad: null 15 | clip_mode: norm 16 | color_jitter: 0.4 17 | cooldown_epochs: 10 18 | crop_pct: null 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/common/imagenet-raw 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.1 30 | epoch_repeats: 0.0 31 | epochs: 300 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.015 45 | lr_cycle_decay: 0.5 46 | 
lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 5.0 53 | mean: null 54 | min_lr: 1.0e-05 55 | mixup: 0.8 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: deit_base_patch16_224 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_debug: 5 76 | opt_eps: 1.0e-08 77 | output: ./exp_results/deit-base-ori- 78 | patience_epochs: 10 79 | pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.25 88 | resplit: false 89 | resume: '' 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 42 96 | smoothing: 0.1 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: random 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 60 110 | warmup_lr: 1.0e-08 111 | weight_decay: 0.02 112 | workers: 10 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/base/args_vit-B_300_T.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m9-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 3 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: false 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | configure: job_base_adan.yaml 17 | cooldown_epochs: 10 18 | crop_pct: null 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/imagenet 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.2 30 | epoch_repeats: 0.0 31 | epochs: 300 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.015 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 5.0 53 | mean: null 54 | min_lr: 1.0e-06 55 | mixup: 0.8 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: deit_base_patch16_224 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_prox: false 69 | no_resume_opt: false 70 | num_classes: null 71 | opt: adan 72 | opt_betas: 73 | - 0.98 74 | - 0.92 75 | - 0.99 76 | opt_eps: 1.0e-08 77 | output: ./exp_res/vit-base-300 78 | patience_epochs: 10 79 | pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.25 88 | resplit: false 89 | resume: 
model_best.pth.tar 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 3407 96 | smoothing: 0.1 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: random 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 90 110 | warmup_lr: 1.0e-08 111 | weight_decay: 0.02 112 | workers: 10 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: false 9 | bn_eps: null 10 | bn_momentum: null 11 | bn_tf: false 12 | channels_last: false 13 | checkpoint_hist: 2 14 | clip_grad: null 15 | clip_mode: norm 16 | color_jitter: 0.4 17 | cooldown_epochs: 10 18 | crop_pct: null 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/common/imagenet-raw 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.05 30 | epoch_repeats: 0.0 31 | epochs: 150 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.015 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 0.0 53 | mean: null 54 | min_lr: 1.0e-08 55 | mixup: 0.2 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: deit_small_patch16_224 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_prox: false 69 | no_resume_opt: false 70 | num_classes: null 71 | opt: adan 72 | opt_betas: 73 | - 0.98 74 | - 0.92 75 | - 0.99 76 | opt_debug: 5 77 | opt_eps: 1.0e-08 78 | output: ./exp_results/deit-small-bs-test- 79 | patience_epochs: 10 80 | pin_mem: false 81 | pretrained: false 82 | ratio: 83 | - 0.75 84 | - 1.3333333333333333 85 | recount: 1 86 | recovery_interval: 0 87 | remode: pixel 88 | reprob: 0.0 89 | resplit: false 90 | resume: '' 91 | save_images: false 92 | scale: 93 | - 0.08 94 | - 1.0 95 | sched: cosine 96 | seed: 1005 97 | smoothing: 0.1 98 | split_bn: false 99 | start_epoch: null 100 | std: null 101 | sync_bn: false 102 | torchscript: false 103 | train_interpolation: random 104 | train_split: train 105 | tta: 0 106 | use_multi_epochs_loader: false 107 | val_split: validation 108 | validation_batch_size: null 109 | vflip: 0.0 110 | warmup_epochs: 60 111 | warmup_lr: 1.0e-08 112 | weight_decay: 0.02 113 | workers: 10 114 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/small/args_vit-s_150.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: false 9 | 
bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: null 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.1 29 | epoch_repeats: 0.0 30 | epochs: 150 31 | eval_metric: top1 32 | experiment: wrlr1e8-mlr1e5-lr1d5e2-dp01-mix08 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 0.0 52 | mean: null 53 | min_lr: 1.0e-05 54 | mixup: 0.8 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: deit_small_patch16_224 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/deit-small 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.25 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.1 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-08 110 | weight_decay: 0.02 111 | workers: 10 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: false 9 | bn_eps: null 10 | bn_momentum: null 11 | bn_tf: false 12 | channels_last: false 13 | checkpoint_hist: 2 14 | clip_grad: null 15 | clip_mode: norm 16 | color_jitter: 0.4 17 | cooldown_epochs: 10 18 | crop_pct: null 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/common/imagenet-raw 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.05 30 | epoch_repeats: 0.0 31 | epochs: 300 32 | eval_metric: top1 33 | experiment: bs4096 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.02121 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | 
lr_noise_std: 1.0 52 | max_grad_norm: 0.0 53 | mean: null 54 | min_lr: 1.0e-08 55 | mixup: 0.2 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: deit_small_patch16_224 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_prox: false 69 | no_resume_opt: false 70 | num_classes: null 71 | opt: adan 72 | opt_betas: 73 | - 0.98 74 | - 0.92 75 | - 0.99 76 | opt_debug: 5 77 | opt_eps: 1.0e-08 78 | output: ./exp_results/deit-small-bs-test- 79 | patience_epochs: 10 80 | pin_mem: false 81 | pretrained: false 82 | ratio: 83 | - 0.75 84 | - 1.3333333333333333 85 | recount: 1 86 | recovery_interval: 0 87 | remode: pixel 88 | reprob: 0.0 89 | resplit: false 90 | resume: '' 91 | save_images: false 92 | scale: 93 | - 0.08 94 | - 1.0 95 | sched: cosine 96 | seed: 1005 97 | smoothing: 0.1 98 | split_bn: false 99 | start_epoch: null 100 | std: null 101 | sync_bn: false 102 | torchscript: false 103 | train_interpolation: random 104 | train_split: train 105 | tta: 0 106 | use_multi_epochs_loader: false 107 | val_split: validation 108 | validation_batch_size: null 109 | vflip: 0.0 110 | warmup_epochs: 80 111 | warmup_lr: 1.0e-08 112 | weight_decay: 0.02 113 | workers: 10 114 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/small/args_vit-s_300.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: false 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: null 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.1 29 | epoch_repeats: 0.0 30 | epochs: 300 31 | eval_metric: top1 32 | experiment: e300-wrlr1e8-mlr1e5-lr1d5e2-dp01-mix08-bce 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 0.0 52 | mean: null 53 | min_lr: 1.0e-05 54 | mixup: 0.8 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: deit_small_patch16_224 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/deit-small 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.25 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 
93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.1 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-08 110 | weight_decay: 0.02 111 | workers: 10 112 | -------------------------------------------------------------------------------- /CV/timm/sam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class SAM(torch.optim.Optimizer): 5 | def __init__(self, params, base_optimizer, rho=0.05, adaptive=False, **kwargs): 6 | assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}" 7 | 8 | defaults = dict(rho=rho, adaptive=adaptive, **kwargs) 9 | super(SAM, self).__init__(params, defaults) 10 | 11 | self.base_optimizer = base_optimizer(self.param_groups, **kwargs) 12 | self.param_groups = self.base_optimizer.param_groups 13 | 14 | @torch.no_grad() 15 | def first_step(self, zero_grad=False): 16 | grad_norm = self._grad_norm() 17 | for group in self.param_groups: 18 | scale = group["rho"] / (grad_norm + 1e-12) 19 | 20 | for p in group["params"]: 21 | if p.grad is None: continue 22 | self.state[p]["old_p"] = p.data.clone() 23 | e_w = (torch.pow(p, 2) if group["adaptive"] else 1.0) * p.grad * scale.to(p) 24 | p.add_(e_w) # climb to the local maximum "w + e(w)" 25 | 26 | if zero_grad: self.zero_grad() 27 | 28 | @torch.no_grad() 29 | def second_step(self, zero_grad=False): 30 | for group in self.param_groups: 31 | for p in group["params"]: 32 | if p.grad is None: continue 33 | p.data = self.state[p]["old_p"] # get back to "w" from "w + e(w)" 34 | 35 | self.base_optimizer.step() # do the actual "sharpness-aware" update 36 | 37 | if zero_grad: self.zero_grad() 38 | 39 | @torch.no_grad() 40 | def step(self, closure=None): 41 | assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided" 42 | closure = torch.enable_grad()(closure) # the closure should do a full forward-backward pass 43 | 44 | self.first_step(zero_grad=True) 45 | closure() 46 | self.second_step() 47 | 48 | def _grad_norm(self): 49 | shared_device = self.param_groups[0]["params"][0].device # put everything on the same device, in case of model parallelism 50 | norm = torch.norm( 51 | torch.stack([ 52 | ((torch.abs(p) if group["adaptive"] else 1.0) * p.grad).norm(p=2).to(shared_device) 53 | for group in self.param_groups for p in group["params"] 54 | if p.grad is not None 55 | ]), 56 | p=2 57 | ) 58 | return norm 59 | 60 | def load_state_dict(self, state_dict): 61 | super().load_state_dict(state_dict) 62 | self.base_optimizer.param_groups = self.param_groups -------------------------------------------------------------------------------- /NLP/BERT/README.md: -------------------------------------------------------------------------------- 1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models 2 | 3 | 4 | 5 | ## Installation of Fairseq 6 | 7 | Our experiment is based on the repo [Fairseq](https://github.com/facebookresearch/fairseq). For the requirements and installation of [Fairseq](https://github.com/facebookresearch/fairseq) and Apex, please refer to that repo. 8 | 9 | 10 | 11 | ## Environment 12 | 13 | Our experiments for this task are based on the following pkg version. 
14 | 15 | ```python 16 | torch.__version__ = '1.10.1+cu111' 17 | torchvision.__version__ = '0.11.2+cu111' 18 | torchaudio.__version__ = '0.10.1+cu111' 19 | fairseq.__version__ = '0.12.2' 20 | ``` 21 | 22 | If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:fairseq](https://hub.docker.com/repository/docker/xyxie/adan-image). 23 | 24 | 25 | 26 | ## Usage of Adan in Fairseq 27 | 28 | ### One step to use Adan 29 | 30 | Please first put the file [`adan.py`](./adan.py) into the directory `path/to/fairseq/fairseq/optim`. Then you can choose Adan as the optimizer in the config file. See the following example for pre-training: 31 | 32 | ```yaml 33 | optimizer: 34 | _name: adan 35 | weight_decay: 0.02 36 | adan_betas: (0.98,0.92,0.99) 37 | adan_eps: 1e-08 38 | ``` 39 | 40 | 41 | 42 | ## Pretraining 43 | 44 | The following steps are modified from [Fairseq-Roberta](https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/README.pretraining.md). For completeness, we list some key steps here. 45 | 46 | 47 | ### 1) Preprocess the data 48 | 49 | Data should be preprocessed following the [language modeling format](https://github.com/facebookresearch/fairseq/tree/main/examples/language_model). That is, each document should be separated by an empty line (only useful with `--sample-break-mode complete_doc`), and all lines should be concatenated as a 1D text stream during training. 50 | 51 | 52 | 53 | In the following steps, we use the [Bookcorpus dataset](https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz) and [Wikipedia](https://en.wikipedia.org/wiki/Wikipedia:Database_download) to demonstrate how to preprocess raw text data with the GPT-2 BPE. 54 | 55 | #### i) Download the dataset: 56 | 57 | ```bash 58 | wget https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz 59 | tar -zxvf books1.tar.gz -C ./bert-corpus/ 60 | ``` 61 | 62 | ```python 63 | # pip install datasets 64 | from datasets import load_dataset 65 | 66 | dataset = load_dataset("wikipedia", "20220301.en") 67 | ``` 68 | 69 | #### ii) Generate raw data: 70 | 71 | - For the Wikipedia dataset, read each line of the JSON-lines file, replace the `\n` in the text field with a space, and write the line (with a `\n` appended) to the new file `all_data.raw`; a minimal sketch is given after this list. 72 | 73 | - For the Bookcorpus dataset, read out the contents of each book, replace each `\n` with a space, and write the content of the book as one line in `all_data.raw`, ending with `\n`. 74 | 75 | - Split `all_data.raw` into `wiki.train.raw` and `wiki.valid.raw` with a ratio of 99:1, and set `wiki.test.raw = wiki.valid.raw` for compatibility with fairseq (the BPE-encoding loop below expects the splits to be named `train`/`valid`/`test`).
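The bullets above can be implemented in a few lines of Python. Below is a minimal, illustrative sketch for the Wikipedia portion and the train/valid/test split; the file names follow the steps above, but the script itself is an assumption rather than the exact one used for our runs:

```python
from datasets import load_dataset  # pip install datasets

dataset = load_dataset("wikipedia", "20220301.en")

# One document per line; newlines inside a document become spaces.
with open("all_data.raw", "w", encoding="utf-8") as f:
    for article in dataset["train"]:
        text = article["text"].replace("\n", " ").strip()
        if text:
            f.write(text + "\n")

# Line-wise 99:1 split; the test split simply mirrors the validation split.
with open("all_data.raw", encoding="utf-8") as f:
    lines = f.readlines()
cut = int(0.99 * len(lines))
with open("wiki.train.raw", "w", encoding="utf-8") as f:
    f.writelines(lines[:cut])
for name in ("wiki.valid.raw", "wiki.test.raw"):
    with open(name, "w", encoding="utf-8") as f:
        f.writelines(lines[cut:])
```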
76 | 77 | 78 | 79 | #### iii) Encode data with the GPT-2 BPE: 80 | 81 | ```bash 82 | mkdir -p gpt2_bpe 83 | wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json 84 | wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe 85 | for SPLIT in train valid test; do \ 86 | python -m examples.roberta.multiprocessing_bpe_encoder \ 87 | --encoder-json gpt2_bpe/encoder.json \ 88 | --vocab-bpe gpt2_bpe/vocab.bpe \ 89 | --inputs bert-corpus/wiki.${SPLIT}.raw \ 90 | --outputs bert-corpus/wiki.${SPLIT}.bpe \ 91 | --keep-empty \ 92 | --workers 60; \ 93 | done 94 | ``` 95 | 96 | 97 | 98 | #### iv) Binarize the data using the GPT-2 fairseq dictionary: 99 | 100 | ```bash 101 | wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt 102 | fairseq-preprocess \ 103 | --only-source \ 104 | --srcdict gpt2_bpe/dict.txt \ 105 | --trainpref bert-corpus/wiki.train.bpe \ 106 | --validpref bert-corpus/wiki.valid.bpe \ 107 | --testpref bert-corpus/wiki.test.bpe \ 108 | --destdir data-bin/bert-corpus \ 109 | --workers 60 110 | ``` 111 | 112 | 113 | 114 | ### 2) Train BERT base 115 | 116 | Put the provided [config files](./config/pretraining) into the directory `path/to/fairseq/examples/roberta/config/pretraining`: 117 | 118 | ```bash 119 | DATA_DIR=/path/to/fairseq/bert-corpus 120 | 121 | fairseq-hydra-train -m --config-dir examples/roberta/config/pretraining \ 122 | --config-name ${NAME} task.data=$DATA_DIR \ 123 | checkpoint.save_dir=/path/to/save_dir/ 124 | 125 | ``` 126 | 127 | We can optionally resume the training of the released BERT-base model by adding `checkpoint.restore_file=/path/to/model.pt`. Note that in our experiments, we use Adan to train BERT-base from scratch. You can use the following config files to train BERT-base with Adam or Adan: 128 | 129 | | NAME | Optimizer | Config | Download | 130 | | :-------: | :-------: | :----------------------------------------------------: | :------------------------------------------------------: | 131 | | bert-base | Adam | [config](./exp_results/pretrain/full_config-adam.yaml) | [log](./exp_results/pretrain/hydra_train-adam.log)/model | 132 | | bert-adan | Adan | [config](./exp_results/pretrain/full_config-adan.yaml) | [log](./exp_results/pretrain/hydra_train-adan.log)/model | 133 | 134 | The above command assumes training on 8x40GB A100 GPUs. Each GPU uses a batch size of 32 sequences (`dataset.batch_size`), so the effective batch size is `num_gpus * dataset.batch_size * dataset.update_freq` sequences. If you have fewer GPUs or GPUs with less memory, you may need to reduce `dataset.batch_size` and increase `dataset.update_freq` to compensate; for example, 4 GPUs with `dataset.batch_size=32` and `dataset.update_freq=2` match the effective batch size (256 sequences) of the default 8-GPU setup. Alternatively, if you have more GPUs, you can decrease `dataset.update_freq` accordingly to improve training speed. 135 | 136 | 137 | ## Finetuning BERT-base on GLUE tasks 138 | 139 | ### 1) Download the data from the [GLUE website](https://gluebenchmark.com/tasks) using the following commands: 140 | ```bash 141 | wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py 142 | python download_glue_data.py --data_dir glue_data --tasks all 143 | ``` 144 | There are some problems with downloading `MRPC` and `MNLI`, hence we skip the `MRPC` task and download the `MNLI` data from unofficial sources. 145 | 146 | 147 | 148 | ### 2) Preprocess GLUE task data: 149 | 150 | ```bash 151 | ./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name> 152 | ``` 153 | - `glue_task_name` is one of the following: `{ALL, QQP, MNLI, QNLI, RTE, STS-B, SST-2, CoLA}`. Use `ALL` to preprocess all the GLUE tasks; an example invocation is given below.
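For instance, assuming the `glue_data` directory produced by `download_glue_data.py` above, binarizing only the `RTE` task used in the fine-tuning example below would look like this (a sketch following the fairseq RoBERTa GLUE instructions):

```bash
./examples/roberta/preprocess_GLUE_tasks.sh glue_data RTE
```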
154 | 155 | 156 | 157 | ### 3) Fine-tuning on a GLUE task: 158 | 159 | Example fine-tuning cmd for the `RTE` task: 160 | ```bash 161 | TASK=RTE; 162 | 163 | python path/to/fairseq/examples/roberta/config/finetuning/acc_test.py --avg_num 1 \ 164 | --data_path /path/to/fairseq/GLUE/glue_data/$TASK \ 165 | --bin_path /path/to/fairseq/GLUE/$TASK-bin \ 166 | --pre_path /path/to/fairseq/bert-adan/checkpoint_best.pt \ 167 | --finetune_path /path/to/fairseq/bert-fintune/adan/$TASK/ \ 168 | --task rte-adan 169 | ``` 170 | 171 | - `avg_num`: number of repetitions. 172 | 173 | - `data_path`: path to the data of the GLUE task, e.g., CoLA, MNLI, etc. 174 | 175 | - `bin_path`: similar to `data_path`, but the path to the binarized data after preprocessing. 176 | 177 | - `pre_path`: path to the pre-trained model. 178 | 179 | - `finetune_path`: path to save/load the fine-tuned model. 180 | 181 | - `task`: config name; please refer to the [fine-tuning](./config/finetuning) directory for the additional config files for each of the GLUE tasks. 182 | 183 | - These cmd-args and hyperparams were tested on one Nvidia `A100` GPU with `40GB` of memory for each task. Depending on the GPU memory resources available to you, you can increase `--update-freq` and reduce `--batch-size`. 184 | 185 | 186 | 187 | ### 4) Inference on a GLUE task 188 | After training the model in the previous step, we can perform inference with the checkpoints in the `finetune_path` directory using the following command: 189 | 190 | ```bash 191 | TASK=RTE; 192 | 193 | python path/to/fairseq/examples/roberta/config/finetuning/acc_test.py --inference \ 194 | --data_path /path/to/fairseq/GLUE/glue_data/$TASK \ 195 | --bin_path /path/to/fairseq/GLUE/$TASK-bin \ 196 | --pre_path /path/to/fairseq/bert-adan/checkpoint_best.pt \ 197 | --finetune_path /path/to/fairseq/bert-fintune/adan/$TASK/ \ 198 | --task rte-adan 199 | 200 | ``` 201 | 202 | This should give: 203 | 204 | | GLUE-Task | Metric | Result | Config | 205 | | --------- | :--------------------------- | :-------: | :-------------------------------------------: | 206 | | CoLA | Matthews corr. | 64.6 | [config](./config/finetuning/cola-adan.yaml) | 207 | | SST-2 | Accuracy | 93.2 | [config](./config/finetuning/sst_2-adan.yaml) | 208 | | STS-B | Pearson corr. | 89.3 | [config](./config/finetuning/sts_b-adan.yaml) | 209 | | QQP | Accuracy | 91.2 | [config](./config/finetuning/qqp-adan.yaml) | 210 | | MNLI | Matched acc./Mismatched acc. | 85.7/85.6 | [config](./config/finetuning/mnli-adan.yaml) | 211 | | QNLI | Accuracy | 91.3 | [config](./config/finetuning/qnli-adan.yaml) | 212 | | RTE | Accuracy | 73.3 | [config](./config/finetuning/rte-adan.yaml) | 213 | 214 | -------------------------------------------------------------------------------- /NLP/BERT/adan.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Garena Online Private Limited 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
-------------------------------------------------------------------------------- /NLP/BERT/adan.py: --------------------------------------------------------------------------------
1 | # Copyright 2022 Garena Online Private Limited
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import logging
17 | import math
18 | from collections.abc import Collection
19 | from dataclasses import dataclass, field
20 | from typing import Any, List
21 | 
22 | import torch
23 | import torch.distributed as dist
24 | import torch.optim
25 | from fairseq.dataclass import FairseqDataclass
26 | from fairseq.optim import FairseqOptimizer, register_optimizer
27 | from omegaconf import II, OmegaConf
28 | 
29 | 
30 | logger = logging.getLogger(__name__)
31 | 
32 | 
33 | @dataclass
34 | class FairseqAdanConfig(FairseqDataclass):
35 |     adan_betas: Any = field(
36 |         default=(0.98, 0.92, 0.99), metadata={"help": "betas for Adan optimizer"}
37 |     )
38 |     adan_eps: float = field(
39 |         default=1e-8, metadata={"help": "epsilon for Adan optimizer"}
40 |     )
41 |     weight_decay: float = field(default=0.0, metadata={"help": "weight decay"})
42 | 
43 |     no_prox: bool = field(
44 |         default=False, metadata={"help": "whether to skip the proximal operator for weight decay"}
45 |     )
46 |     fp16_adan_stats: bool = field(
47 |         default=False, metadata={"help": "use FP16 stats (with automatic scaling)"}
48 |     )
49 |     # TODO common vars below in parent
50 |     tpu: bool = II("common.tpu")
51 |     lr: List[float] = II("optimization.lr")
52 | 
53 | 
54 | @register_optimizer("adan", dataclass=FairseqAdanConfig)
55 | class FairseqAdan(FairseqOptimizer):
56 |     """
57 |     Adan optimizer for fairseq.
58 |     """
59 | 
60 |     def __init__(self, cfg: FairseqAdanConfig, params):
61 |         super().__init__(cfg)
62 |         fused_adan_cls = None  # no fused CUDA Adan kernel is wired into this fairseq port
63 |         use_fused_adan = (
64 |             fused_adan_cls is not None
65 |             and torch.cuda.is_available()
66 |         )
67 |         if getattr(cfg, "tpu", False):
68 |             if self.cfg.fp16_adan_stats:
69 |                 raise NotImplementedError("--fp16-adan-stats is only supported on GPU")
70 |             # on TPUs we use the pure-PyTorch Adan defined here, since it
71 |             # automatically casts gradients to FP32
72 |             self._optimizer = Adan(params, **self.optimizer_config)
73 |         elif use_fused_adan:
74 |             raise NotImplementedError("fused Adan is not available in this port")
75 |         else:
76 |             if self.cfg.fp16_adan_stats:
77 |                 raise NotImplementedError(
78 |                     "--fp16-adan-stats is only supported with FusedAdanV1"
79 |                 )
80 |             self._optimizer = Adan(params, **self.optimizer_config)
81 | 
82 |     @property
83 |     def optimizer_config(self):
84 |         """
85 |         Return a kwarg dictionary that will be used to override optimizer
86 |         args stored in checkpoints. This allows us to load a checkpoint and
87 |         resume training using a different set of optimizer args, e.g., with a
88 |         different learning rate.
89 |         """
90 |         return {
91 |             "lr": self.cfg.lr[0]
92 |             if isinstance(self.cfg.lr, Collection)
93 |             else self.cfg.lr,
94 |             "betas": eval(self.cfg.adan_betas)
95 |             if isinstance(self.cfg.adan_betas, str)
96 |             else OmegaConf.to_container(self.cfg.adan_betas),
97 |             "eps": self.cfg.adan_eps,
98 |             "weight_decay": self.cfg.weight_decay,
99 |         }
100 | 
101 |     def average_params(self):
102 |         """Reduce Params is only used during BMUF distributed training."""
103 |         state_dict = self.optimizer.state_dict()
104 |         total_gpus = float(dist.get_world_size())
105 | 
106 |         for _, value in state_dict["state"].items():
107 |             value["exp_avg"] /= total_gpus
108 |             value["exp_avg_sq"] /= total_gpus
109 |             value["exp_avg_diff"] /= total_gpus
110 |             dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM)
111 |             dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM)
112 |             dist.all_reduce(value["exp_avg_diff"], op=dist.ReduceOp.SUM)
113 | 
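# Illustrative cross-reference (not part of the original file): with the
# optimizer registered as "adan" above, a fairseq config selects it via,
# e.g. (cf. config/pretraining/bert-adan.yaml in this repo):
#
#   optimizer:
#     _name: adan
#     weight_decay: 0.02
#     adan_betas: (0.98,0.92,0.99)
#     adan_eps: 1e-08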
114 | 
115 | class Adan(torch.optim.Optimizer):
116 |     r"""Implements the Adan algorithm.
117 | 
118 |     Args:
119 |         params (iterable): iterable of parameters to optimize or dicts defining
120 |             parameter groups
121 |         lr (float, optional): learning rate (default: 1e-3)
122 |         betas (Tuple[float, float, float], optional): coefficients used for computing
123 |             running averages of the gradient, its difference, and its square (default: (0.98, 0.92, 0.99))
124 |         eps (float, optional): term added to the denominator to improve
125 |             numerical stability (default: 1e-8)
126 |         weight_decay (float, optional): decoupled weight decay (default: 0)
127 |     """
128 |     def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8,
129 |                  weight_decay=0.0, no_prox=False):
130 |         defaults = dict(lr=lr, betas=betas, eps=eps,
131 |                         weight_decay=weight_decay, no_prox=no_prox)
132 |         super(Adan, self).__init__(params, defaults)
133 | 
134 |     def __setstate__(self, state):
135 |         super(Adan, self).__setstate__(state)
136 |         for group in self.param_groups:
137 |             group.setdefault('no_prox', False)
138 | 
139 |     @property
140 |     def supports_memory_efficient_fp16(self):
141 |         return True
142 | 
143 |     @property
144 |     def supports_flat_params(self):
145 |         return True
146 | 
147 |     def step(self, closure=None):
148 |         """Performs a single optimization step.
149 |         Arguments:
150 |             closure (callable, optional): A closure that reevaluates the model
151 |                 and returns the loss.
152 |         """
153 |         loss = None
154 |         if closure is not None:
155 |             loss = closure()
156 | 
157 | 
158 |         for group in self.param_groups:
159 |             beta1, beta2, beta3 = group['betas']
160 |             # assume the same step count across the group for now, to simplify things;
161 |             # a per-parameter step could easily be supported by making it a tensor, or by passing a list into the kernel
162 |             if 'step' in group:
163 |                 group['step'] += 1
164 |             else:
165 |                 group['step'] = 1
166 | 
167 | 
168 |             bias_correction1 = 1.0 - beta1 ** group['step']
169 | 
170 |             bias_correction2 = 1.0 - beta2 ** group['step']
171 | 
172 |             bias_correction3 = 1.0 - beta3 ** group['step']
173 | 
174 |             for p in group['params']:
175 |                 if p.grad is None:
176 |                     continue
177 | 
178 |                 p_data_fp32 = p.data
179 |                 if p.data.dtype in {torch.float16, torch.bfloat16}:
180 |                     p_data_fp32 = p_data_fp32.float()
181 | 
182 |                 state = self.state[p]
183 |                 if len(state) == 0:
184 |                     state['exp_avg'] = torch.zeros_like(p_data_fp32)
185 |                     state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
186 |                     state['exp_avg_diff'] = torch.zeros_like(p_data_fp32)
187 |                 else:
188 |                     state["exp_avg"] = state["exp_avg"].to(p_data_fp32)
189 |                     state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32)
190 |                     state['exp_avg_diff'] = state['exp_avg_diff'].to(p_data_fp32)
191 | 
192 | 
193 |                 grad = p.grad.data
194 |                 if grad.dtype in {torch.float16, torch.bfloat16}:
195 |                     grad = grad.float()
196 |                 if grad.is_sparse:
197 |                     raise RuntimeError(
198 |                         "Adan does not support sparse gradients, please consider SparseAdam instead"
199 |                     )
200 | 
201 |                 if 'pre_grad' not in state or group['step'] == 1:
202 |                     state['pre_grad'] = grad  # at the first step the gradient difference below is zero
203 | 
204 | 
205 |                 copy_grad = grad.clone()  # saved as pre_grad after the update
206 | 
207 | 
208 |                 exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff']
209 |                 diff = grad - state['pre_grad']
210 | 
211 | 
212 |                 update = grad + beta2 * diff  # g_t + beta2 * (g_t - g_{t-1})
213 |                 exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)  # m_t
214 |                 exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2)  # diff_t
215 |                 exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3)  # v_t
216 | 
217 |                 denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps'])
218 |                 update = 
((exp_avg/bias_correction1+beta2*exp_avg_diff/bias_correction2) ).div_(denom) 219 | 220 | if group['no_prox']: 221 | p_data_fp32.mul_(1 - group['lr'] * group['weight_decay']) 222 | p_data_fp32.add_(update, alpha=-group['lr']) 223 | else: 224 | p_data_fp32.add_(update, alpha=-group['lr']) 225 | p_data_fp32.div_(1 + group['lr'] * group['weight_decay']) 226 | 227 | state['pre_grad'] = copy_grad 228 | 229 | if p.data.dtype in {torch.float16, torch.bfloat16}: 230 | p.data.copy_(p_data_fp32) 231 | return loss 232 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/acc_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from fairseq.models.roberta import RobertaModel 3 | import argparse 4 | from scipy.stats import pearsonr 5 | from sklearn.metrics import matthews_corrcoef 6 | 7 | 8 | def get_acc(model_path, data_path, bin_path, task='rte'): 9 | acc_list = [] 10 | gold, pred = [], [] 11 | roberta = RobertaModel.from_pretrained( 12 | model_path, 13 | checkpoint_file='checkpoint_best.pt', 14 | data_name_or_path=bin_path#'RTE-bin' 15 | ) 16 | 17 | label_fn = lambda label: roberta.task.label_dictionary.string( 18 | [label + roberta.task.label_dictionary.nspecial] 19 | ) 20 | ncorrect, nsamples = 0, 0 21 | roberta.cuda() 22 | roberta.eval() 23 | if 'mnli' not in task: 24 | dev_files = ['dev.tsv'] 25 | else: dev_files = ['dev_mismatched.tsv', 'dev_matched.tsv'] 26 | for dev_file in dev_files: 27 | with open(os.path.join(data_path, dev_file)) as fin: 28 | fin.readline() 29 | for index, line in enumerate(fin): 30 | tokens = line.strip().split('\t') 31 | if 'rte' in task or 'qnli' in task: 32 | sent1, sent2, target = tokens[1], tokens[2], tokens[3] 33 | tokens = roberta.encode(sent1, sent2) 34 | elif 'qqp' in task: 35 | sent1, sent2, target = tokens[3], tokens[4], tokens[5] 36 | tokens = roberta.encode(sent1, sent2) 37 | elif 'mnli' in task: 38 | sent1, sent2, target = tokens[8], tokens[9], tokens[11] 39 | tokens = roberta.encode(sent1, sent2) 40 | elif 'mrpc' in task: 41 | sent1, sent2, target = tokens[3], tokens[4], tokens[0] 42 | tokens = roberta.encode(sent1, sent2) 43 | elif 'sts_b' in task: 44 | sent1, sent2, target = tokens[7], tokens[8], float(tokens[9]) 45 | tokens = roberta.encode(sent1, sent2) 46 | elif 'sst_2' in task: 47 | sent, target = tokens[0], tokens[1] 48 | tokens = roberta.encode(sent) 49 | 50 | elif 'cola' in task: 51 | sent, target = tokens[3], tokens[1] 52 | tokens = roberta.encode(sent) 53 | if 'sts_b' not in task: 54 | prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() 55 | prediction_label = label_fn(prediction) 56 | ncorrect += int(prediction_label == target) 57 | 58 | nsamples += 1 59 | if 'cola' in task: 60 | target = int(target) 61 | prediction_label = int(prediction_label) 62 | pred.append(prediction_label) 63 | gold.append(target) 64 | 65 | else: 66 | features = roberta.extract_features(tokens) 67 | predictions = 5.0 * roberta.model.classification_heads['sentence_classification_head'](features) 68 | gold.append(target) 69 | pred.append(predictions.item()) 70 | if 'cola' in task: 71 | out = matthews_corrcoef(gold, pred) 72 | elif 'sts_b' in task: 73 | out = pearsonr(gold, pred)[0] 74 | else: out = float(ncorrect)/float(nsamples) 75 | 76 | acc_list.append(out) 77 | return acc_list 78 | 79 | 80 | parser = argparse.ArgumentParser(description='GLUE test for acc') 81 | parser.add_argument('--avg_num', type=int, default=1, 82 | 
help='number of try') 83 | parser.add_argument('--pre_path', type=str, default='./baseline/checkpoint_20_1000000.pt', 84 | help='path to pre-trained model') 85 | parser.add_argument('--data_path', type=str, default='./GLUE/glue_data/STS-B', 86 | help='path to data') 87 | parser.add_argument('--bin_path', type=str, default='./GLUE/STS-B-bin', 88 | help='path to -bin data') 89 | parser.add_argument('--finetune_path', type=str, default='./bert-fintune/adam/STS-B/', 90 | help='path to finetuned model') 91 | parser.add_argument('--task', type=str, default='sts_b', 92 | help='task of finetune') 93 | parser.add_argument('--inference', action='store_true', default=False, 94 | help='inference only') 95 | args = parser.parse_args() 96 | 97 | 98 | acc_avg = 0.0 99 | acc_avg2 = 0.0 100 | for _ in range(args.avg_num): 101 | if not args.inference: 102 | val = os.system(' fairseq-hydra-train --config-dir ./fairseq/examples/roberta/config/finetuning \ 103 | --config-name {} \ 104 | task.data={} checkpoint.restore_file={} \ 105 | checkpoint.save_dir={}'.format(args.task, args.bin_path, args.pre_path, args.finetune_path)) 106 | all_acc = get_acc(args.finetune_path, args.data_path, args.bin_path, args.task) 107 | acc_avg+=all_acc[0] 108 | if len(all_acc)>1: 109 | acc_avg2+=all_acc[1] 110 | 111 | if acc_avg2>0: 112 | print('Mismatched Accuracy1:{}, Matched Accuracy1:{}'.format(float(acc_avg)/float(args.avg_num), float(acc_avg2)/float(args.avg_num))) 113 | else: 114 | print('AVG Accuracy1:{}'.format(float(acc_avg)/float(args.avg_num))) 115 | 116 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/cola-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.01 43 | adan_betas: (0.98,0.99,0.99) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 320 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [4e-05] 53 | max_update: 5336 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/cola.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 320 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [1e-05] 53 | max_update: 5336 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/mnli-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 3 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.01 43 | adan_betas: (0.98,0.92,0.999) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 7432 49 | 50 | optimization: 51 | clip_norm: 1.0 52 | lr: [2.0e-05] 53 | max_update: 123873 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/mnli.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 3 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 32 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 7432 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [1e-05] 53 | max_update: 123873 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/qnli-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.001 43 | adan_betas: (0.98,0.99,0.99) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: cosine 48 | warmup_updates: 1986 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [2e-05] 53 | max_update: 33112 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/qnli.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 32 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 1986 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [1e-05] 53 | max_update: 33112 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/qqp-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.001 43 | adan_betas: (0.98,0.99,0.99) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 28318 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [4e-05] 53 | max_update: 113272 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/qqp.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 32 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 28318 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [1e-05] 53 | max_update: 113272 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/rte-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.01 43 | adan_betas: (0.98,0.99,0.99) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 122 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [2e-05] 53 | max_update: 2036 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/rte.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 122 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [2e-05] 53 | max_update: 2036 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/sst_2-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 32 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.01 43 | adan_betas: (0.98,0.92,0.99) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 1256 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [4e-05] 53 | max_update: 20935 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/sst_2.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 32 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 1256 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [1e-05] 53 | max_update: 20935 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/sts_b-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 1 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | no_epoch_checkpoints: true 25 | 26 | distributed_training: 27 | find_unused_parameters: true 28 | distributed_world_size: 1 29 | 30 | criterion: 31 | _name: sentence_prediction 32 | regression_target: true 33 | 34 | dataset: 35 | batch_size: 16 36 | required_batch_size_multiple: 1 37 | max_tokens: 4400 38 | 39 | optimizer: 40 | _name: adan 41 | weight_decay: 0.01 42 | adan_betas: (0.98,0.99,0.99) 43 | adan_eps: 1e-8 44 | 45 | lr_scheduler: 46 | _name: cosine 47 | warmup_updates: 214 48 | 49 | optimization: 50 | clip_norm: 0.5 51 | lr: [4e-05] 52 | max_update: 3598 53 | max_epoch: 10 54 | 55 | model: 56 | _name: roberta 57 | dropout: 0.1 58 | attention_dropout: 0.1 59 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/sts_b.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 1 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | no_epoch_checkpoints: true 25 | 26 | distributed_training: 27 | find_unused_parameters: true 28 | distributed_world_size: 1 29 | 30 | criterion: 31 | _name: sentence_prediction 32 | regression_target: true 33 | 34 | dataset: 35 | batch_size: 16 36 | required_batch_size_multiple: 1 37 | max_tokens: 4400 38 | 39 | optimizer: 40 | _name: adam 41 | weight_decay: 0.1 42 | adam_betas: (0.9,0.98) 43 | adam_eps: 1e-06 44 | 45 | lr_scheduler: 46 | _name: polynomial_decay 47 | warmup_updates: 214 48 | 49 | optimization: 50 | clip_norm: 0.0 51 | lr: [2e-05] 52 | max_update: 3598 53 | max_epoch: 10 54 | 55 | model: 56 | _name: roberta 57 | dropout: 0.1 58 | attention_dropout: 0.1 59 | -------------------------------------------------------------------------------- /NLP/BERT/config/pretraining/base.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: true 4 | log_format: json 5 | log_interval: 200 6 | 7 | checkpoint: 8 | no_epoch_checkpoints: true 9 | 10 | task: 11 | _name: masked_lm 12 | data: ??? 13 | sample_break_mode: complete 14 | tokens_per_sample: 512 15 | 16 | criterion: masked_lm 17 | 18 | dataset: 19 | batch_size: 16 20 | ignore_unused_valid_subsets: true 21 | 22 | optimizer: 23 | _name: adam 24 | weight_decay: 0.01 25 | adam_betas: (0.9,0.98) 26 | adam_eps: 1e-06 27 | 28 | lr_scheduler: 29 | _name: polynomial_decay 30 | warmup_updates: 10000 31 | 32 | optimization: 33 | clip_norm: 0 34 | lr: [0.0005] 35 | max_update: 125000 36 | update_freq: [16] 37 | 38 | model: 39 | _name: roberta 40 | max_positions: 512 41 | dropout: 0.1 42 | attention_dropout: 0.1 43 | -------------------------------------------------------------------------------- /NLP/BERT/config/pretraining/bert-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: true 4 | log_format: json 5 | log_interval: 200 6 | 7 | checkpoint: 8 | no_epoch_checkpoints: true 9 | save_interval: 5 10 | save_interval_updates: 50000 11 | 12 | task: 13 | _name: masked_lm 14 | data: ??? 15 | sample_break_mode: complete 16 | tokens_per_sample: 512 17 | 18 | criterion: masked_lm 19 | 20 | 21 | 22 | optimizer: 23 | _name: adan 24 | weight_decay: 0.02 25 | adan_betas: (0.98,0.92,0.99) 26 | adan_eps: 1e-08 27 | 28 | lr_scheduler: 29 | _name: polynomial_decay 30 | warmup_updates: 10000 31 | 32 | optimization: 33 | clip_norm: 5.0 34 | lr: [0.001] 35 | max_update: 1000000 36 | update_freq: [1] 37 | 38 | model: 39 | _name: roberta 40 | max_positions: 512 41 | dropout: 0.1 42 | attention_dropout: 0.1 43 | 44 | distributed_training: 45 | ddp_backend: no_c10d 46 | 47 | dataset: 48 | skip_invalid_size_inputs_valid_test: true 49 | validate_interval: 5 50 | validate_interval_updates: 50000 51 | batch_size: 32 52 | ignore_unused_valid_subsets: true 53 | -------------------------------------------------------------------------------- /NLP/BERT/config/pretraining/bert-base.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: true 4 | log_format: json 5 | log_interval: 200 6 | 7 | checkpoint: 8 | save_dir: 'bert/baseline/' 9 | no_epoch_checkpoints: true 10 | save_interval: 5 11 | save_interval_updates: 50000 12 | 13 | task: 14 | _name: masked_lm 15 | data: ??? 
16 |   sample_break_mode: complete
17 |   tokens_per_sample: 512
18 | 
19 | criterion: masked_lm
20 | 
21 | 
22 | 
23 | optimizer:
24 |   _name: adam
25 |   weight_decay: 0.01
26 |   adam_betas: (0.9,0.98)
27 |   adam_eps: 1e-06
28 | 
29 | lr_scheduler:
30 |   _name: polynomial_decay
31 |   warmup_updates: 10000
32 | 
33 | optimization:
34 |   clip_norm: 0
35 |   lr: [0.0001]
36 |   max_update: 1000000
37 |   update_freq: [1]
38 | 
39 | model:
40 |   _name: roberta
41 |   max_positions: 512
42 |   dropout: 0.1
43 |   attention_dropout: 0.1
44 | 
45 | distributed_training:
46 |   ddp_backend: no_c10d
47 | 
48 | dataset:
49 |   skip_invalid_size_inputs_valid_test: true
50 |   validate_interval: 5
51 |   validate_interval_updates: 50000
52 |   batch_size: 32
53 |   ignore_unused_valid_subsets: true
54 | 
55 | 
-------------------------------------------------------------------------------- /NLP/Transformer-XL/README.md: --------------------------------------------------------------------------------
1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
2 | 
3 | We first provide the instructions to modify the official training files from [Transformer-XL](https://github.com/kimiyoung/transformer-xl) to support Adan. **For data preparation, please follow that repo.**
4 | 
5 | ## Environment
6 | 
7 | As recommended by the official [Transformer-XL](https://github.com/kimiyoung/transformer-xl) repo, our experiments for this task are based on the following package version:
8 | 
9 | ```python
10 | torch.__version__ = '1.1.0'
11 | ```
12 | 
13 | ## Usage of Adan for Transformer-XL
14 | 
15 | ### Two steps to use Adan
16 | 
17 | **Step 1.** Add the following parameters to the file `train.py`.
18 | 
19 | ```python
20 | parser.add_argument('--optim', default='adam', type=str, choices=['adam', 'sgd', 'adagrad', 'adan'], help='optimizer to use.')
21 | parser.add_argument('--wd', type=float, default=0.02, help='weight decay (default: 0.02)')
22 | parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='Optimizer Betas (default: None, use opt default)')
23 | ```
24 | 
25 | - `optim`: the choice of optimizers. We add Adan to the choices.
26 | 
27 | - `wd`: decoupled weight decay.
28 | 
29 | - `opt-betas`: optimizer betas for Adan.
30 | 
31 | **Step 2.** Replace the original optimizer creation with the following; the `elif` extends the existing optimizer-selection chain in `train.py` (see the sketch after this code block):
32 | 
33 | ```python
34 | from adan import Adan
35 | 
36 | elif args.optim.lower() == 'adan':
37 |     if args.sample_softmax > 0:
38 |         dense_params, sparse_params = [], []
39 |         for param in model.parameters():
40 |             if param.size() == model.word_emb.weight.size():
41 |                 sparse_params.append(param)
42 |             else:
43 |                 dense_params.append(param)
44 |         optimizer_sparse = Adan(sparse_params, betas=args.opt_betas, lr=args.lr, weight_decay=args.wd)
45 |         optimizer = Adan(dense_params, lr=args.lr, betas=args.opt_betas, weight_decay=args.wd)
46 |     else:
47 |         optimizer = Adan(model.parameters(), lr=args.lr, betas=args.opt_betas, weight_decay=args.wd)
48 | 
49 | ```
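For context, here is how the added branch slots into `train.py`'s optimizer-selection chain. This is a minimal sketch; the `adam` branch shown is illustrative and the exact stock code may differ:

```python
import torch.optim as optim
from adan import Adan

if args.optim.lower() == 'adam':
    # stock branch (illustrative): sparse embedding params get SparseAdam
    if args.sample_softmax > 0:
        optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr)
        optimizer = optim.Adam(dense_params, lr=args.lr)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
elif args.optim.lower() == 'adan':
    # branch added in Step 2 above; dense_params / sparse_params
    # are built exactly as in the Step 2 snippet
    ...
```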
50 | 
51 | ## Data Preparation
52 | 
53 | See `bash getdata.sh` in the [Transformer-XL](https://github.com/kimiyoung/transformer-xl) repo.
54 | 
55 | ## Training and Evaluation
56 | 
57 | - #### Training
58 | 
59 |   `bash run_wt103_adan.sh train --work_dir PATH_TO_WORK_DIR`
60 | 
61 | - #### Evaluation
62 | 
63 |   `bash run_wt103_adan.sh eval --work_dir PATH_TO_WORK_DIR`
64 | 
65 | - #### Tips for Experiments
66 | 
67 |   - For Adan, we set `args.wd = 0.02` for all steps, which is consistent with the other experiments.
68 |   - For the experiment using `steps = 50k`, we choose a slightly larger `LR`.
69 | 
70 | ## Results and Logs
71 | 
72 | With different settings for `lr` and `max_step` in `run_wt103_adan.sh`, we obtain the following results:
73 | 
74 | |                     |   LR   | Steps | Test PPL |                 Download                 |
75 | | ------------------- | :----: | :---: | :------: | :--------------------------------------: |
76 | | Baseline (Adam)     | 2.5e-4 | 200k  |   24.2   | [log&config](./exp_results/log-adam.txt) |
77 | | Transformer-XL-base | 1.5e-3 |  50k  |   26.2   | [log&config](./exp_results/log-50k.txt)  |
78 | | Transformer-XL-base |  1e-3  | 100k  |   24.2   | [log&config](./exp_results/log-100k.txt) |
79 | | Transformer-XL-base |  1e-3  | 200k  |   23.5   | [log&config](./exp_results/log-200k.txt) |
-------------------------------------------------------------------------------- /NLP/Transformer-XL/eval.py: --------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import argparse
3 | import time
4 | import math
5 | import os, sys
6 | 
7 | import torch
8 | 
9 | from data_utils import get_lm_corpus
10 | from mem_transformer import MemTransformerLM
11 | from utils.exp_utils import get_logger
12 | 
13 | parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
14 | parser.add_argument('--data', type=str, default='../data/wikitext-103',
15 |                     help='location of the data corpus')
16 | parser.add_argument('--dataset', type=str, default='wt103',
17 |                     choices=['wt103', 'lm1b', 'enwik8', 'text8'],
18 |                     help='dataset name')
19 | parser.add_argument('--split', type=str, default='all',
20 |                     choices=['all', 'valid', 'test'],
21 |                     help='which split to evaluate')
22 | parser.add_argument('--batch_size', type=int, default=10,
23 |                     help='batch size')
24 | parser.add_argument('--tgt_len', type=int, default=5,
25 |                     help='number of tokens to predict')
26 | parser.add_argument('--ext_len', type=int, default=0,
27 |                     help='length of the extended context')
28 | parser.add_argument('--mem_len', type=int, default=0,
29 |                     help='length of the retained previous heads')
30 | parser.add_argument('--clamp_len', type=int, default=-1,
31 |                     help='max positional embedding index')
32 | parser.add_argument('--cuda', action='store_true',
33 |                     help='use CUDA')
34 | parser.add_argument('--work_dir', type=str, required=True,
35 |                     help='path to the work_dir')
36 | parser.add_argument('--no_log', action='store_true',
37 |                     help='do not log the eval result')
38 | parser.add_argument('--same_length', action='store_true',
39 |                     help='set same length attention with masking')
40 | args = parser.parse_args()
41 | assert args.ext_len >= 0, 'extended context length must be non-negative'
42 | 
43 | device = torch.device("cuda" if args.cuda else "cpu")
44 | 
45 | # Get logger
46 | logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
47 |                      log_=not args.no_log)
48 | 
49 | # Load dataset
50 | corpus = get_lm_corpus(args.data, args.dataset)
51 | ntokens = len(corpus.vocab)
52 | 
53 | va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
54 |                               device=device, ext_len=args.ext_len)
55 | te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
56 |                               device=device, ext_len=args.ext_len)
57 | 
58 | # Load the best saved model.
59 | with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f: 60 | model = torch.load(f) 61 | model.backward_compatible() 62 | model = model.to(device) 63 | 64 | logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( 65 | args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) 66 | 67 | model.reset_length(args.tgt_len, args.ext_len, args.mem_len) 68 | if args.clamp_len > 0: 69 | model.clamp_len = args.clamp_len 70 | if args.same_length: 71 | model.same_length = True 72 | 73 | ############################################################################### 74 | # Evaluation code 75 | ############################################################################### 76 | def evaluate(eval_iter): 77 | # Turn on evaluation mode which disables dropout. 78 | model.eval() 79 | total_len, total_loss = 0, 0. 80 | start_time = time.time() 81 | with torch.no_grad(): 82 | mems = tuple() 83 | for idx, (data, target, seq_len) in enumerate(eval_iter): 84 | ret = model(data, target, *mems) 85 | loss, mems = ret[0], ret[1:] 86 | loss = loss.mean() 87 | total_loss += seq_len * loss.item() 88 | total_len += seq_len 89 | total_time = time.time() - start_time 90 | logging('Time : {:.2f}s, {:.2f}ms/segment'.format( 91 | total_time, 1000 * total_time / (idx+1))) 92 | return total_loss / total_len 93 | 94 | # Run on test data. 95 | if args.split == 'all': 96 | test_loss = evaluate(te_iter) 97 | valid_loss = evaluate(va_iter) 98 | elif args.split == 'valid': 99 | valid_loss = evaluate(va_iter) 100 | test_loss = None 101 | elif args.split == 'test': 102 | test_loss = evaluate(te_iter) 103 | valid_loss = None 104 | 105 | def format_log(loss, split): 106 | if args.dataset in ['enwik8', 'text8']: 107 | log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format( 108 | split, loss, loss / math.log(2)) 109 | else: 110 | log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( 111 | split, loss, math.exp(loss)) 112 | return log_str 113 | 114 | log_str = '' 115 | if valid_loss is not None: 116 | log_str += format_log(valid_loss, 'valid') 117 | if test_loss is not None: 118 | log_str += format_log(test_loss, 'test') 119 | 120 | logging('=' * 100) 121 | logging(log_str) 122 | logging('=' * 100) 123 | -------------------------------------------------------------------------------- /NLP/Transformer-XL/run_wt103_adan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $1 == 'train' ]]; then 4 | echo 'Run training...' 5 | python train.py \ 6 | --cuda \ 7 | --data /root/autodl-tmp/data/wikitext-103/ \ 8 | --dataset wt103 \ 9 | --adaptive \ 10 | --n_layer 16 \ 11 | --d_model 410 \ 12 | --n_head 10 \ 13 | --d_head 41 \ 14 | --d_inner 2100 \ 15 | --dropout 0.1 \ 16 | --dropatt 0.0 \ 17 | --optim adan \ 18 | --wd 0.02 \ 19 | --lr 0.0015 \ 20 | --opt-betas 0.9 0.9 0.999 \ 21 | --clip 0.25 \ 22 | --lr_min 1e-6 \ 23 | --warmup_step 5000 \ 24 | --max_step 200000 \ 25 | --tgt_len 150 \ 26 | --mem_len 150 \ 27 | --eval_tgt_len 150 \ 28 | --batch_size 60 \ 29 | --multi_gpu \ 30 | --gpu0_bsz 4 \ 31 | ${@:2} 32 | elif [[ $1 == 'eval' ]]; then 33 | echo 'Run evaluation...' 
34 |     python eval.py \
35 |         --cuda \
36 |         --data /root/autodl-tmp/data/wikitext-103/ \
37 |         --dataset wt103 \
38 |         --tgt_len 64 \
39 |         --mem_len 640 \
40 |         --clamp_len 400 \
41 |         --same_length \
42 |         --split test \
43 |         ${@:2}
44 | else
45 |     echo 'unknown argument 1'
46 | fi
47 | 
-------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/adaptive_softmax.py: --------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | import numpy as np
4 | 
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | 
9 | class AdaptiveLogSoftmax(nn.Module):
10 |     def __init__(self, in_features, n_classes, cutoffs, keep_order=False):
11 |         super(AdaptiveLogSoftmax, self).__init__()
12 | 
13 |         cutoffs = list(cutoffs)
14 | 
15 |         if (cutoffs != sorted(cutoffs)) \
16 |                 or (min(cutoffs) <= 0) \
17 |                 or (max(cutoffs) >= (n_classes - 1)) \
18 |                 or (len(set(cutoffs)) != len(cutoffs)) \
19 |                 or any([int(c) != c for c in cutoffs]):
20 | 
21 |             raise ValueError("cutoffs should be a sequence of unique, positive "
22 |                              "integers sorted in an increasing order, where "
23 |                              "each value is between 1 and n_classes-1")
24 | 
25 |         self.in_features = in_features
26 |         self.n_classes = n_classes
27 |         self.cutoffs = cutoffs + [n_classes]
28 | 
29 |         self.shortlist_size = self.cutoffs[0]
30 |         self.n_clusters = len(self.cutoffs) - 1
31 |         self.head_size = self.shortlist_size + self.n_clusters
32 | 
33 |         self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.in_features))
34 |         self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
35 | 
36 |         self.keep_order = keep_order
37 | 
38 | 
39 |     def forward(self, hidden, target, weight, bias, keep_order=False):
40 |         if hidden.size(0) != target.size(0):
41 |             raise RuntimeError('Input and target should have the same size '
42 |                                'in the batch dimension.')
43 | 
44 |         head_weight = torch.cat(
45 |             [weight[:self.shortlist_size], self.cluster_weight], dim=0)
46 |         head_bias = torch.cat(
47 |             [bias[:self.shortlist_size], self.cluster_bias], dim=0)
48 | 
49 |         head_logit = F.linear(hidden, head_weight, bias=head_bias)
50 |         head_logprob = F.log_softmax(head_logit, dim=1)
51 | 
52 |         nll = torch.zeros_like(target,
53 |                                dtype=hidden.dtype, device=hidden.device)
54 | 
55 |         offset = 0
56 |         cutoff_values = [0] + self.cutoffs
57 |         for i in range(len(cutoff_values) - 1):
58 |             l_idx, h_idx = cutoff_values[i], cutoff_values[i + 1]
59 | 
60 |             mask_i = (target >= l_idx) & (target < h_idx)
61 |             indices_i = mask_i.nonzero().squeeze()
62 | 
63 |             if indices_i.numel() == 0:
64 |                 continue
65 | 
66 |             target_i = target.index_select(0, indices_i) - l_idx
67 |             head_logprob_i = head_logprob.index_select(0, indices_i)
68 | 
69 |             if i == 0:
70 |                 logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1)
71 |             else:
72 |                 weight_i = weight[l_idx:h_idx]
73 |                 bias_i = bias[l_idx:h_idx]
74 | 
75 |                 hidden_i = hidden.index_select(0, indices_i)
76 | 
77 |                 tail_logit_i = F.linear(hidden_i, weight_i, bias=bias_i)
78 |                 tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
79 | 
80 |                 logprob_i = head_logprob_i[:, -i] \
81 |                     + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1)
82 | 
83 |             if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
84 |                 nll.index_copy_(0, indices_i, -logprob_i)
85 |             else:
86 |                 nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
87 | 
88 |             offset += logprob_i.size(0)
89 | 
90 |         return nll
91 | 
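A minimal usage sketch for `AdaptiveLogSoftmax` above (hypothetical sizes, not part of the repo): the module owns only the cluster head, so the output embedding `weight`/`bias` are passed in at call time.

```python
import torch
import torch.nn as nn

# assumes the AdaptiveLogSoftmax class above is in scope;
# toy vocabulary of 10 tokens, cutoffs=[5] puts tokens 0-4 in the
# shortlist and tokens 5-9 in one tail cluster
crit = AdaptiveLogSoftmax(in_features=8, n_classes=10, cutoffs=[5])
weight = nn.Parameter(torch.randn(10, 8))   # tied output embedding matrix
bias = nn.Parameter(torch.zeros(10))

hidden = torch.randn(4, 8)                  # [batch, in_features]
target = torch.tensor([0, 3, 7, 9])         # two shortlist and two tail tokens
nll = crit(hidden, target, weight, bias)    # per-token negative log-likelihood
print(nll.shape)                            # torch.Size([4])
```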
-------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/data_parallel.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.nn.parallel import DataParallel 3 | import torch 4 | from torch.nn.parallel._functions import Scatter 5 | from torch.nn.parallel.parallel_apply import parallel_apply 6 | 7 | def scatter(inputs, target_gpus, chunk_sizes, dim=0): 8 | r""" 9 | Slices tensors into approximately equal chunks and 10 | distributes them across given GPUs. Duplicates 11 | references to objects that are not tensors. 12 | """ 13 | def scatter_map(obj): 14 | if isinstance(obj, torch.Tensor): 15 | try: 16 | return Scatter.apply(target_gpus, chunk_sizes, dim, obj) 17 | except: 18 | print('obj', obj.size()) 19 | print('dim', dim) 20 | print('chunk_sizes', chunk_sizes) 21 | quit() 22 | if isinstance(obj, tuple) and len(obj) > 0: 23 | return list(zip(*map(scatter_map, obj))) 24 | if isinstance(obj, list) and len(obj) > 0: 25 | return list(map(list, zip(*map(scatter_map, obj)))) 26 | if isinstance(obj, dict) and len(obj) > 0: 27 | return list(map(type(obj), zip(*map(scatter_map, obj.items())))) 28 | return [obj for targets in target_gpus] 29 | 30 | # After scatter_map is called, a scatter_map cell will exist. This cell 31 | # has a reference to the actual function scatter_map, which has references 32 | # to a closure that has a reference to the scatter_map cell (because the 33 | # fn is recursive). To avoid this reference cycle, we set the function to 34 | # None, clearing the cell 35 | try: 36 | return scatter_map(inputs) 37 | finally: 38 | scatter_map = None 39 | 40 | def scatter_kwargs(inputs, kwargs, target_gpus, chunk_sizes, dim=0): 41 | r"""Scatter with support for kwargs dictionary""" 42 | inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else [] 43 | kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else [] 44 | if len(inputs) < len(kwargs): 45 | inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) 46 | elif len(kwargs) < len(inputs): 47 | kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) 48 | inputs = tuple(inputs) 49 | kwargs = tuple(kwargs) 50 | return inputs, kwargs 51 | 52 | class BalancedDataParallel(DataParallel): 53 | def __init__(self, gpu0_bsz, *args, **kwargs): 54 | self.gpu0_bsz = gpu0_bsz 55 | super().__init__(*args, **kwargs) 56 | 57 | def forward(self, *inputs, **kwargs): 58 | if not self.device_ids: 59 | return self.module(*inputs, **kwargs) 60 | if self.gpu0_bsz == 0: 61 | device_ids = self.device_ids[1:] 62 | else: 63 | device_ids = self.device_ids 64 | inputs, kwargs = self.scatter(inputs, kwargs, device_ids) 65 | if len(self.device_ids) == 1: 66 | return self.module(*inputs[0], **kwargs[0]) 67 | replicas = self.replicate(self.module, self.device_ids) 68 | if self.gpu0_bsz == 0: 69 | replicas = replicas[1:] 70 | outputs = self.parallel_apply(replicas, device_ids, inputs, kwargs) 71 | return self.gather(outputs, self.output_device) 72 | 73 | def parallel_apply(self, replicas, device_ids, inputs, kwargs): 74 | return parallel_apply(replicas, inputs, kwargs, device_ids) 75 | 76 | def scatter(self, inputs, kwargs, device_ids): 77 | bsz = inputs[0].size(self.dim) 78 | num_dev = len(self.device_ids) 79 | gpu0_bsz = self.gpu0_bsz 80 | bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1) 81 | if gpu0_bsz < bsz_unit: 82 | chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1) 83 | delta = bsz - sum(chunk_sizes) 84 | for i in range(delta): 85 
| chunk_sizes[i + 1] += 1 86 | if gpu0_bsz == 0: 87 | chunk_sizes = chunk_sizes[1:] 88 | else: 89 | return super().scatter(inputs, kwargs, device_ids) 90 | return scatter_kwargs(inputs, kwargs, device_ids, chunk_sizes, dim=self.dim) 91 | 92 | -------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/exp_utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os, shutil 3 | 4 | import numpy as np 5 | 6 | import torch 7 | 8 | 9 | def logging(s, log_path, print_=True, log_=True): 10 | if print_: 11 | print(s) 12 | if log_: 13 | with open(log_path, 'a+') as f_log: 14 | f_log.write(s + '\n') 15 | 16 | def get_logger(log_path, **kwargs): 17 | return functools.partial(logging, log_path=log_path, **kwargs) 18 | 19 | def create_exp_dir(dir_path, scripts_to_save=None, debug=False): 20 | if debug: 21 | print('Debug Mode : no experiment dir created') 22 | return functools.partial(logging, log_path=None, log_=False) 23 | 24 | if not os.path.exists(dir_path): 25 | os.makedirs(dir_path) 26 | 27 | print('Experiment dir : {}'.format(dir_path)) 28 | if scripts_to_save is not None: 29 | script_path = os.path.join(dir_path, 'scripts') 30 | if not os.path.exists(script_path): 31 | os.makedirs(script_path) 32 | for script in scripts_to_save: 33 | dst_file = os.path.join(dir_path, 'scripts', os.path.basename(script)) 34 | shutil.copyfile(script, dst_file) 35 | 36 | return get_logger(log_path=os.path.join(dir_path, 'log.txt')) 37 | 38 | def save_checkpoint(model, optimizer, path, epoch): 39 | torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch))) 40 | torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch))) 41 | -------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/log_uniform_sampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import numpy as np 4 | 5 | class LogUniformSampler(object): 6 | def __init__(self, range_max, n_sample): 7 | """ 8 | Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py 9 | `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` 10 | 11 | expected count can be approximated by 1 - (1 - p)^n 12 | and we use a numerically stable version -expm1(num_tries * log1p(-p)) 13 | 14 | Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run 15 | """ 16 | with torch.no_grad(): 17 | self.range_max = range_max 18 | log_indices = torch.arange(1., range_max+2., 1.).log_() 19 | self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] 20 | # print('P', self.dist.numpy().tolist()[-30:]) 21 | 22 | self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() 23 | 24 | self.n_sample = n_sample 25 | 26 | def sample(self, labels): 27 | """ 28 | labels: [b1, b2] 29 | Return 30 | true_log_probs: [b1, b2] 31 | samp_log_probs: [n_sample] 32 | neg_samples: [n_sample] 33 | """ 34 | 35 | # neg_samples = torch.empty(0).long() 36 | n_sample = self.n_sample 37 | n_tries = 2 * n_sample 38 | 39 | with torch.no_grad(): 40 | neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique() 41 | device = labels.device 42 | neg_samples = neg_samples.to(device) 43 | true_log_probs = self.log_q[labels].to(device) 44 | samp_log_probs = self.log_q[neg_samples].to(device) 45 | 
return true_log_probs, samp_log_probs, neg_samples 46 | 47 | def sample_logits(embedding, bias, labels, inputs, sampler): 48 | """ 49 | embedding: an nn.Embedding layer 50 | bias: [n_vocab] 51 | labels: [b1, b2] 52 | inputs: [b1, b2, n_emb] 53 | sampler: you may use a LogUniformSampler 54 | Return 55 | logits: [b1, b2, 1 + n_sample] 56 | """ 57 | true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) 58 | n_sample = neg_samples.size(0) 59 | b1, b2 = labels.size(0), labels.size(1) 60 | all_ids = torch.cat([labels.view(-1), neg_samples]) 61 | all_w = embedding(all_ids) 62 | true_w = all_w[: -n_sample].view(b1, b2, -1) 63 | sample_w = all_w[- n_sample:].view(n_sample, -1) 64 | 65 | all_b = bias[all_ids] 66 | true_b = all_b[: -n_sample].view(b1, b2) 67 | sample_b = all_b[- n_sample:] 68 | 69 | hit = (labels[:, :, None] == neg_samples).detach() 70 | 71 | true_logits = torch.einsum('ijk,ijk->ij', 72 | [true_w, inputs]) + true_b - true_log_probs 73 | sample_logits = torch.einsum('lk,ijk->ijl', 74 | [sample_w, inputs]) + sample_b - samp_log_probs 75 | sample_logits.masked_fill_(hit, -1e30) 76 | logits = torch.cat([true_logits[:, :, None], sample_logits], -1) 77 | 78 | return logits 79 | 80 | 81 | # class LogUniformSampler(object): 82 | # def __init__(self, range_max, unique=False): 83 | # """ 84 | # Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py 85 | # `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` 86 | # """ 87 | # self.range_max = range_max 88 | # log_indices = torch.arange(1., range_max+2., 1.).log_() 89 | # self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] 90 | 91 | # self.unique = unique 92 | 93 | # if self.unique: 94 | # self.exclude_mask = torch.ByteTensor(range_max).fill_(0) 95 | 96 | # def sample(self, n_sample, labels): 97 | # pos_sample, new_labels = labels.unique(return_inverse=True) 98 | # n_pos_sample = pos_sample.size(0) 99 | # n_neg_sample = n_sample - n_pos_sample 100 | 101 | # if self.unique: 102 | # self.exclude_mask.index_fill_(0, pos_sample, 1) 103 | # sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0) 104 | # self.exclude_mask.index_fill_(0, pos_sample, 0) 105 | # else: 106 | # sample_dist = self.dist 107 | 108 | # neg_sample = torch.multinomial(sample_dist, n_neg_sample) 109 | 110 | # sample = torch.cat([pos_sample, neg_sample]) 111 | # sample_prob = self.dist[sample] 112 | 113 | # return new_labels, sample, sample_prob 114 | 115 | 116 | if __name__ == '__main__': 117 | S, B = 3, 4 118 | n_vocab = 10000 119 | n_sample = 5 120 | H = 32 121 | 122 | labels = torch.LongTensor(S, B).random_(0, n_vocab) 123 | 124 | # sampler = LogUniformSampler(n_vocab, unique=False) 125 | # new_labels, sample, sample_prob = sampler.sample(n_sample, labels) 126 | 127 | sampler = LogUniformSampler(n_vocab, unique=True) 128 | # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels) 129 | 130 | # print('true_probs', true_probs.numpy().tolist()) 131 | # print('samp_probs', samp_probs.numpy().tolist()) 132 | # print('neg_samples', neg_samples.numpy().tolist()) 133 | 134 | # print('sum', torch.sum(sampler.dist).item()) 135 | 136 | # assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item() 137 | 138 | embedding = nn.Embedding(n_vocab, H) 139 | bias = torch.zeros(n_vocab) 140 | inputs = torch.Tensor(S, B, H).normal_() 141 | 142 | logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample) 143 | 
143 | print('logits', logits.detach().numpy().tolist())
144 | print('logits shape', logits.size())
145 | # sample_logits returns a single tensor: the true-token scores sit in
146 | # logits[:, :, 0], so there is no separate out_labels tensor to print.
147 | 
148 | 
-------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/proj_adaptive_softmax.py: --------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | import numpy as np
4 | 
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | 
9 | CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) if torch.version.cuda else 0
10 | CUDA_MINOR = int(torch.version.cuda.split('.')[1]) if torch.version.cuda else 0
11 | 
12 | class ProjectedAdaptiveLogSoftmax(nn.Module):
13 | def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
14 | keep_order=False):
15 | super(ProjectedAdaptiveLogSoftmax, self).__init__()
16 | 
17 | self.n_token = n_token
18 | self.d_embed = d_embed
19 | self.d_proj = d_proj
20 | 
21 | self.cutoffs = cutoffs + [n_token]
22 | self.cutoff_ends = [0] + self.cutoffs
23 | self.div_val = div_val
24 | 
25 | self.shortlist_size = self.cutoffs[0]
26 | self.n_clusters = len(self.cutoffs) - 1
27 | self.head_size = self.shortlist_size + self.n_clusters
28 | 
29 | if self.n_clusters > 0:
30 | self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
31 | self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
32 | 
33 | self.out_layers = nn.ModuleList()
34 | self.out_projs = nn.ParameterList()
35 | 
36 | if div_val == 1:
37 | for i in range(len(self.cutoffs)):
38 | if d_proj != d_embed:
39 | self.out_projs.append(
40 | nn.Parameter(torch.Tensor(d_proj, d_embed))
41 | )
42 | else:
43 | self.out_projs.append(None)
44 | 
45 | self.out_layers.append(nn.Linear(d_embed, n_token))
46 | else:
47 | for i in range(len(self.cutoffs)):
48 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
49 | d_emb_i = d_embed // (div_val ** i)
50 | 
51 | self.out_projs.append(
52 | nn.Parameter(torch.Tensor(d_proj, d_emb_i))
53 | )
54 | 
55 | self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))
56 | 
57 | self.keep_order = keep_order
58 | 
59 | def _compute_logit(self, hidden, weight, bias, proj):
60 | if proj is None:
61 | logit = F.linear(hidden, weight, bias=bias)
62 | else:
63 | # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
64 | proj_hid = F.linear(hidden, proj.t().contiguous())
65 | logit = F.linear(proj_hid, weight, bias=bias)
66 | # else:
67 | # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
68 | # if bias is not None:
69 | # logit = logit + bias
70 | 
71 | return logit
72 | 
73 | def forward(self, hidden, target, keep_order=False):
74 | '''
75 | hidden :: [len*bsz x d_proj]
76 | target :: [len*bsz]
77 | '''
78 | 
79 | if hidden.size(0) != target.size(0):
80 | raise RuntimeError('Input and target should have the same size '
81 | 'in the batch dimension.')
82 | 
83 | if self.n_clusters == 0:
84 | logit = self._compute_logit(hidden, self.out_layers[0].weight,
85 | self.out_layers[0].bias, self.out_projs[0])
86 | nll = -F.log_softmax(logit, dim=-1) \
87 | .gather(1, target.unsqueeze(1)).squeeze(1)
88 | else:
89 | # construct weights and biases
90 | weights, biases = [], []
91 | for i in range(len(self.cutoffs)):
92 | if self.div_val == 1:
93 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
94 | weight_i = self.out_layers[0].weight[l_idx:r_idx]
95 | bias_i = self.out_layers[0].bias[l_idx:r_idx]
96 | else:
97 | weight_i = self.out_layers[i].weight
98 | bias_i = self.out_layers[i].bias
99 | 
100 | if i == 0:
101 | weight_i = torch.cat(
102 | [weight_i, self.cluster_weight], dim=0)
103 | bias_i = torch.cat(
104 | [bias_i, self.cluster_bias], dim=0)
105 | 
106 | weights.append(weight_i)
107 | biases.append(bias_i)
108 | 
109 | head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
110 | 
111 | head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
112 | head_logprob = F.log_softmax(head_logit, dim=1)
113 | 
114 | nll = torch.zeros_like(target,
115 | dtype=hidden.dtype, device=hidden.device)
116 | 
117 | offset = 0
118 | cutoff_values = [0] + self.cutoffs
119 | for i in range(len(cutoff_values) - 1):
120 | l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
121 | 
122 | mask_i = (target >= l_idx) & (target < r_idx)
123 | indices_i = mask_i.nonzero().squeeze()
124 | 
125 | if indices_i.numel() == 0:
126 | continue
127 | 
128 | target_i = target.index_select(0, indices_i) - l_idx
129 | head_logprob_i = head_logprob.index_select(0, indices_i)
130 | 
131 | if i == 0:
132 | logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1)
133 | else:
134 | weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
135 | 
136 | hidden_i = hidden.index_select(0, indices_i)
137 | 
138 | tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
139 | tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
140 | # NOTE: cluster scores occupy the tail of the head softmax; -i indexes them in reversed order (kept as in the original implementation)
141 | logprob_i = head_logprob_i[:, -i] \
142 | + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1)
143 | 
144 | if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
145 | nll.index_copy_(0, indices_i, -logprob_i)
146 | else:
147 | nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
148 | 
149 | offset += logprob_i.size(0)
150 | 
151 | return nll
152 | 
-------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/vocabulary.py: --------------------------------------------------------------------------------
1 | import os
2 | from collections import Counter, OrderedDict
3 | 
4 | import torch
5 | 
6 | class Vocab(object):
7 | def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True,
8 | delimiter=None, vocab_file=None):
9 | self.counter = Counter()
10 | self.special = special
11 | self.min_freq = min_freq
12 | self.max_size = max_size
13 | self.lower_case = lower_case
14 | self.delimiter = delimiter
15 | self.vocab_file = vocab_file
16 | 
17 | def tokenize(self, line, add_eos=False, add_double_eos=False):
18 | line = line.strip()
19 | # convert to lower case
20 | if self.lower_case:
21 | line = line.lower()
22 | 
23 | # empty delimiter '' will evaluate False
24 | if self.delimiter == '':
25 | symbols = line
26 | else:
27 | symbols = line.split(self.delimiter)
28 | 
29 | if add_double_eos: # lm1b
30 | return ['<S>'] + symbols + ['<S>']
31 | elif add_eos:
32 | return symbols + ['<eos>']
33 | else:
34 | return symbols
35 | 
36 | def count_file(self, path, verbose=False, add_eos=False):
37 | if verbose: print('counting file {} ...'.format(path))
38 | assert os.path.exists(path)
39 | 
40 | sents = []
41 | with open(path, 'r', encoding='utf-8') as f:
42 | for idx, line in enumerate(f):
43 | if verbose and idx > 0 and idx % 500000 == 0:
44 | print(' line {}'.format(idx))
45 | symbols = self.tokenize(line, add_eos=add_eos)
46 | self.counter.update(symbols)
47 | sents.append(symbols)
48 | 
49 | return sents
50 | 
51 | def count_sents(self, sents, verbose=False):
52 | """
53 | sents : a list of sentences, each a list of tokenized symbols
54 | """
55 | if verbose: print('counting {} sents ...'.format(len(sents)))
56 | for idx, symbols in enumerate(sents):
57 | if verbose and idx > 0 and idx % 500000 == 0:
58 | print(' line {}'.format(idx))
59 | self.counter.update(symbols)
60 | 
61 | def _build_from_file(self, vocab_file):
62 | self.idx2sym = []
63 | self.sym2idx = OrderedDict()
64 | 
65 | with open(vocab_file, 'r', encoding='utf-8') as f:
66 | for line in f:
67 | symb = line.strip().split()[0]
68 | self.add_symbol(symb)
69 | self.unk_idx = self.sym2idx['<UNK>']
70 | 
71 | def build_vocab(self):
72 | if self.vocab_file:
73 | print('building vocab from {}'.format(self.vocab_file))
74 | self._build_from_file(self.vocab_file)
75 | print('final vocab size {}'.format(len(self)))
76 | else:
77 | print('building vocab with min_freq={}, max_size={}'.format(
78 | self.min_freq, self.max_size))
79 | self.idx2sym = []
80 | self.sym2idx = OrderedDict()
81 | 
82 | for sym in self.special:
83 | self.add_special(sym)
84 | 
85 | for sym, cnt in self.counter.most_common(self.max_size):
86 | if cnt < self.min_freq: break
87 | self.add_symbol(sym)
88 | 
89 | print('final vocab size {} from {} unique tokens'.format(
90 | len(self), len(self.counter)))
91 | 
92 | def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
93 | add_double_eos=False):
94 | if verbose: print('encoding file {} ...'.format(path))
95 | assert os.path.exists(path)
96 | encoded = []
97 | with open(path, 'r', encoding='utf-8') as f:
98 | for idx, line in enumerate(f):
99 | if verbose and idx > 0 and idx % 500000 == 0:
100 | print(' line {}'.format(idx))
101 | symbols = self.tokenize(line, add_eos=add_eos,
102 | add_double_eos=add_double_eos)
103 | encoded.append(self.convert_to_tensor(symbols))
104 | 
105 | if ordered:
106 | encoded = torch.cat(encoded)
107 | 
108 | return encoded
109 | 
110 | def encode_sents(self, sents, ordered=False, verbose=False):
111 | if verbose: print('encoding {} sents ...'.format(len(sents)))
112 | encoded = []
113 | for idx, symbols in enumerate(sents):
114 | if verbose and idx > 0 and idx % 500000 == 0:
115 | print(' line {}'.format(idx))
116 | encoded.append(self.convert_to_tensor(symbols))
117 | 
118 | if ordered:
119 | encoded = torch.cat(encoded)
120 | 
121 | return encoded
122 | 
123 | def add_special(self, sym):
124 | if sym not in self.sym2idx:
125 | self.idx2sym.append(sym)
126 | self.sym2idx[sym] = len(self.idx2sym) - 1
127 | setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
128 | 
129 | def add_symbol(self, sym):
130 | if sym not in self.sym2idx:
131 | self.idx2sym.append(sym)
132 | self.sym2idx[sym] = len(self.idx2sym) - 1
133 | 
134 | def get_sym(self, idx):
135 | assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
136 | return self.idx2sym[idx]
137 | 
138 | def get_idx(self, sym):
139 | if sym in self.sym2idx:
140 | return self.sym2idx[sym]
141 | else:
142 | # print('encounter unk {}'.format(sym))
143 | assert '<eos>' not in sym
144 | assert hasattr(self, 'unk_idx')
145 | return self.sym2idx.get(sym, self.unk_idx)
146 | 
147 | def get_symbols(self, indices):
148 | return [self.get_sym(idx) for idx in indices]
149 | 
150 | def get_indices(self, symbols):
151 | return [self.get_idx(sym) for sym in symbols]
152 | 
153 | def convert_to_tensor(self, symbols):
154 | return torch.LongTensor(self.get_indices(symbols))
155 | 
156 | def convert_to_sent(self, indices, exclude=None):
157 | if exclude is None:
158 | return ' '.join([self.get_sym(idx) for idx in indices])
159 | else:
160 | return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
161 | 
162 | def __len__(self):
163 | return len(self.idx2sym)
164 | 
-------------------------------------------------------------------------------- /dreamfusion/README.md: --------------------------------------------------------------------------------
1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
2 | 
3 | We show the results of the text-to-3D task supported by the [DreamFusion Project](https://github.com/ashawkey/stable-dreamfusion).
4 | 
5 | ## Usage of Adan for DreamFusion
6 | 
7 | Adan is the default optimizer for the [DreamFusion Project](https://github.com/ashawkey/stable-dreamfusion); please refer to its repo to run these experiments.
8 | 
9 | The project invokes Adan as follows:
10 | 
11 | ```
12 | optimizer = lambda model: Adan(model.get_params(5 * opt.lr), eps=1e-8, weight_decay=2e-5, max_grad_norm=5.0, foreach=False)
13 | ```
14 | 
15 | The learning rate `opt.lr` and the maximal gradient norm `max_grad_norm` can be tuned to refine the results for certain text prompts.
16 | 
17 | ## Training and Evaluation
18 | 
19 | - #### Training
20 | 
21 | `python main.py --text $PROMPT --workspace $SAVE_PATH -O`
22 | 
23 | - #### Evaluation
24 | 
25 | `python main.py --workspace $SAVE_PATH -O --test`
26 | 
27 | ## Results
28 | 
29 | **prompt:** `a DSLR photo of the leaning tower of Pisa, aerial view`. Adan's model shows more refined details.
30 | 
31 | https://user-images.githubusercontent.com/10042844/211014605-3860b816-cc1c-4367-b96e-406cd375240a.mp4
32 | 
33 | https://user-images.githubusercontent.com/10042844/211014603-82564238-cf5b-4ffa-b7a3-175bd565e5ce.mp4
34 | 
35 | **prompt:** `Sydney opera house, aerial view`. Adan provides better details.
36 | 
37 | https://user-images.githubusercontent.com/10042844/211014601-da430196-021d-4f6b-962b-8441feff5d02.mp4
38 | 
39 | https://user-images.githubusercontent.com/10042844/211014594-3b5c05e3-9018-4a39-b5db-d6f2fc111cce.mp4
40 | 
41 | **prompt:** `the Statue of Liberty, aerial view`. Adan yields a cleaner reconstruction for this prompt.
42 | 
43 | https://user-images.githubusercontent.com/10042844/211014579-4db62a55-fd05-4616-9793-5af5fea81c62.mp4
44 | 
45 | https://user-images.githubusercontent.com/10042844/211014575-db8b9b1b-7e81-4a27-ba36-2ef74c00f0bc.mp4
46 | 
47 | **prompt:** `the Imperial State Crown of England`
48 | 
49 | https://user-images.githubusercontent.com/10042844/211014561-7a943df3-ed8f-4c1a-b51f-8ca5bccf1819.mp4
50 | 
51 | https://user-images.githubusercontent.com/10042844/211014554-b7f696dd-8635-4d75-81c3-218dd0231c76.mp4
52 | 
53 | **prompt:** `a candelabra with many candles`. Adam's model leaves some candles suspended in the air, while Adan's result is cleaner.
54 | 
55 | https://user-images.githubusercontent.com/10042844/211014542-47f19116-9fb9-4e65-ad08-522d1c97ba11.mp4
56 | 
57 | https://user-images.githubusercontent.com/10042844/211014532-6dec1554-c552-4fc5-92c4-cf9954d844cb.mp4
58 | 
59 | **prompt:** `an extravagant mansion, aerial view`. Adan's result is more coherent.
60 | 
61 | https://user-images.githubusercontent.com/10042844/211014591-82d6e57e-bc9f-4b38-8d23-9b156a35334c.mp4
62 | 
63 | https://user-images.githubusercontent.com/10042844/211014584-aa038ea9-58ae-422f-a128-e885d7d7ab08.mp4
64 | 
65 | **prompt:** `Neuschwanstein Castle, aerial view`
66 | 
67 | https://user-images.githubusercontent.com/10042844/211014548-160c7416-d74f-48aa-b3dc-bfd55e809b62.mp4
68 | 
69 | https://user-images.githubusercontent.com/10042844/211014545-2515b2be-bff8-4e7c-9718-0ee0210c98e9.mp4
70 | 
71 | **prompt:** `a delicious hamburger`
72 | 
73 | https://user-images.githubusercontent.com/10042844/211014566-ae9c6f72-2bbf-4e4b-8f15-27851464a620.mp4
74 | 
75 | https://user-images.githubusercontent.com/10042844/211014571-af207d24-1119-4b34-a31d-5250046cc426.mp4
76 | 
77 | **prompt:** `a palm tree, low poly 3d model`. Adan's model renders the shadowed side better.
78 | 
79 | https://user-images.githubusercontent.com/10042844/211014613-6373253d-7a37-4b66-ac1b-d04bb7819c01.mp4
80 | 
81 | https://user-images.githubusercontent.com/10042844/211014610-67817157-fe9e-4ace-a188-e84d88bf0f66.mp4
-------------------------------------------------------------------------------- /fused_adan/README.md: --------------------------------------------------------------------------------
1 | # Adan Optimizer fused kernel
2 | 
3 | ## Dependencies
4 | 
5 | 1. Libtorch/PyTorch (ATen is required; compilation tested with PyTorch 1.13.1)
6 | 2. CUDA Toolkit (compilation tested with CUDA 11.6+)
7 | 3. ninja
8 | 
9 | ## Usage
10 | 
11 | Using `Adan(..., foreach=False, fused=True)` enables the fused Adan kernel with single-tensor access.
12 | Using `Adan(..., foreach=True, fused=True)` enables the fused Adan kernel with multi-tensor access.
13 | 
14 | `foreach=True` is recommended for better performance.
15 | 
16 | **Single-tensor access**
17 | A *for loop* traverses the layers when applying the update to each layer's parameters, so every step requires multiple kernel launches. In theory, accessing only one layer's parameters at a time helps reduce peak memory usage, but it introduces kernel-launch overhead.
18 | 
19 | **Multi-tensor access**
20 | The parameters of all layers are passed into the kernel at once, and the kernel internally uses a for loop to traverse each layer, so only one kernel launch is required. In theory this increases peak memory usage but reduces the kernel-launch overhead. In actual tests, the increase in memory usage is not significant, while the kernel-launch overhead drops noticeably.
21 | 
22 | ## Benchmarking Results
23 | 
24 | We benchmark the peak memory and wall duration of two optimizers: Adam vs. FusedAdan. The benchmark uses GPT-2 with different numbers of heads, layers, and Emb. Dim on a single NVIDIA A100 GPU (40G). A minimal timing sketch is given after the Conclusion below.
25 | 
26 | The benchmark is conducted with the following config:
27 | 
28 | - vocab size: 49280
29 | - batch size: 1
30 | - sequence length: 2048
31 | 
32 | #### Memory Comparison
33 | 
34 | | Head | Layers | Emb.
Dim | Model Size (MB) | Adam Peak (MB) | FusedAdan Peak (MB) | Δ (%) | 35 | | :--: | :----: | :------: | :-------------: | :------------: | :-----------------: | :---: | 36 | | 6 | 6 | 768 | 81 | 4490 | 4490 | 0.00 | 37 | | 12 | 6 | 768 | 81 | 5848 | 5848 | 0.00 | 38 | | 16 | 6 | 768 | 81 | 6775 | 6775 | 0.00 | 39 | | 6 | 12 | 768 | 124 | 7151 | 7153 | 0.03 | 40 | | 12 | 12 | 768 | 124 | 9869 | 9871 | 0.02 | 41 | | 16 | 12 | 768 | 124 | 11733 | 11735 | 0.02 | 42 | | 16 | 6 | 1024 | 128 | 7302 | 7302 | 0.00 | 43 | | 16 | 12 | 1024 | 203 | 12719 | 12719 | 0.00 | 44 | | 6 | 24 | 768 | 209 | 12471 | 12473 | 0.02 | 45 | | 12 | 24 | 768 | 209 | 17907 | 17909 | 0.01 | 46 | | 16 | 24 | 768 | 209 | 21596 | 21598 | 0.01 | 47 | | 6 | 6 | 1536 | 248 | 6880 | 7308 | 6.22 | 48 | | 12 | 6 | 1536 | 248 | 8235 | 8235 | 0.00 | 49 | | 16 | 6 | 1536 | 248 | 9141 | 9141 | 0.00 | 50 | | 16 | 24 | 1024 | 354 | 23530 | 23532 | 0.01 | 51 | | 16 | 6 | 2048 | 407 | 11098 | 11098 | 0.00 | 52 | | 6 | 12 | 1536 | 418 | 11137 | 12213 | 9.66 | 53 | | 12 | 12 | 1536 | 418 | 13855 | 13857 | 0.01 | 54 | | 16 | 12 | 1536 | 418 | 15667 | 15669 | 0.01 | 55 | | 16 | 6 | 2560 | 603 | 13967 | 15965 | 14.30 | 56 | | 16 | 12 | 2048 | 709 | 18851 | 18853 | 0.01 | 57 | | 6 | 24 | 1536 | 758 | 19660 | 21997 | 11.88 | 58 | | 12 | 24 | 1536 | 758 | 25096 | 25100 | 0.02 | 59 | | 16 | 24 | 1536 | 758 | 28720 | 28724 | 0.01 | 60 | | 16 | 24 | 2048 | 1313 | 34357 | 34363 | 0.02 | 61 | 62 | #### Time Comparison 63 | 64 | The duration time is the total time of 200 `optimizer.step()`. 65 | 66 | | Head | Layers | Emb. Dim | Model Size (MB) | Adam Time (ms) | FusedAdan Time (ms) | FusedAdan/Adam (%) | 67 | | :--: | :----: | :------: | :-------------: | :------------: | :-----------------: | :----------------: | 68 | | 6 | 6 | 768 | 81 | 5.40 | 4.07 | 81.6 | 69 | | 12 | 6 | 768 | 81 | 5.41 | 4.16 | 76.9 | 70 | | 16 | 6 | 768 | 81 | 5.41 | 4.11 | 76.0 | 71 | | 6 | 12 | 768 | 124 | 8.47 | 6.25 | 73.8 | 72 | | 12 | 12 | 768 | 124 | 8.46 | 6.18 | 73.0 | 73 | | 16 | 12 | 768 | 124 | 8.48 | 6.20 | 73.1 | 74 | | 16 | 6 | 1024 | 128 | 7.57 | 6.28 | 83.0 | 75 | | 16 | 12 | 1024 | 203 | 12.10 | 10.25 | 84.7 | 76 | | 6 | 24 | 768 | 209 | 16.40 | 10.56 | 64.4 | 77 | | 12 | 24 | 768 | 209 | 16.40 | 10.47 | 63.8 | 78 | | 16 | 24 | 768 | 209 | 16.35 | 10.56 | 64.6 | 79 | | 6 | 6 | 1536 | 248 | 15.92 | 12.29 | 77.2 | 80 | | 12 | 6 | 1536 | 248 | 15.94 | 12.35 | 77.5 | 81 | | 16 | 6 | 1536 | 248 | 15.94 | 12.36 | 77.5 | 82 | | 16 | 24 | 1024 | 354 | 21.05 | 17.51 | 83.2 | 83 | | 16 | 6 | 2048 | 407 | 25.05 | 19.84 | 79.2 | 84 | | 6 | 12 | 1536 | 418 | 27.24 | 20.58 | 75.6 | 85 | | 12 | 12 | 1536 | 418 | 27.25 | 20.54 | 75.4 | 86 | | 16 | 12 | 1536 | 418 | 27.25 | 20.46 | 75.1 | 87 | | 16 | 6 | 2560 | 603 | 36.86 | 29.55 | 80.1 | 88 | | 16 | 12 | 2048 | 709 | 44.00 | 34.89 | 79.3 | 89 | | 6 | 24 | 1536 | 758 | 49.87 | 37.52 | 75.2 | 90 | | 12 | 24 | 1536 | 758 | 49.87 | 37.42 | 75.0 | 91 | | 16 | 24 | 1536 | 758 | 49.92 | 37.56 | 75.2 | 92 | | 16 | 24 | 2048 | 1313 | 81.81 | 64.48 | 77.9 | 93 | 94 | ## Conclusion 95 | 96 | - The extra memory consumption does not increase linearly with the model's size. 97 | 98 | - In most cases, FusedAdan has no additional memory footprint and the time consumption is only 80% of Adam's. 99 | 100 | - In the extreme case, FusedAdan's additional memory footprint does not exceed 15%. 
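As a reference for the timing protocol above, here is a minimal sketch, assuming the `adan` package was built with the fused kernel and a CUDA device is available; the stack of linear layers is only an illustrative stand-in for GPT-2:

```python
import torch
import torch.nn as nn
from adan import Adan

model = nn.Sequential(*[nn.Linear(1024, 1024) for _ in range(24)]).cuda()
optimizer = Adan(model.parameters(), lr=1e-3, fused=True, foreach=True)

# One forward/backward pass so that every parameter has a gradient.
model(torch.randn(8, 1024, device='cuda')).sum().backward()

torch.cuda.reset_peak_memory_stats()
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(200):  # same protocol as the tables: 200 optimizer steps
    optimizer.step()
end.record()
torch.cuda.synchronize()
print('wall time: {:.1f} ms'.format(start.elapsed_time(end)))
print('peak memory: {:.0f} MiB'.format(torch.cuda.max_memory_allocated() / 2**20))
```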
101 | 
-------------------------------------------------------------------------------- /fused_adan/include/fused_adan_kernel.cuh: --------------------------------------------------------------------------------
1 | /* Copyright 2021 The LightSeq Team
2 | Copyright NVIDIA/apex
3 | Copyright AlexwellChen
4 | This kernel is adapted from NVIDIA/apex and LightSeq Team
5 | */
6 | #include <ATen/ATen.h>
7 | #include <vector>
8 | 
9 | // CUDA forward declaration
10 | void fused_adan_cuda(
11 | at::Tensor& p, at::Tensor& p_copy, at::Tensor& g, at::Tensor& exp_avg,
12 | at::Tensor& exp_avg_sq, at::Tensor& exp_avg_diff,
13 | at::Tensor& neg_grad, float beta1, float beta2, float beta3,
14 | float bias_correction1, float bias_correction2, float bias_correction3_sqrt,
15 | float lr, float decay, float eps, bool no_prox, float clip_global_grad_norm);
16 | 
17 | void multi_tensor_adan_cuda(
18 | int chunk_size,
19 | at::Tensor noop_flag,
20 | std::vector<std::vector<at::Tensor>> tensor_lists,
21 | const float beta1,
22 | const float beta2,
23 | const float beta3,
24 | const float bias_correction1,
25 | const float bias_correction2,
26 | const float bias_correction3_sqrt,
27 | const float lr,
28 | const float decay,
29 | const float epsilon,
30 | const bool no_prox,
31 | const float clip_global_grad_norm);
-------------------------------------------------------------------------------- /fused_adan/include/multi_tensor_apply.cuh: --------------------------------------------------------------------------------
1 | /* Copyright 2021 The LightSeq Team
2 | Copyright NVIDIA/apex
3 | This file is adapted from NVIDIA/apex
4 | */
5 | #include <ATen/ATen.h>
6 | #include <ATen/AccumulateType.h>
7 | #include <ATen/cuda/CUDAContext.h>
8 | #include <ATen/cuda/Exceptions.h>
9 | #include <c10/cuda/CUDAGuard.h>
10 | #include <assert.h>
11 | 
12 | // #include <iostream>
13 | 
14 | // This header is the one-stop shop for all your multi-tensor apply needs.
15 | 
16 | // TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson)
17 | constexpr int depth_to_max_tensors[6] = {110, 64, 48, 36, 30, 24};
18 | constexpr int depth_to_max_blocks[6] = {320, 320, 320, 320, 320, 320};
19 | 
20 | #ifndef TORCH_CHECK
21 | #define TORCH_CHECK AT_CHECK
22 | #endif
23 | 
24 | #ifdef VERSION_GE_1_3
25 | #define DATA_PTR data_ptr
26 | #else
27 | #define DATA_PTR data
28 | #endif
29 | 
30 | template <int n>
31 | struct TensorListMetadata {
32 | void* addresses[n][depth_to_max_tensors[n - 1]];
33 | int sizes[depth_to_max_tensors[n - 1]];
34 | unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
35 | int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a
36 | // full int.
37 | int start_tensor_this_launch;
38 | };
39 | 
40 | template <typename T, typename U, typename... ArgTypes>
41 | __global__ void multi_tensor_apply_kernel(int chunk_size,
42 | volatile int* noop_flag, T tl,
43 | U callable, ArgTypes... args) {
44 | // Hand the chunk information to the user-supplied functor to process however
45 | // it likes.
46 | callable(chunk_size, noop_flag, tl, args...);
47 | }
48 | 
49 | template <int depth, typename T, typename... ArgTypes>
50 | void multi_tensor_apply(
51 | int block_size, int chunk_size, const at::Tensor& noop_flag,
52 | const std::vector<std::vector<at::Tensor>>& tensor_lists, T callable,
53 | ArgTypes... args) {
54 | TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
55 | int len0 = tensor_lists[0].size();
56 | TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
57 | auto ref_device = tensor_lists[0][0].device();
58 | TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
59 | for (int l = 0; l < tensor_lists.size();
60 | l++) // No range-based for because I need indices
61 | {
62 | TORCH_CHECK(tensor_lists[l].size() == len0,
63 | "Size mismatch among tensor lists");
64 | for (int t = 0; t < tensor_lists[l].size(); t++) {
65 | // TODO: Print which tensor fails.
66 | bool contiguous_memory = tensor_lists[l][t].is_contiguous();
67 | #ifdef VERSION_GE_1_5
68 | contiguous_memory =
69 | (contiguous_memory ||
70 | tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
71 | #endif
72 | TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
73 | TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
74 | "A tensor was not on the same device as the first tensor");
75 | TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(),
76 | "Size mismatch");
77 | }
78 | }
79 | 
80 | int ntensors = tensor_lists[0].size();
81 | 
82 | TensorListMetadata<depth> tl;
83 | 
84 | const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
85 | auto stream = at::cuda::getCurrentCUDAStream();
86 | 
87 | tl.start_tensor_this_launch = 0;
88 | int loc_block_info = 0;
89 | int loc_tensor_info = 0;
90 | for (int t = 0; t < ntensors; t++) {
91 | tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
92 | for (int d = 0; d < depth; d++)
93 | tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
94 | loc_tensor_info++;
95 | 
96 | int chunks_this_tensor =
97 | (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
98 | 
99 | for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
100 | // std::cout << chunks_this_tensor << std::endl;
101 | tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
102 | tl.block_to_chunk[loc_block_info] = chunk;
103 | loc_block_info++;
104 | 
105 | bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
106 | chunk == chunks_this_tensor - 1);
107 | bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
108 | bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
109 | if (tensors_full || blocks_full || last_chunk) {
110 | // using accscalar_t = acc_type<scalar_t, true>;
111 | multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
112 | chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);
113 | 
114 | AT_CUDA_CHECK(cudaGetLastError());
115 | 
116 | // Reset. The control flow possibilities here make my brain hurt.
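// (In plain terms: if the tensor just flushed has no chunks left, the next
// launch starts from empty metadata; otherwise its size/address entries are
// copied down to slot 0 so its remaining chunks go into the next launch.)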
117 | loc_block_info = 0;
118 | if (chunk == chunks_this_tensor - 1) {
119 | // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3
120 | // << std::endl;
121 | loc_tensor_info = 0;
122 | tl.start_tensor_this_launch = t + 1;
123 | } else {
124 | // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3
125 | // << std::endl;
126 | tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
127 | for (int d = 0; d < depth; d++)
128 | tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
129 | loc_tensor_info = 1;
130 | tl.start_tensor_this_launch = t;
131 | }
132 | }
133 | }
134 | }
135 | }
136 | 
-------------------------------------------------------------------------------- /fused_adan/multi_tensor_adan_kernel.cu: --------------------------------------------------------------------------------
1 | /* Copyright NVIDIA/apex
2 | Copyright AlexwellChen
3 | This kernel is adapted from NVIDIA/apex.
4 | */
5 | #include <ATen/ATen.h>
6 | #include <ATen/AccumulateType.h>
7 | #include <ATen/cuda/CUDAContext.h>
8 | #include <ATen/cuda/Exceptions.h>
9 | // Another possibility:
10 | // #include <torch/all.h>
11 | 
12 | #include <assert.h>
13 | 
14 | #include "include/type_shim.h" // Used for DISPATCH
15 | #include "include/multi_tensor_apply.cuh"
16 | #include "include/fused_adan_kernel.cuh"
17 | 
18 | #define BLOCK_SIZE 512
19 | #define ILP 4
20 | 
21 | using MATH_T = float;
22 | 
23 | template <typename T>
24 | struct AdanFunctor
25 | {
26 | __device__ __forceinline__ void operator()(
27 | int chunk_size,
28 | volatile int* noop_gmem,
29 | TensorListMetadata<6>& tl,
30 | const float beta1,
31 | const float beta2,
32 | const float beta3,
33 | const float bias_correction1,
34 | const float bias_correction2,
35 | const float bias_correction3_sqrt,
36 | const float lr,
37 | const float decay,
38 | const float epsilon,
39 | const bool no_prox,
40 | const float clip_global_grad_norm
41 | )
42 | {
43 | // I'd like this kernel to propagate infs/nans.
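// Each thread block handles one (tensor, chunk) pair; block_to_tensor and
// block_to_chunk were filled on the host in multi_tensor_apply, so the functor
// only needs to offset the six state pointers by chunk_idx * chunk_size.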
44 | // if(*noop_gmem == 1) 45 | // return; 46 | 47 | int tensor_loc = tl.block_to_tensor[blockIdx.x]; 48 | 49 | // potentially use to pass in list of scalar 50 | // int tensor_num = tl.start_tensor_this_launch + tensor_loc; 51 | 52 | int chunk_idx = tl.block_to_chunk[blockIdx.x]; 53 | int n = tl.sizes[tensor_loc]; 54 | 55 | T* p = (T*)tl.addresses[0][tensor_loc]; 56 | p += chunk_idx*chunk_size; 57 | 58 | T* g = (T*)tl.addresses[1][tensor_loc]; 59 | g += chunk_idx*chunk_size; 60 | 61 | T* exp_avg = (T*)tl.addresses[2][tensor_loc]; 62 | exp_avg += chunk_idx*chunk_size; 63 | 64 | T* exp_avg_sq = (T*)tl.addresses[3][tensor_loc]; 65 | exp_avg_sq += chunk_idx*chunk_size; 66 | 67 | T* exp_avg_diff = (T*)tl.addresses[4][tensor_loc]; 68 | exp_avg_diff += chunk_idx*chunk_size; 69 | 70 | T* neg_grad = (T*)tl.addresses[5][tensor_loc]; 71 | neg_grad += chunk_idx*chunk_size; 72 | 73 | n -= chunk_idx*chunk_size; 74 | 75 | for(int i_start = 0; 76 | i_start < n && i_start < chunk_size; 77 | i_start += blockDim.x*ILP) 78 | { 79 | MATH_T r_p[ILP]; 80 | MATH_T r_g[ILP]; 81 | MATH_T r_exp_avg[ILP]; 82 | MATH_T r_exp_avg_sq[ILP]; 83 | MATH_T r_exp_avg_diff[ILP]; 84 | MATH_T r_neg_grad_diff[ILP]; 85 | #pragma unroll 86 | for(int ii = 0; ii < ILP; ii++) 87 | { 88 | int i = i_start + threadIdx.x + ii*blockDim.x; 89 | if(i < n && i < chunk_size) 90 | { 91 | r_p[ii] = p[i]; 92 | r_g[ii] = g[i]; 93 | r_exp_avg[ii] = exp_avg[i]; 94 | r_exp_avg_sq[ii] = exp_avg_sq[i]; 95 | r_exp_avg_diff[ii] = exp_avg_diff[i]; 96 | r_neg_grad_diff[ii] = neg_grad[i]; 97 | } else { 98 | r_p[ii] = MATH_T(0); 99 | r_g[ii] = MATH_T(0); 100 | r_exp_avg[ii] = MATH_T(0); 101 | r_exp_avg_sq[ii] = MATH_T(0); 102 | r_exp_avg_diff[ii] = MATH_T(0); 103 | r_neg_grad_diff[ii] = MATH_T(0); 104 | } 105 | } 106 | #pragma unroll 107 | for(int ii = 0; ii < ILP; ii++) 108 | { 109 | r_g[ii] *= clip_global_grad_norm; //scaled_grad 110 | MATH_T update; 111 | r_neg_grad_diff[ii] = r_g[ii] + r_neg_grad_diff[ii]; 112 | update = r_g[ii] + beta2 * r_neg_grad_diff[ii]; // 1 MAC, reused twice 113 | 114 | r_exp_avg[ii] = beta1 * r_exp_avg[ii] + (1 - beta1) * r_g[ii]; 115 | r_exp_avg_diff[ii] = beta2 * r_exp_avg_diff[ii] + (1 - beta2) * r_neg_grad_diff[ii]; 116 | 117 | r_exp_avg_sq[ii] = beta3 * r_exp_avg_sq[ii] + (1 - beta3) * update * update; 118 | 119 | MATH_T denom; 120 | denom = sqrtf(r_exp_avg_sq[ii]) / bias_correction3_sqrt + epsilon; 121 | MATH_T step_size_diff = lr * beta2 / bias_correction2; 122 | MATH_T step_size = lr / bias_correction1; 123 | 124 | if(no_prox){ 125 | r_p[ii] = r_p[ii] * (1 - lr * decay); 126 | r_p[ii] = r_p[ii] - step_size * r_exp_avg[ii] / denom; 127 | r_p[ii] = r_p[ii] - step_size_diff * r_exp_avg_diff[ii] / denom; 128 | } else { 129 | r_p[ii] = r_p[ii] - step_size * r_exp_avg[ii] / denom; 130 | r_p[ii] = r_p[ii] - step_size_diff * r_exp_avg_diff[ii] / denom; 131 | r_p[ii] = r_p[ii] / (1 + lr * decay); 132 | } 133 | } 134 | #pragma unroll 135 | for(int ii = 0; ii < ILP; ii++) 136 | { 137 | int i = i_start + threadIdx.x + ii*blockDim.x; 138 | if(i < n && i < chunk_size) 139 | { 140 | g[i] = r_g[ii]; 141 | p[i] = r_p[ii]; 142 | exp_avg[i] = r_exp_avg[ii]; 143 | exp_avg_sq[i] = r_exp_avg_sq[ii]; 144 | exp_avg_diff[i] = r_exp_avg_diff[ii]; 145 | } 146 | } 147 | } 148 | } 149 | }; 150 | 151 | void multi_tensor_adan_cuda( 152 | int chunk_size, 153 | at::Tensor noop_flag, 154 | std::vector> tensor_lists, 155 | const float beta1, 156 | const float beta2, 157 | const float beta3, 158 | const float bias_correction1, 159 | const float 
bias_correction2,
160 | const float bias_correction3_sqrt,
161 | const float lr,
162 | const float decay,
163 | const float epsilon,
164 | const bool no_prox,
165 | const float clip_global_grad_norm)
166 | {
167 | using namespace at;
168 | TORCH_CHECK(!tensor_lists.empty(), "tensor list cannot be empty")
169 | if (tensor_lists[0].empty()) {
170 | return;
171 | }
172 | 
173 | // Assume single type across p,g,m1,m2 now
174 | DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(
175 | tensor_lists[0][0].scalar_type(), 0, "adan",
176 | multi_tensor_apply<6>(
177 | BLOCK_SIZE,
178 | chunk_size,
179 | noop_flag,
180 | tensor_lists,
181 | AdanFunctor<scalar_t_0>(),
182 | beta1,
183 | beta2,
184 | beta3,
185 | bias_correction1,
186 | bias_correction2,
187 | bias_correction3_sqrt,
188 | lr,
189 | decay,
190 | epsilon,
191 | no_prox,
192 | clip_global_grad_norm
193 | ); )
194 | 
195 | AT_CUDA_CHECK(cudaGetLastError());
196 | 
197 | }
198 | 
-------------------------------------------------------------------------------- /fused_adan/pybind_adan.cpp: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | 
3 | #include "include/fused_adan_kernel.cuh"
4 | 
5 | // x is torch::Tensor
6 | #define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
7 | #define CHECK_CONTIGUOUS(x) \
8 | AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
9 | #define CHECK_INPUT(x) \
10 | CHECK_CUDA(x); \
11 | CHECK_CONTIGUOUS(x)
12 | 
13 | // C++ interface
14 | 
15 | void adan_single_tensor(at::Tensor& p,
16 | at::Tensor& p_copy,
17 | at::Tensor& g,
18 | at::Tensor& exp_avg,
19 | at::Tensor& exp_avg_sq,
20 | at::Tensor& exp_avg_diff,
21 | at::Tensor& pre_g,
22 | float beta1, float beta2, float beta3,
23 | float bias_correction1, float bias_correction2, float bias_correction3_sqrt,
24 | float lr, float decay, float eps, bool no_prox, float grad_scale) {
25 | CHECK_INPUT(p);
26 | if (p_copy.numel() > 0) CHECK_INPUT(p_copy);
27 | CHECK_INPUT(exp_avg);
28 | CHECK_INPUT(exp_avg_sq);
29 | CHECK_INPUT(exp_avg_diff);
30 | CHECK_INPUT(g);
31 | CHECK_INPUT(pre_g);
32 | int64_t num_elem = p.numel();
33 | AT_ASSERTM(exp_avg.numel() == num_elem,
34 | "number of elements in exp_avg and p tensors should be equal");
35 | AT_ASSERTM(exp_avg_sq.numel() == num_elem,
36 | "number of elements in exp_avg_sq and p tensors should be equal");
37 | AT_ASSERTM(exp_avg_diff.numel() == num_elem,
38 | "number of elements in exp_avg_diff and p tensors should be equal");
39 | AT_ASSERTM(g.numel() == num_elem,
40 | "number of elements in g and p tensors should be equal");
41 | AT_ASSERTM(pre_g.numel() == num_elem,
42 | "number of elements in pre_g and p tensors should be equal");
43 | AT_ASSERTM(p_copy.numel() == num_elem || p_copy.numel() == 0,
44 | "number of elements in p_copy and p tensors should be equal, or "
45 | "p_copy should be empty");
46 | 
47 | fused_adan_cuda(p, p_copy, g,
48 | exp_avg, exp_avg_sq, exp_avg_diff,
49 | pre_g, beta1, beta2, beta3,
50 | bias_correction1, bias_correction2, bias_correction3_sqrt,
51 | lr, decay, eps, no_prox, grad_scale);
52 | }
53 | 
54 | void adan_multi_tensor(
55 | int chunk_size,
56 | at::Tensor noop_flag,
57 | std::vector<std::vector<at::Tensor>> tensor_lists,
58 | const float beta1,
59 | const float beta2,
60 | const float beta3,
61 | const float bias_correction1,
62 | const float bias_correction2,
63 | const float bias_correction3_sqrt,
64 | const float lr,
65 | const float decay,
66 | const float epsilon,
67 | const bool no_prox,
68 | const float clip_global_grad_norm){
69 | multi_tensor_adan_cuda(
70 | chunk_size,
71 | noop_flag,
72 | tensor_lists,
73 | beta1,
74 | beta2,
75 | beta3,
76 | bias_correction1,
77 | bias_correction2,
78 | bias_correction3_sqrt,
79 | lr,
80 | decay,
81 | epsilon,
82 | no_prox,
83 | clip_global_grad_norm
84 | );
85 | }
86 | 
87 | 
88 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
89 | m.def("adan_single_tensor", &adan_single_tensor, "Adan optimized CUDA single tensor implementation.");
90 | m.def("adan_multi_tensor", &adan_multi_tensor, "Adan optimized CUDA multi tensor implementation.");
91 | }
92 | 
-------------------------------------------------------------------------------- /gpt2/README.md: --------------------------------------------------------------------------------
1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
2 | 
3 | This experiment is based on a wrapped repo of [Megatron-LM](https://github.com/bigcode-project/Megatron-LM), provided by [BigCode](https://www.bigcode-project.org/). The task of this experiment is code generation.
4 | 
5 | ## Usage of Adan in Megatron-LM
6 | 
7 | ### Two steps to use Adan
8 | 
9 | **Step 1.** put `adan.py` in the path `Megatron-LM/megatron/optimizer/adan.py` and import it in the `Megatron-LM/megatron/optimizer/__init__.py`:
10 | 
11 | ```python
12 | from .adan import Adan
13 | # ... then add this branch to the existing optimizer selection:
14 | elif args.optimizer == 'adan':
15 | optimizer = Adan(param_groups, lr=args.lr, weight_decay=args.weight_decay,
16 | betas=(args.adan_beta1, args.adan_beta2, args.adan_beta3),
17 | eps=args.adan_eps)
18 | ```
19 | 
20 | **Step 2.** add the following parameters to the file `Megatron-LM/megatron/arguments.py`.
21 | 
22 | ```python
23 | # beta3 is used by Adan but not by Adam.
24 | group.add_argument('--adan-beta1', type=float, default=0.98,
25 | help='First coefficient for computing the running '
26 | 'average of the gradient')
27 | group.add_argument('--adan-beta2', type=float, default=0.92,
28 | help='Second coefficient for computing the running '
29 | 'average of the gradient difference')
30 | group.add_argument('--adan-beta3', type=float, default=0.99,
31 | help='Third coefficient for computing the running '
32 | 'average of the squared gradient estimate')
33 | group.add_argument('--adan-eps', type=float, default=1e-08,
34 | help='Term added to the denominator to improve '
35 | 'numerical stability')
36 | group.add_argument('--optimizer', type=str, default='adam',
37 | choices=['adam', 'sgd', 'adan'],
38 | ```
39 | 
40 | - `adan-beta1,2,3`: optimizer betas for Adan.
41 | 
42 | - `adan-eps`: stabilizing parameter.
43 | 
44 | - `optimizer`: choices of optimizers.
45 | 
46 | ## Data Preparation
47 | 
48 | **Step 1.** download the dataset used for pre-training.
The dataset is collected and released by the [BigCode](https://www.bigcode-project.org/) project:
49 | 
50 | ```bash
51 | python ./download_dataset.py
52 | ```
53 | 
54 | **Step 2.** binarize the downloaded dataset:
55 | 
56 | ```bash
57 | python tools/preprocess_data.py \
58 | --input stack_python.json \
59 | --output-prefix codegpt \
60 | --vocab checkpoints/gpt2-adan/tokenizer/vocab.json \
61 | --json-key content \
62 | --dataset-impl mmap \
63 | --workers 16 \
64 | --chunk-size 25 \
65 | --tokenizer-type GPT2BPETokenizer \
66 | --merge-file checkpoints/gpt2-adan/tokenizer/merges.txt \
67 | --append-eod
68 | ```
69 | 
70 | ## Pre-training
71 | 
72 | - #### Installation and Export
73 | 
74 | ```bash
75 | pip install wandb; \
76 | pip install regex; \
77 | pip install pybind11; \
78 | pip install nltk; \
79 | export MASTER_NODE=localhost; \
80 | export NUM_NODES=8; \
81 | export NODE_RANK=0; \
82 | export WANDB_API_KEY=$YOUR_API; \
83 | export WANDB_NAME=$PROJECT_NAME; \
84 | export WANDB_NOTES=$NOTES; \
85 | ```
86 | 
87 | - #### Training
88 | 
89 | `bash ./pretrain.sh`
90 | 
91 | ## Results and Logs on GPT2-345m
92 | 
93 | We provide the config and log for GPT2-345m pre-trained on the dataset from [BigCode](https://www.bigcode-project.org/) and evaluated on the [HumanEval](https://github.com/openai/human-eval) benchmark in a zero-shot setting. [HumanEval](https://github.com/openai/human-eval) is used to measure functional correctness for synthesizing programs from docstrings. It consists of 164 original programming problems, assessing language comprehension, algorithms, and simple mathematics, with some comparable to simple software interview questions. We set `temperature = 0.8` during evaluation.
94 | 
95 | | | Steps | pass@1 | pass@10 | pass@100 | Download |
96 | | ---------------- | :---: | :----: | :-----: | :------: | :------------------------------------------------------------------------: |
97 | | GPT2-345m (Adam) | 300k | 0.0840 | 0.209 | 0.360 | [log&config](https://github.com/sail-sg/Adan/files/10362486/gpt2-adam.log) |
98 | | GPT2-345m (Adan) | 150k | 0.0843 | 0.221 | 0.377 | [log&config](https://github.com/sail-sg/Adan/files/10362485/gpt2-adan.log) |
99 | 
-------------------------------------------------------------------------------- /gpt2/checkpoints/gpt2-adan/gpt_args: --------------------------------------------------------------------------------
1 | --num-layers 24
2 | --hidden-size 1024
3 | --num-attention-heads 16
4 | --seq-length 2048
5 | --max-position-embeddings 2048
6 | --micro-batch-size 10
7 | --global-batch-size 400
8 | --lr 0.0025
9 | --optimizer adan
10 | --train-iters 150000
11 | --lr-decay-iters 150000
12 | --lr-decay-style cosine
13 | --lr-warmup-iters 4000
14 | --weight-decay .05
15 | --adan-beta3 .95
16 | --fp16
17 | --log-interval 10
18 | --save-interval 5000
19 | --eval-interval 200
20 | --eval-iters 10
21 | --clip-grad 1.0
22 | 
-------------------------------------------------------------------------------- /gpt2/download_dataset.py: --------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | 
3 | dataset = load_dataset('bigcode/the-stack-dedup',
4 | use_auth_token=True,
5 | split='train',
6 | cache_dir='stack_dedup_python',
7 | data_dir='data/python')
8 | 
9 | dataset.to_json('stack_python.json', num_proc=16)
10 | 
-------------------------------------------------------------------------------- /gpt2/pretrain.sh:
--------------------------------------------------------------------------------
1 | set -u # stop on unset variables
2 | 
3 | GPUS_PER_NODE=8
4 | MASTER_ADDR=${MASTER_NODE}
5 | MASTER_PORT=6000
6 | NNODES=${NUM_NODES}
7 | # NODE_RANK=0 # set via the environment
8 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
9 | 
10 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
11 | 
12 | CHECKPOINT_NAME=gpt2-adan
13 | CHECKPOINT_PATH=checkpoints/$CHECKPOINT_NAME # Directory to store the checkpoints
14 | PREPROCESSED_DATA=preprocessed # Directory containing the preprocessed dataset. To preprocess a dataset, see https://github.com/bigcode-project/Megatron-LM#data-preprocessing
15 | VOCAB_FILE=${CHECKPOINT_PATH}/tokenizer/vocab.json
16 | MERGE_FILE=${CHECKPOINT_PATH}/tokenizer/merges.txt
17 | DATA_PATH=${PREPROCESSED_DATA}/codegpt_content_document
18 | 
19 | GPT_ARGS=$(cat ${CHECKPOINT_PATH}/gpt_args)
20 | 
21 | TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"
22 | 
23 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \
24 | pretrain_gpt.py \
25 | --tensor-model-parallel-size 1 \
26 | --pipeline-model-parallel-size 1 \
27 | --recompute-activations \
28 | $GPT_ARGS \
29 | --vocab-file $VOCAB_FILE \
30 | --merge-file $MERGE_FILE \
31 | --save $CHECKPOINT_PATH \
32 | --load $CHECKPOINT_PATH \
33 | --data-path $DATA_PATH \
34 | --wandb-entity-name xyxie \
35 | --wandb-project-name $WANDB_NAME \
36 | $TENSORBOARD_ARGS
37 | # To finetune from a pretrained model, append a trailing \ to the line above and uncomment:
38 | # --finetune \
39 | # --finetune-from /directory/containing/pretrained/model
40 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from setuptools import setup
4 | 
5 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
6 | from torch.cuda import is_available
7 | 
8 | build_cuda_ext = is_available() or os.getenv('FORCE_CUDA', '0') == '1'
9 | 
10 | cuda_extension = None
11 | if "--unfused" in sys.argv:
12 | print("Building unfused version of adan")
13 | sys.argv.remove("--unfused")
14 | elif build_cuda_ext:
15 | cuda_extension = CUDAExtension(
16 | 'fused_adan',
17 | sources=['fused_adan/pybind_adan.cpp', './fused_adan/fused_adan_kernel.cu', './fused_adan/multi_tensor_adan_kernel.cu']
18 | )
19 | 
20 | setup(
21 | name='adan',
22 | python_requires='>=3.8',
23 | version='0.0.2',
24 | install_requires=['torch'],
25 | py_modules=['adan'],
26 | description=(
27 | 'Adan: Adaptive Nesterov Momentum Algorithm for '
28 | 'Faster Optimizing Deep Models'
29 | ),
30 | author=(
31 | 'Xie, Xingyu and Zhou, Pan and Li, Huan and '
32 | 'Lin, Zhouchen and Yan, Shuicheng'
33 | ),
34 | ext_modules=[cuda_extension] if cuda_extension is not None else [],
35 | cmdclass={'build_ext': BuildExtension} if build_cuda_ext else {},
36 | )
--------------------------------------------------------------------------------