├── CV
├── MAE
│ ├── README.md
│ ├── engine_finetune.py
│ ├── engine_pretrain.py
│ ├── exp_results
│ │ └── MAE
│ │ │ ├── base
│ │ │ ├── log_base_ft.txt
│ │ │ └── log_base_pretrain.txt
│ │ │ └── large
│ │ │ ├── log_large_ft.txt
│ │ │ └── log_large_pretrain.txt
│ ├── main_finetune.py
│ ├── main_linprobe.py
│ ├── main_pretrain.py
│ ├── models_mae.py
│ ├── models_vit.py
│ └── util
│ │ ├── crop.py
│ │ ├── datasets.py
│ │ ├── lars.py
│ │ ├── lr_decay.py
│ │ ├── lr_sched.py
│ │ ├── misc.py
│ │ └── pos_embed.py
└── timm
│ ├── README.md
│ ├── exp_results
│ ├── ConvNext
│ │ └── small
│ │ │ ├── args_cvnext_150.yaml
│ │ │ ├── args_cvnext_300.yaml
│ │ │ ├── summary_cvnext_150.csv
│ │ │ └── summary_cvnext_300.csv
│ ├── ResNet
│ │ ├── Res101
│ │ │ ├── args_res101_100.yaml
│ │ │ ├── args_res101_200.yaml
│ │ │ ├── args_res101_300.yaml
│ │ │ ├── summary_res101_100.csv
│ │ │ ├── summary_res101_200.csv
│ │ │ ├── summary_res101_300.csv
│ │ │ └── summary_res50_200.csv
│ │ └── Res50
│ │ │ ├── args_res50_100.yaml
│ │ │ ├── args_res50_200.yaml
│ │ │ ├── args_res50_300.yaml
│ │ │ ├── summary_res50_100.csv
│ │ │ ├── summary_res50_200.csv
│ │ │ └── summary_res50_300.csv
│ └── ViT
│ │ ├── base
│ │ ├── args_vit-B_150.yaml
│ │ ├── args_vit-B_300.yaml
│ │ ├── args_vit-B_300_T.yaml
│ │ ├── summary_vit-B_150.csv
│ │ ├── summary_vit-B_300.csv
│ │ └── summary_vit-B_300_T.csv
│ │ └── small
│ │ ├── args_vit-s_150-I.yaml
│ │ ├── args_vit-s_150.yaml
│ │ ├── args_vit-s_300-I.yaml
│ │ ├── args_vit-s_300.yaml
│ │ ├── summary_vit-s_150-I.csv
│ │ ├── summary_vit-s_150.csv
│ │ ├── summary_vit-s_300-I.csv
│ │ └── summary_vit-s_300.csv
│ ├── optim_factory.py
│ ├── sam.py
│ ├── supervised.md
│ └── train.py
├── LICENSE
├── NLP
├── BERT
│ ├── README.md
│ ├── adan.py
│ ├── config
│ │ ├── finetuning
│ │ │ ├── acc_test.py
│ │ │ ├── cola-adan.yaml
│ │ │ ├── cola.yaml
│ │ │ ├── mnli-adan.yaml
│ │ │ ├── mnli.yaml
│ │ │ ├── qnli-adan.yaml
│ │ │ ├── qnli.yaml
│ │ │ ├── qqp-adan.yaml
│ │ │ ├── qqp.yaml
│ │ │ ├── rte-adan.yaml
│ │ │ ├── rte.yaml
│ │ │ ├── sst_2-adan.yaml
│ │ │ ├── sst_2.yaml
│ │ │ ├── sts_b-adan.yaml
│ │ │ └── sts_b.yaml
│ │ └── pretraining
│ │ │ ├── base.yaml
│ │ │ ├── bert-adan.yaml
│ │ │ └── bert-base.yaml
│ └── exp_results
│ │ └── pretrain
│ │ ├── full_config-adam.yaml
│ │ ├── full_config-adan.yaml
│ │ ├── hydra_train-adam.log
│ │ ├── hydra_train-adan-2.log
│ │ └── hydra_train-adan.log
└── Transformer-XL
│ ├── README.md
│ ├── data_utils.py
│ ├── eval.py
│ ├── exp_results
│ ├── log-100k.txt
│ ├── log-200k.txt
│ ├── log-50k.txt
│ └── log-adam.txt
│ ├── mem_transformer.py
│ ├── run_wt103_adan.sh
│ ├── train.py
│ └── utils
│ ├── adaptive_softmax.py
│ ├── data_parallel.py
│ ├── exp_utils.py
│ ├── log_uniform_sampler.py
│ ├── proj_adaptive_softmax.py
│ └── vocabulary.py
├── README.md
├── adan.py
├── dreamfusion
└── README.md
├── fused_adan
├── README.md
├── fused_adan_kernel.cu
├── include
│ ├── fused_adan_kernel.cuh
│ ├── multi_tensor_apply.cuh
│ └── type_shim.h
├── multi_tensor_adan_kernel.cu
└── pybind_adan.cpp
├── gpt2
├── README.md
├── checkpoints
│ └── gpt2-adan
│ │ ├── gpt_args
│ │ └── tokenizer
│ │ ├── merges.txt
│ │ └── vocab.json
├── download_dataset.py
└── pretrain.sh
└── setup.py

/CV/MAE/README.md:
--------------------------------------------------------------------------------
1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
2 | 
3 | We provide instructions for modifying the official training and fine-tuning files used in [MAE](https://github.com/facebookresearch/mae) so that you can use
Adan to train MAE. **Please follow the MAE instructions to install the necessary packages.**
4 | 
5 | 
6 | 
7 | ## Environment
8 | 
9 | Our experiments for this task are based on the following package versions.
10 | 
11 | ```python
12 | torch.__version__ = '1.7.1+cu110'
13 | torchvision.__version__ = '0.8.2+cu110'
14 | timm.__version__ = '0.4.5'
15 | torchaudio.__version__ = '0.7.2'
16 | ```
17 | If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:mae](https://hub.docker.com/repository/docker/xyxie/adan-image).
18 | 
19 | 
20 | 
21 | ## Usage of Adan for MAE
22 | 
23 | ### Two steps to use Adan
24 | 
25 | **Step 1.** Add the following arguments to `main_pretrain.py` and `main_finetune.py`.
26 | 
27 | ```python
28 | parser.add_argument('--use-adan', action='store_true', default=False, help='whether to use Adan')
29 | parser.add_argument('--max-grad-norm', type=float, default=0.0, help='if the l2 norm of the gradient is larger than this hyper-parameter, then we clip the gradient (default: 0.0, no gradient clipping)')
30 | parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', help='optimizer epsilon to avoid the bad case where the second-order moment is zero (default: None, use the opt default 1e-8 in Adan)')
31 | parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='optimizer betas in Adan (default: None, use the opt default [0.98, 0.92, 0.99] in Adan)')
32 | ```
33 | 
34 | * `use-adan`: whether to use Adan. The default optimizer is AdamW.
35 | 
36 | * `max-grad-norm`: the threshold for gradient clipping; the default `0.0` disables clipping.
37 | 
38 | * `opt-eps`: optimizer epsilon to avoid the bad case where the second-order moment is zero.
39 | 
40 | * `opt-betas`: optimizer betas for Adan.
41 | 
42 | 
43 | 
44 | **Step 2.** Create the Adan optimizer as follows. In this step, you can directly replace the vanilla optimizer creation:
45 | 
46 | ```python
47 | # following timm: set wd as 0 for bias and norm layers
48 | param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay)
49 | if args.use_adan:
50 |     if args.bias_decay:  # assumes a --bias-decay flag is also added (see the timm README)
51 |         param = model_without_ddp.parameters()
52 |     else:
53 |         param = param_groups
54 |         args.weight_decay = 0.0
55 |     optimizer = Adan(param, weight_decay=args.weight_decay,
56 |                      lr=args.lr, betas=args.opt_betas,
57 |                      eps=args.opt_eps, max_grad_norm=args.max_grad_norm)
58 | else:
59 |     optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
60 | ```
61 | 
62 | 
63 | 
64 | ## MAE Pre-training
65 | 
66 | ```shell
67 | python main_pretrain.py \
68 |     --batch_size ${BS} --accum_iter 1 \
69 |     --model ${MODEL_NAME} --norm_pix_loss --mask_ratio 0.75 \
70 |     --epochs 800 \
71 |     --lr ${LR} --weight_decay 0.02 --warmup_epochs ${WR_EPOCH} \
72 |     --min_lr ${MIN_LR} \
73 |     --opt-betas 0.98 0.92 0.90 --opt-eps 1e-8 --max-grad-norm 10.0 \
74 |     --use-adan \
75 |     --data_path ${IMAGENET_DIR} \
76 |     --output_dir ${OUT_DIR}
77 | ```
78 | 
79 | - The pre-training file `main_pretrain.py` comes from [MAE](https://github.com/facebookresearch/mae).
80 | - We use **16** A100 GPUs for MAE-Base and **32** A100 GPUs for MAE-Large.
81 | - There are some differences in the hyper-parameters for MAE-Base and MAE-Large; a filled-in MAE-Base example follows the table:
82 | 
83 | |           | MODEL_NAME            | LR     | BS   | MIN_LR | WR_EPOCH |
84 | | :-------: | :-------------------: | :----: | :--: | :----: | :------: |
85 | | MAE-Base  | mae_vit_base_patch16  | 2.0e-3 | 256  | 1e-8   | 40       |
86 | | MAE-Large | mae_vit_large_patch16 | 2.2e-3 | 128  | 1e-4   | 80       |
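For concreteness, here is the same command with the MAE-Base row of the table substituted in; the dataset and output paths are placeholders you should adapt:

```shell
python main_pretrain.py \
    --batch_size 256 --accum_iter 1 \
    --model mae_vit_base_patch16 --norm_pix_loss --mask_ratio 0.75 \
    --epochs 800 \
    --lr 2.0e-3 --weight_decay 0.02 --warmup_epochs 40 \
    --min_lr 1e-8 \
    --opt-betas 0.98 0.92 0.90 --opt-eps 1e-8 --max-grad-norm 10.0 \
    --use-adan \
    --data_path /path/to/imagenet \
    --output_dir ./output/mae_base_pretrain
```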
87 | 
88 | 
89 | 
90 | ## MAE Fine-tuning
91 | 
92 | ```shell
93 | python main_finetune.py \
94 |     --accum_iter 1 \
95 |     --batch_size ${BS} \
96 |     --model ${MODEL_NAME} \
97 |     --finetune ${PATH_TO_PRETRAINED_MODEL} \
98 |     --epochs ${EPOCH} \
99 |     --lr 1.5e-2 --layer_decay ${LAYER_DECAY} \
100 |     --min_lr ${MIN_LR} \
101 |     --opt-betas 0.98 0.92 0.99 \
102 |     --opt-eps 1e-8 --max-grad-norm 0 \
103 |     --use-adan --warmup_epochs ${WR_EPOCH} \
104 |     --weight_decay ${WD} --drop_path ${DROP_PATH} \
105 |     --mixup 0.8 --cutmix 1.0 --reprob 0.25 \
106 |     --dist_eval --data_path ${IMAGENET_DIR}
107 | ```
108 | 
109 | - The fine-tuning file `main_finetune.py` comes from [MAE](https://github.com/facebookresearch/mae).
110 | - We use **16** A100 GPUs for MAE-Base and **32** A100 GPUs for MAE-Large.
111 | - There are some differences in the hyper-parameters for MAE-Base and MAE-Large:
112 | 
113 | |           | MODEL_NAME        | EPOCH | MIN_LR | BS   | LAYER_DECAY | WR_EPOCH | WD   | DROP_PATH |
114 | | :-------: | :---------------: | :---: | :----: | :--: | :---------: | :------: | :--: | :-------: |
115 | | MAE-Base  | vit_base_patch16  | 100   | 1e-6   | 128  | 0.65        | 40       | 5e-3 | 0.1       |
116 | | MAE-Large | vit_large_patch16 | 50    | 1e-5   | 64   | 0.75        | 10       | 1e-3 | 0.2       |
117 | 
118 | 
119 | 
120 | ## Results and Logs
121 | 
122 | |                | MAE-Base | MAE-Large |
123 | | :------------: | :------: | :-------: |
124 | | Top-1 Acc. (%) | 83.8     | 85.9      |
125 | | download       | [log-pretrain](./exp_results/MAE/base/log_base_pretrain.txt)/[log-finetune](./exp_results/MAE/base/log_base_ft.txt)/model | [log-pretrain](./exp_results/MAE/large/log_large_pretrain.txt)/[log-finetune](./exp_results/MAE/large/log_large_ft.txt)/model |
126 | 
--------------------------------------------------------------------------------
/CV/MAE/engine_finetune.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | # -------------------------------------------------------- 7 | # References: 8 | # DeiT: https://github.com/facebookresearch/deit 9 | # BEiT: https://github.com/microsoft/unilm/tree/master/beit 10 | # -------------------------------------------------------- 11 | 12 | import math 13 | import sys 14 | from typing import Iterable, Optional 15 | 16 | import torch 17 | 18 | from timm.data import Mixup 19 | from timm.utils import accuracy 20 | 21 | import util.misc as misc 22 | import util.lr_sched as lr_sched 23 | 24 | 25 | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, 26 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 27 | device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, 28 | mixup_fn: Optional[Mixup] = None, log_writer=None, 29 | args=None): 30 | model.train(True) 31 | metric_logger = misc.MetricLogger(delimiter=" ") 32 | metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) 33 | header = 'Epoch: [{}]'.format(epoch) 34 | print_freq = 20 35 | 36 | accum_iter = args.accum_iter 37 | 38 | optimizer.zero_grad() 39 | 40 | if log_writer is not None: 41 | print('log_dir: {}'.format(log_writer.log_dir)) 42 | 43 | for data_iter_step, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): 44 | 45 | # we use a per iteration (instead of per epoch) lr scheduler 46 | if data_iter_step % accum_iter == 0: 47 | lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) 48 | 49 | samples = samples.to(device, non_blocking=True) 50 | targets = targets.to(device, non_blocking=True) 51 | 52 | if mixup_fn is not None: 53 | samples, targets = mixup_fn(samples, targets) 54 | 55 | with torch.cuda.amp.autocast(): 56 | outputs = model(samples) 57 | loss = criterion(outputs, targets) 58 | 59 | loss_value = loss.item() 60 | 61 | 62 | 63 | loss /= accum_iter 64 | loss_scaler(loss, optimizer, clip_grad=max_norm, 65 | parameters=model.parameters(), create_graph=False, 66 | update_grad=(data_iter_step + 1) % accum_iter == 0) 67 | if (data_iter_step + 1) % accum_iter == 0: 68 | optimizer.zero_grad() 69 | 70 | torch.cuda.synchronize() 71 | 72 | metric_logger.update(loss=loss_value) 73 | min_lr = 10. 74 | max_lr = 0. 75 | for group in optimizer.param_groups: 76 | min_lr = min(min_lr, group["lr"]) 77 | max_lr = max(max_lr, group["lr"]) 78 | 79 | metric_logger.update(lr=max_lr) 80 | 81 | loss_value_reduce = misc.all_reduce_mean(loss_value) 82 | if not math.isfinite(loss_value_reduce): 83 | print("Loss is {}, stopping training".format(loss_value_reduce)) 84 | sys.exit(1) 85 | if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: 86 | """ We use epoch_1000x as the x-axis in tensorboard. 87 | This calibrates different curves when batch size changes. 
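For example, one epoch always spans 1,000 units on this axis, regardless of how many iterations it contains.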
88 | """ 89 | epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) 90 | log_writer.add_scalar('loss', loss_value_reduce, epoch_1000x) 91 | log_writer.add_scalar('lr', max_lr, epoch_1000x) 92 | 93 | # gather the stats from all processes 94 | metric_logger.synchronize_between_processes() 95 | print("Averaged stats:", metric_logger) 96 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 97 | 98 | 99 | @torch.no_grad() 100 | def evaluate(data_loader, model, device): 101 | criterion = torch.nn.CrossEntropyLoss() 102 | 103 | metric_logger = misc.MetricLogger(delimiter=" ") 104 | header = 'Test:' 105 | 106 | # switch to evaluation mode 107 | model.eval() 108 | 109 | for batch in metric_logger.log_every(data_loader, 10, header): 110 | images = batch[0] 111 | target = batch[-1] 112 | images = images.to(device, non_blocking=True) 113 | target = target.to(device, non_blocking=True) 114 | 115 | # compute output 116 | with torch.cuda.amp.autocast(): 117 | output = model(images) 118 | loss = criterion(output, target) 119 | 120 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 121 | 122 | batch_size = images.shape[0] 123 | metric_logger.update(loss=loss.item()) 124 | metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) 125 | metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) 126 | # gather the stats from all processes 127 | metric_logger.synchronize_between_processes() 128 | print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' 129 | .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) 130 | 131 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} -------------------------------------------------------------------------------- /CV/MAE/engine_pretrain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # -------------------------------------------------------- 7 | # References: 8 | # DeiT: https://github.com/facebookresearch/deit 9 | # BEiT: https://github.com/microsoft/unilm/tree/master/beit 10 | # -------------------------------------------------------- 11 | import math 12 | import sys 13 | from typing import Iterable 14 | 15 | import torch 16 | 17 | import util.misc as misc 18 | import util.lr_sched as lr_sched 19 | 20 | 21 | def train_one_epoch(model: torch.nn.Module, 22 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 23 | device: torch.device, epoch: int, loss_scaler, 24 | log_writer=None, 25 | args=None): 26 | model.train(True) 27 | metric_logger = misc.MetricLogger(delimiter=" ") 28 | metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) 29 | header = 'Epoch: [{}]'.format(epoch) 30 | print_freq = 20 31 | 32 | accum_iter = args.accum_iter 33 | 34 | optimizer.zero_grad() 35 | 36 | if log_writer is not None: 37 | print('log_dir: {}'.format(log_writer.log_dir)) 38 | 39 | for data_iter_step, (samples, _) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): 40 | 41 | # we use a per iteration (instead of per epoch) lr scheduler 42 | if data_iter_step % accum_iter == 0: 43 | lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) 44 | 45 | samples = samples.to(device, non_blocking=True) 46 | 47 | with torch.cuda.amp.autocast(): 48 | loss, _, _ = model(samples, mask_ratio=args.mask_ratio) 49 | 50 | loss_value = loss.item() 51 | 52 | 53 | 54 | loss /= accum_iter 55 | loss_scaler(loss, optimizer, parameters=model.parameters(), 56 | update_grad=(data_iter_step + 1) % accum_iter == 0) 57 | if (data_iter_step + 1) % accum_iter == 0: 58 | optimizer.zero_grad() 59 | 60 | torch.cuda.synchronize() 61 | 62 | metric_logger.update(loss=loss_value) 63 | 64 | lr = optimizer.param_groups[0]["lr"] 65 | metric_logger.update(lr=lr) 66 | 67 | loss_value_reduce = misc.all_reduce_mean(loss_value) 68 | if not math.isfinite(loss_value_reduce): 69 | print("Loss is {}, stopping training".format(loss_value_reduce)) 70 | sys.exit(1) 71 | if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: 72 | """ We use epoch_1000x as the x-axis in tensorboard. 73 | This calibrates different curves when batch size changes. 
74 | """ 75 | epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) 76 | log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) 77 | log_writer.add_scalar('lr', lr, epoch_1000x) 78 | 79 | 80 | # gather the stats from all processes 81 | metric_logger.synchronize_between_processes() 82 | print("Averaged stats:", metric_logger) 83 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} -------------------------------------------------------------------------------- /CV/MAE/exp_results/MAE/large/log_large_ft.txt: -------------------------------------------------------------------------------- 1 | {"train_lr": 0.0007476019200000001, "train_loss": 5.9094133159518245, "test_loss": 1.7714076134562493, "test_acc1": 61.33637235611582, "test_acc5": 84.77687142609177, "epoch": 0, "n_parameters": 304326632} 2 | {"train_lr": 0.0022476019200000003, "train_loss": 4.501337738275528, "test_loss": 1.1959131537377834, "test_acc1": 72.27087332465598, "test_acc5": 91.66066860084875, "epoch": 1, "n_parameters": 304326632} 3 | {"train_lr": 0.0037476019200000004, "train_loss": 4.119643689954281, "test_loss": 1.0854404755681752, "test_acc1": 75.52783110144804, "test_acc5": 93.39011516131733, "epoch": 2, "n_parameters": 304326632} 4 | {"train_lr": 0.005247601920000002, "train_loss": 3.9008864871740343, "test_loss": 1.0289268112555146, "test_acc1": 76.92938261289896, "test_acc5": 94.09788868386092, "epoch": 3, "n_parameters": 304326632} 5 | {"train_lr": 0.006747601919999998, "train_loss": 3.76051225707531, "test_loss": 0.9720380315184594, "test_acc1": 78.21497122713639, "test_acc5": 94.63371721293326, "epoch": 4, "n_parameters": 304326632} 6 | {"train_lr": 0.00824760192, "train_loss": 3.651956864875555, "test_loss": 0.9415295435115695, "test_acc1": 78.97672746285214, "test_acc5": 95.09756876746584, "epoch": 5, "n_parameters": 304326632} 7 | {"train_lr": 0.009747601920000001, "train_loss": 3.5677191224038602, "test_loss": 0.9388785093277693, "test_acc1": 79.57453616627957, "test_acc5": 95.29950415058465, "epoch": 6, "n_parameters": 304326632} 8 | {"train_lr": 0.011247601919999997, "train_loss": 3.507449230492115, "test_loss": 0.9052619117870927, "test_acc1": 80.08437302847818, "test_acc5": 95.49944016815986, "epoch": 7, "n_parameters": 304326632} 9 | {"train_lr": 0.012747601919999994, "train_loss": 3.4423172294437885, "test_loss": 0.8388488055765628, "test_acc1": 80.4342610673575, "test_acc5": 95.76935380052772, "epoch": 8, "n_parameters": 304326632} 10 | {"train_lr": 0.014247601920000002, "train_loss": 3.3948125799477102, "test_loss": 0.8529021150618792, "test_acc1": 80.73616445743343, "test_acc5": 95.86732244598355, "epoch": 9, "n_parameters": 304326632} 11 | {"train_lr": 0.01499233375709719, "train_loss": 3.342990658354759, "test_loss": 0.8151264287903905, "test_acc1": 81.03206976010719, "test_acc5": 95.96529109723585, "epoch": 10, "n_parameters": 304326632} 12 | {"train_lr": 0.014946245730243689, "train_loss": 3.288912183743715, "test_loss": 0.8095201044529676, "test_acc1": 81.51191621381963, "test_acc5": 96.16522712243801, "epoch": 11, "n_parameters": 304326632} 13 | {"train_lr": 0.01485427994899793, "train_loss": 3.238141927015781, "test_loss": 0.7871933653950691, "test_acc1": 82.07973450799821, "test_acc5": 96.36716250067556, "epoch": 12, "n_parameters": 304326632} 14 | {"train_lr": 0.014717003412983015, "train_loss": 3.1956452232837678, "test_loss": 0.7688306730240584, "test_acc1": 82.2496801315022, "test_acc5": 96.52111323888074, "epoch": 13, "n_parameters": 
304326632} 15 | {"train_lr": 0.014535262477692571, "train_loss": 3.1652532088041307, "test_loss": 0.7522821754962206, "test_acc1": 82.66154833756725, "test_acc5": 96.58309339943104, "epoch": 14, "n_parameters": 304326632} 16 | {"train_lr": 0.014310177636427614, "train_loss": 3.121457608240843, "test_loss": 0.7477796772867441, "test_acc1": 82.73952337029799, "test_acc5": 96.67906269169892, "epoch": 15, "n_parameters": 304326632} 17 | {"train_lr": 0.014043136612082945, "train_loss": 3.0966577651739122, "test_loss": 0.753467806391418, "test_acc1": 82.9974408353359, "test_acc5": 96.78502878132953, "epoch": 16, "n_parameters": 304326632} 18 | {"train_lr": 0.013735785801373714, "train_loss": 3.0689808761537076, "test_loss": 0.7341048694401979, "test_acc1": 83.14339413813727, "test_acc5": 96.79302621802991, "epoch": 17, "n_parameters": 304326632} 19 | {"train_lr": 0.01339002012425247, "train_loss": 3.029768516147137, "test_loss": 0.725501059666276, "test_acc1": 83.34532951271389, "test_acc5": 96.81301982526358, "epoch": 18, "n_parameters": 304326632} 20 | {"train_lr": 0.01300797134109743, "train_loss": 3.0120413874208927, "test_loss": 0.7309531949833036, "test_acc1": 83.50927706414586, "test_acc5": 97.00095968694924, "epoch": 19, "n_parameters": 304326632} 21 | {"train_lr": 0.012591994909700855, "train_loss": 2.9821670488238334, "test_loss": 0.7118158831447363, "test_acc1": 83.61924186945724, "test_acc5": 97.01895393230026, "epoch": 20, "n_parameters": 304326632} 22 | {"train_lr": 0.012144655463088535, "train_loss": 2.962305991309881, "test_loss": 0.7047568802535534, "test_acc1": 83.74520156128774, "test_acc5": 97.07493601513458, "epoch": 21, "n_parameters": 304326632} 23 | {"train_lr": 0.011668710997704269, "train_loss": 2.938569626682997, "test_loss": 0.7103257965296507, "test_acc1": 83.9051503784292, "test_acc5": 97.10092768666078, "epoch": 22, "n_parameters": 304326632} 24 | {"train_lr": 0.01116709586944475, "train_loss": 2.91352473244071, "test_loss": 0.7010805677436293, "test_acc1": 84.26103648877037, "test_acc5": 97.12492002376135, "epoch": 23, "n_parameters": 304326632} 25 | {"train_lr": 0.010642902702379645, "train_loss": 2.8938853970646856, "test_loss": 0.692104572802782, "test_acc1": 84.34101090580701, "test_acc5": 97.2508797091852, "epoch": 24, "n_parameters": 304326632} 26 | {"train_lr": 0.010099363321695844, "train_loss": 2.874984144228697, "test_loss": 0.6802691061235965, "test_acc1": 84.30902114603967, "test_acc5": 97.22488802759142, "epoch": 25, "n_parameters": 304326632} 27 | {"train_lr": 0.009539828828420426, "train_loss": 2.852267661267519, "test_loss": 0.6850866706669331, "test_acc1": 84.41898594143599, "test_acc5": 97.29486562941827, "epoch": 26, "n_parameters": 304326632} 28 | {"train_lr": 0.00896774893876856, "train_loss": 2.837763201504946, "test_loss": 0.6828102863952518, "test_acc1": 84.65091173876118, "test_acc5": 97.3268554027616, "epoch": 27, "n_parameters": 304326632} 29 | {"train_lr": 0.008386650715495802, "train_loss": 2.81947190862298, "test_loss": 0.6762189302407206, "test_acc1": 84.7188899800782, "test_acc5": 97.34884836501368, "epoch": 28, "n_parameters": 304326632} 30 | {"train_lr": 0.00780011682238341, "train_loss": 2.8003201848089696, "test_loss": 0.6725861196033657, "test_acc1": 84.82285671179217, "test_acc5": 97.32285668235212, "epoch": 29, "n_parameters": 304326632} 31 | {"train_lr": 0.007211763435924688, "train_loss": 2.7866385659873485, "test_loss": 0.671936163790524, "test_acc1": 84.95481448881304, "test_acc5": 97.38883556682028, "epoch": 30, 
"n_parameters": 304326632} 32 | {"train_lr": 0.006625217950394574, "train_loss": 2.7746526652514936, "test_loss": 0.6678782022558153, "test_acc1": 84.89283432917799, "test_acc5": 97.4168266078561, "epoch": 31, "n_parameters": 304326632} 33 | {"train_lr": 0.006044096613757472, "train_loss": 2.7576689450562, "test_loss": 0.6610171441733838, "test_acc1": 85.12675947130145, "test_acc5": 97.46681061770316, "epoch": 32, "n_parameters": 304326632} 34 | {"train_lr": 0.00547198223229625, "train_loss": 2.7347684874773024, "test_loss": 0.6683760618418455, "test_acc1": 85.13675626942688, "test_acc5": 97.39683300702906, "epoch": 33, "n_parameters": 304326632} 35 | {"train_lr": 0.004912402081419917, "train_loss": 2.723790532976389, "test_loss": 0.6556776543706655, "test_acc1": 85.26271595713884, "test_acc5": 97.47680741643875, "epoch": 34, "n_parameters": 304326632} 36 | {"train_lr": 0.004368806158837928, "train_loss": 2.7088236126720906, "test_loss": 0.654360967874527, "test_acc1": 85.24072299839516, "test_acc5": 97.47280869541913, "epoch": 35, "n_parameters": 304326632} 37 | {"train_lr": 0.003844545914176986, "train_loss": 2.694744017738104, "test_loss": 0.6538684133067727, "test_acc1": 85.33869164430858, "test_acc5": 97.53278950156115, "epoch": 36, "n_parameters": 304326632} 38 | {"train_lr": 0.0033428535861796433, "train_loss": 2.6908254801392557, "test_loss": 0.6542927216365934, "test_acc1": 85.39467373049877, "test_acc5": 97.53878758217536, "epoch": 37, "n_parameters": 304326632} 39 | {"train_lr": 0.002866822274877639, "train_loss": 2.671278304463625, "test_loss": 0.6524978142604232, "test_acc1": 85.49464174439643, "test_acc5": 97.49280229456822, "epoch": 38, "n_parameters": 304326632} 40 | {"train_lr": 0.0024193868716016085, "train_loss": 2.657200170958042, "test_loss": 0.650126696806401, "test_acc1": 85.59660910492285, "test_acc5": 97.52879077825345, "epoch": 39, "n_parameters": 304326632} 41 | {"train_lr": 0.0020033059644001382, "train_loss": 2.652334677708149, "test_loss": 0.6520910476334393, "test_acc1": 85.52263278619495, "test_acc5": 97.50279909849014, "epoch": 40, "n_parameters": 304326632} 42 | {"train_lr": 0.001621144830427048, "train_loss": 2.6431411161601543, "test_loss": 0.647436778191477, "test_acc1": 85.6365963131361, "test_acc5": 97.54478566339972, "epoch": 41, "n_parameters": 304326632} 43 | {"train_lr": 0.0012752596201547688, "train_loss": 2.637372990643978, "test_loss": 0.6462450991012156, "test_acc1": 85.61260398945898, "test_acc5": 97.54678502360446, "epoch": 42, "n_parameters": 304326632} 44 | {"train_lr": 0.0009677828309231273, "train_loss": 2.6305615900933743, "test_loss": 0.6458461854793132, "test_acc1": 85.75455856750352, "test_acc5": 97.53878758278552, "epoch": 43, "n_parameters": 304326632} 45 | {"train_lr": 0.0007006101593841485, "train_loss": 2.627352162593603, "test_loss": 0.6431183713674545, "test_acc1": 85.75455856231719, "test_acc5": 97.5627799058525, "epoch": 44, "n_parameters": 304326632} 46 | {"train_lr": 0.0004753888139017931, "train_loss": 2.6245033386409284, "test_loss": 0.6450332224182784, "test_acc1": 85.80654192580981, "test_acc5": 97.56877798524638, "epoch": 45, "n_parameters": 304326632} 47 | {"train_lr": 0.0002935073589646598, "train_loss": 2.6220774190187455, "test_loss": 0.6432638500258326, "test_acc1": 85.85252721585758, "test_acc5": 97.56078054442744, "epoch": 46, "n_parameters": 304326632} 48 | {"train_lr": 0.00015608715422415792, "train_loss": 2.611486408829689, "test_loss": 0.6422065225988627, "test_acc1": 85.82453617009305, "test_acc5": 
97.57077734545112, "epoch": 47, "n_parameters": 304326632} 49 | {"train_lr": 6.397544093936805e-05, "train_loss": 2.6108330062150955, "test_loss": 0.6433782994002104, "test_acc1": 85.822536808668, "test_acc5": 97.57677542606532, "epoch": 48, "n_parameters": 304326632} 50 | {"train_lr": 1.7740118452942777e-05, "train_loss": 2.6155946560740473, "test_loss": 0.6427758732996881, "test_acc1": 85.822536808668, "test_acc5": 97.5807741464748, "epoch": 49, "n_parameters": 304326632} 51 | -------------------------------------------------------------------------------- /CV/MAE/models_vit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # References: 8 | # timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm 9 | # DeiT: https://github.com/facebookresearch/deit 10 | # -------------------------------------------------------- 11 | 12 | from functools import partial 13 | 14 | import torch 15 | import torch.nn as nn 16 | 17 | import timm.models.vision_transformer 18 | 19 | 20 | class VisionTransformer(timm.models.vision_transformer.VisionTransformer): 21 | """ Vision Transformer with support for global average pooling 22 | """ 23 | def __init__(self, global_pool=False, **kwargs): 24 | super(VisionTransformer, self).__init__(**kwargs) 25 | 26 | self.global_pool = global_pool 27 | if self.global_pool: 28 | norm_layer = kwargs['norm_layer'] 29 | embed_dim = kwargs['embed_dim'] 30 | self.fc_norm = norm_layer(embed_dim) 31 | 32 | del self.norm # remove the original norm 33 | 34 | def forward_features(self, x): 35 | B = x.shape[0] 36 | x = self.patch_embed(x) 37 | 38 | cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks 39 | x = torch.cat((cls_tokens, x), dim=1) 40 | x = x + self.pos_embed 41 | x = self.pos_drop(x) 42 | 43 | for blk in self.blocks: 44 | x = blk(x) 45 | 46 | if self.global_pool: 47 | x = x[:, 1:, :].mean(dim=1) # global pool without cls token 48 | outcome = self.fc_norm(x) 49 | else: 50 | x = self.norm(x) 51 | outcome = x[:, 0] 52 | 53 | return outcome 54 | 55 | 56 | def vit_base_patch16(**kwargs): 57 | model = VisionTransformer( 58 | patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 59 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 60 | return model 61 | 62 | 63 | def vit_large_patch16(**kwargs): 64 | model = VisionTransformer( 65 | patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, 66 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 67 | return model 68 | 69 | 70 | def vit_huge_patch14(**kwargs): 71 | model = VisionTransformer( 72 | patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, 73 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 74 | return model -------------------------------------------------------------------------------- /CV/MAE/util/crop.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import math 8 | 9 | import torch 10 | 11 | from torchvision import transforms 12 | from torchvision.transforms import functional as F 13 | 14 | 15 | class RandomResizedCrop(transforms.RandomResizedCrop): 16 | """ 17 | RandomResizedCrop for matching TF/TPU implementation: no for-loop is used. 18 | This may lead to results different with torchvision's version. 19 | Following BYOL's TF code: 20 | https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206 21 | """ 22 | @staticmethod 23 | def get_params(img, scale, ratio): 24 | width, height = F._get_image_size(img) 25 | area = height * width 26 | 27 | target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item() 28 | log_ratio = torch.log(torch.tensor(ratio)) 29 | aspect_ratio = torch.exp( 30 | torch.empty(1).uniform_(log_ratio[0], log_ratio[1]) 31 | ).item() 32 | 33 | w = int(round(math.sqrt(target_area * aspect_ratio))) 34 | h = int(round(math.sqrt(target_area / aspect_ratio))) 35 | 36 | w = min(w, width) 37 | h = min(h, height) 38 | 39 | i = torch.randint(0, height - h + 1, size=(1,)).item() 40 | j = torch.randint(0, width - w + 1, size=(1,)).item() 41 | 42 | return i, j, h, w -------------------------------------------------------------------------------- /CV/MAE/util/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # References: 8 | # DeiT: https://github.com/facebookresearch/deit 9 | # -------------------------------------------------------- 10 | 11 | import os 12 | import PIL 13 | 14 | from torchvision import datasets, transforms 15 | 16 | from timm.data import create_transform 17 | from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD 18 | 19 | 20 | def build_dataset(is_train, args): 21 | transform = build_transform(is_train, args) 22 | 23 | root = os.path.join(args.data_path, 'train' if is_train else 'val') 24 | dataset = datasets.ImageFolder(root, transform=transform) 25 | 26 | print(dataset) 27 | 28 | return dataset 29 | 30 | 31 | def build_transform(is_train, args): 32 | mean = IMAGENET_DEFAULT_MEAN 33 | std = IMAGENET_DEFAULT_STD 34 | # train transform 35 | if is_train: 36 | # this should always dispatch to transforms_imagenet_train 37 | transform = create_transform( 38 | input_size=args.input_size, 39 | is_training=True, 40 | color_jitter=args.color_jitter, 41 | auto_augment=args.aa, 42 | interpolation='bicubic', 43 | re_prob=args.reprob, 44 | re_mode=args.remode, 45 | re_count=args.recount, 46 | mean=mean, 47 | std=std, 48 | ) 49 | return transform 50 | 51 | # eval transform 52 | t = [] 53 | if args.input_size <= 224: 54 | crop_pct = 224 / 256 55 | else: 56 | crop_pct = 1.0 57 | size = int(args.input_size / crop_pct) 58 | t.append( 59 | transforms.Resize(size, interpolation=PIL.Image.BICUBIC), # to maintain same ratio w.r.t. 224 images 60 | ) 61 | t.append(transforms.CenterCrop(args.input_size)) 62 | 63 | t.append(transforms.ToTensor()) 64 | t.append(transforms.Normalize(mean, std)) 65 | return transforms.Compose(t) 66 | -------------------------------------------------------------------------------- /CV/MAE/util/lars.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # LARS optimizer, implementation from MoCo v3: 8 | # https://github.com/facebookresearch/moco-v3 9 | # -------------------------------------------------------- 10 | 11 | import torch 12 | 13 | 14 | class LARS(torch.optim.Optimizer): 15 | """ 16 | LARS optimizer, no rate scaling or weight decay for parameters <= 1D. 17 | """ 18 | def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, trust_coefficient=0.001): 19 | defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, trust_coefficient=trust_coefficient) 20 | super().__init__(params, defaults) 21 | 22 | @torch.no_grad() 23 | def step(self): 24 | for g in self.param_groups: 25 | for p in g['params']: 26 | dp = p.grad 27 | 28 | if dp is None: 29 | continue 30 | 31 | if p.ndim > 1: # if not normalization gamma/beta or bias 32 | dp = dp.add(p, alpha=g['weight_decay']) 33 | param_norm = torch.norm(p) 34 | update_norm = torch.norm(dp) 35 | one = torch.ones_like(param_norm) 36 | q = torch.where(param_norm > 0., 37 | torch.where(update_norm > 0, 38 | (g['trust_coefficient'] * param_norm / update_norm), one), 39 | one) 40 | dp = dp.mul(q) 41 | 42 | param_state = self.state[p] 43 | if 'mu' not in param_state: 44 | param_state['mu'] = torch.zeros_like(p) 45 | mu = param_state['mu'] 46 | mu.mul_(g['momentum']).add_(dp) 47 | p.add_(mu, alpha=-g['lr']) -------------------------------------------------------------------------------- /CV/MAE/util/lr_decay.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # References: 8 | # ELECTRA https://github.com/google-research/electra 9 | # BEiT: https://github.com/microsoft/unilm/tree/master/beit 10 | # -------------------------------------------------------- 11 | 12 | import json 13 | 14 | 15 | def param_groups_lrd(model, weight_decay=0.05, no_weight_decay_list=[], layer_decay=.75): 16 | """ 17 | Parameter groups for layer-wise lr decay 18 | Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58 19 | """ 20 | param_group_names = {} 21 | param_groups = {} 22 | 23 | num_layers = len(model.blocks) + 1 24 | 25 | layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1)) 26 | 27 | for n, p in model.named_parameters(): 28 | if not p.requires_grad: 29 | continue 30 | 31 | # no decay: all 1D parameters and model specific ones 32 | if p.ndim == 1 or n in no_weight_decay_list: 33 | g_decay = "no_decay" 34 | this_decay = 0. 
35 | else: 36 | g_decay = "decay" 37 | this_decay = weight_decay 38 | 39 | layer_id = get_layer_id_for_vit(n, num_layers) 40 | group_name = "layer_%d_%s" % (layer_id, g_decay) 41 | 42 | if group_name not in param_group_names: 43 | this_scale = layer_scales[layer_id] 44 | 45 | param_group_names[group_name] = { 46 | "lr_scale": this_scale, 47 | "weight_decay": this_decay, 48 | "params": [], 49 | } 50 | param_groups[group_name] = { 51 | "lr_scale": this_scale, 52 | "weight_decay": this_decay, 53 | "params": [], 54 | } 55 | 56 | param_group_names[group_name]["params"].append(n) 57 | param_groups[group_name]["params"].append(p) 58 | 59 | # print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2)) 60 | 61 | return list(param_groups.values()) 62 | 63 | 64 | def get_layer_id_for_vit(name, num_layers): 65 | """ 66 | Assign a parameter with its layer id 67 | Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33 68 | """ 69 | if name in ['cls_token', 'pos_embed']: 70 | return 0 71 | elif name.startswith('patch_embed'): 72 | return 0 73 | elif name.startswith('blocks'): 74 | return int(name.split('.')[1]) + 1 75 | else: 76 | return num_layers -------------------------------------------------------------------------------- /CV/MAE/util/lr_sched.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | 9 | def adjust_learning_rate(optimizer, epoch, args): 10 | """Decay the learning rate with half-cycle cosine after warmup""" 11 | if epoch < args.warmup_epochs: 12 | lr = args.lr * epoch / args.warmup_epochs 13 | else: 14 | lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \ 15 | (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs))) 16 | for param_group in optimizer.param_groups: 17 | if "lr_scale" in param_group: 18 | param_group["lr"] = lr * param_group["lr_scale"] 19 | else: 20 | param_group["lr"] = lr 21 | return lr 22 | -------------------------------------------------------------------------------- /CV/MAE/util/pos_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # --------------------------------------------------------
7 | # Position embedding utils
8 | # --------------------------------------------------------
9 | 
10 | import numpy as np
11 | 
12 | import torch
13 | 
14 | # --------------------------------------------------------
15 | # 2D sine-cosine position embedding
16 | # References:
17 | # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
18 | # MoCo v3: https://github.com/facebookresearch/moco-v3
19 | # --------------------------------------------------------
20 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
21 |     """
22 |     grid_size: int of the grid height and width
23 |     return:
24 |     pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
25 |     """
26 |     grid_h = np.arange(grid_size, dtype=np.float32)
27 |     grid_w = np.arange(grid_size, dtype=np.float32)
28 |     grid = np.meshgrid(grid_w, grid_h)  # here w goes first
29 |     grid = np.stack(grid, axis=0)
30 | 
31 |     grid = grid.reshape([2, 1, grid_size, grid_size])
32 |     pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
33 |     if cls_token:
34 |         pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
35 |     return pos_embed
36 | 
37 | 
38 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
39 |     assert embed_dim % 2 == 0
40 | 
41 |     # use half of dimensions to encode grid_h
42 |     emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
43 |     emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
44 | 
45 |     emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
46 |     return emb
47 | 
48 | 
49 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
50 |     """
51 |     embed_dim: output dimension for each position
52 |     pos: a list of positions to be encoded: size (M,)
53 |     out: (M, D)
54 |     """
55 |     assert embed_dim % 2 == 0
56 |     omega = np.arange(embed_dim // 2, dtype=float)  # np.float was removed in NumPy 1.24; plain float keeps the original float64 behavior
57 |     omega /= embed_dim / 2.
58 |     omega = 1. / 10000**omega  # (D/2,)
59 | 
60 |     pos = pos.reshape(-1)  # (M,)
61 |     out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
62 | 
63 |     emb_sin = np.sin(out)  # (M, D/2)
64 |     emb_cos = np.cos(out)  # (M, D/2)
65 | 
66 |     emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
67 |     return emb
68 | 
69 | 
70 | # --------------------------------------------------------
71 | # Interpolate position embeddings for high-resolution
72 | # References:
73 | # DeiT: https://github.com/facebookresearch/deit
74 | # --------------------------------------------------------
75 | def interpolate_pos_embed(model, checkpoint_model):
76 |     if 'pos_embed' in checkpoint_model:
77 |         pos_embed_checkpoint = checkpoint_model['pos_embed']
78 |         embedding_size = pos_embed_checkpoint.shape[-1]
79 |         num_patches = model.patch_embed.num_patches
80 |         num_extra_tokens = model.pos_embed.shape[-2] - num_patches
81 |         # height (== width) for the checkpoint position embedding
82 |         orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
83 |         # height (== width) for the new position embedding
84 |         new_size = int(num_patches ** 0.5)
85 |         # class_token and dist_token are kept unchanged
86 |         if orig_size != new_size:
87 |             print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
88 |             extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
89 |             # only the position tokens are interpolated
90 |             pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
91 |             pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
92 |             pos_tokens = torch.nn.functional.interpolate(
93 |                 pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
94 |             pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
95 |             new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
96 |             checkpoint_model['pos_embed'] = new_pos_embed
97 | 
--------------------------------------------------------------------------------
/CV/timm/README.md:
--------------------------------------------------------------------------------
1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
2 | 
3 | For vision tasks, our implementation is based on the official [`timm`](https://github.com/rwightman/pytorch-image-models) repository. To reproduce our results, please first refer to [`timm`](https://github.com/rwightman/pytorch-image-models) and install it, then follow the two steps below to reproduce the experiments in the paper.
4 | 
5 | 
6 | 
7 | ## Environment
8 | 
9 | Our experiments for this task are based on the following package versions.
10 | 
11 | ```python
12 | torch.__version__ = '1.10.0+cu113'
13 | torchvision.__version__ = '0.11.1+cu113'
14 | timm.__version__ = '0.6.1'
15 | torchaudio.__version__ = '0.10.0+cu113'
16 | ```
17 | 
18 | Note that our timm is a development version. If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:timm](https://hub.docker.com/repository/docker/xyxie/adan-image).
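If you want to check your local environment against these versions directly, the small snippet below (ours, not part of the repo) compares the installed packages with the ones listed above:

```python
# Minimal version check for the environment listed above.
expected = {
    'torch': '1.10.0+cu113',
    'torchvision': '0.11.1+cu113',
    'timm': '0.6.1',
    'torchaudio': '0.10.0+cu113',
}
for name, want in expected.items():
    got = __import__(name).__version__
    status = 'OK' if got == want else f'MISMATCH (expected {want})'
    print(f'{name} {got}: {status}')
```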
19 | 
20 | 
21 | 
22 | ## Usage of Adan in timm
23 | 
24 | ### Two steps to use Adan
25 | 
26 | **Step 1.** Add the Adan-dependent hyper-parameters to `train.py`:
27 | 
28 | ```python
29 | parser.add_argument('--max-grad-norm', type=float, default=0.0, help='if the l2 norm of the gradient is larger than this hyper-parameter, then we clip the gradient (default: 0.0, no gradient clipping)')
30 | parser.add_argument('--weight-decay', type=float, default=0.02, help='weight decay, similar to the one used in AdamW (default: 0.02)')
31 | parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', help='optimizer epsilon to avoid the bad case where the second-order moment is zero (default: None, use the opt default 1e-8 in Adan)')
32 | parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='optimizer betas in Adan (default: None, use the opt default [0.98, 0.92, 0.99] in Adan)')
33 | parser.add_argument('--no-prox', action='store_true', default=False, help='whether to perform weight decay like AdamW (default: False)')
34 | parser.add_argument('--bias-decay', action='store_true', default=False, help='whether to perform weight decay on the bias terms (default: False)')
35 | 
36 | ```
37 | 
38 | * `bias-decay`: It decides whether to perform weight decay on 1) the bias terms, 2) BN layers, and 3) other 1D parameters, all of which are filtered out by the default setting in timm.
39 | 
40 | * `no-prox`: It determines the update rule for parameters with weight decay. By default, Adan updates these parameters as presented in Algorithm 1 of the paper:
41 | 
42 | $$\boldsymbol{\theta}_{k+1} = (1+\lambda \eta)^{-1}\left[\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k)\right],$$
43 | 
44 | but one can also update the parameters as in AdamW:
45 | 
46 | $$\boldsymbol{\theta}_{k+1} = (1-\lambda \eta)\boldsymbol{\theta}_k - \boldsymbol{\eta}_k \circ (\mathbf{m}_k+(1-{\color{blue}\beta_2})\mathbf{v}_k).$$
47 | **In all experiments, we set `no-prox=False` in our paper.** (A code sketch contrasting the two update rules appears at the end of this README.)
48 | 
49 | 
50 | 
51 | **Step 2.** Create the Adan optimizer as follows. In this step, we directly replace the vanilla optimizer creation using the following three sub-steps.
52 | 
53 | i) Add Adan to `optim_factory.py`:
54 | ```python
55 | elif opt_lower == 'adan':
56 |     optimizer = Adan(parameters, **opt_args)
57 | ```
58 | 
59 | ii) Import the optimizer creator from `optim_factory` into your training file `train.py`:
60 | 
61 | ```python
62 | from optim_factory import create_optimizer
63 | ```
64 | 
65 | iii) Replace the vanilla creator (`optimizer = create_optimizer(args, model)`) in the training file `train.py` with Adan:
66 | 
67 | ```python
68 | opt_lower = args.opt.lower()
69 | if opt_lower == 'adan':
70 |     args.opt_args = {'max_grad_norm': args.max_grad_norm, 'no_prox': args.no_prox}
71 | optimizer = create_optimizer(args, model, filter_bias_and_bn=not args.bias_decay)
72 | ```
73 | 
74 | 
75 | 
76 | ## ImageNet-1K Training Recipe
77 | 
78 | We provide the specific commands and hyper-parameters for ViTs, ResNets, and ConvNeXts in this [recipe](./supervised.md).
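To make the `no-prox` distinction concrete, here is a minimal sketch (ours, not the repo's `adan.py` implementation; the helper name is hypothetical) of how the two update rules above change one parameter tensor, assuming the combined update $\mathbf{m}_k+(1-\beta_2)\mathbf{v}_k$ has already been formed:

```python
import torch

def adan_decay_step(theta: torch.Tensor, update: torch.Tensor,
                    eta: float, lam: float, no_prox: bool) -> torch.Tensor:
    """Apply one Adan step to `theta`, given the combined moment `update`.

    `eta` is the step size and `lam` the weight-decay coefficient; forming
    m_k and v_k themselves is omitted in this sketch.
    """
    if no_prox:
        # AdamW-style decoupled decay: theta <- (1 - lam*eta) * theta - eta * update
        return (1 - lam * eta) * theta - eta * update
    # default proximal update: theta <- (theta - eta * update) / (1 + lam*eta)
    return (theta - eta * update) / (1 + lam * eta)
```

With `lam = 0` the two branches coincide; they differ only in how the weight-decay term enters the update.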
79 | 80 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ConvNext/small/args_cvnext_150.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m9-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: false 8 | bias_decay: false 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: null 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.1 29 | epoch_repeats: 0.0 30 | epochs: 150 31 | eval_metric: top1 32 | experiment: 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 0.0 52 | mean: null 53 | min_lr: 0.0001 54 | mixup: 0.8 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: convnext_tiny_hnf 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/cvnext 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.25 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.1 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: bicubic 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-08 110 | weight_decay: 0.04 111 | workers: 8 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ConvNext/small/args_cvnext_300.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m9-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: false 8 | bias_decay: false 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: null 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.1 29 | epoch_repeats: 0.0 30 | epochs: 300 31 | eval_metric: top1 32 | experiment: 33 | gp: null 34 | hflip: 0.5 35 | 
img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.016 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 0.0 52 | mean: null 53 | min_lr: 0.0001 54 | mixup: 0.8 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: convnext_tiny_hnf 60 | model_ema: true 61 | model_ema_decay: 0.9999 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.9 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/cvnext 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.25 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.1 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 150 109 | warmup_lr: 1.0e-08 110 | weight_decay: 0.02 111 | workers: 8 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res101/args_res101_100.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | configure: job_101_adan2.yaml 17 | cooldown_epochs: 10 18 | crop_pct: 0.95 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/imagenet 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.1 30 | epoch_repeats: 0.0 31 | epochs: 100 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.01 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 5.0 53 | mean: null 54 | min_lr: 0.0001 55 | mixup: 0.1 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: resnet101 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_prox: false 69 | no_resume_opt: false 70 | num_classes: null 71 | opt: adan 72 | opt_betas: 73 | - 0.98 74 | - 0.92 75 | - 0.99 76 | opt_eps: 1.0e-08 77 | output: ./exp_results/res101-100- 78 | patience_epochs: 10 79 | 
pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.0 88 | resplit: false 89 | resume: model_best.pth.tar 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 3407 96 | smoothing: 0.0 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: random 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 40 110 | warmup_lr: 1.0e-09 111 | weight_decay: 0.02 112 | workers: 8 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res101/args_res101_200.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: 0.95 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.1 29 | epoch_repeats: 0.0 30 | epochs: 200 31 | eval_metric: top1 32 | experiment: '' 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 5.0 52 | mean: null 53 | min_lr: 0.0001 54 | mixup: 0.1 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: resnet101 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/res101-epoch- 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.0 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.0 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-09 110 | weight_decay: 0.02 111 | workers: 8 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res101/args_res101_300.yaml: -------------------------------------------------------------------------------- 1 | aa: 
rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | configure: job_101_adan.yaml 17 | cooldown_epochs: 10 18 | crop_pct: 0.95 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/imagenet 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.2 30 | epoch_repeats: 0.0 31 | epochs: 300 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.015 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 5.0 53 | mean: null 54 | min_lr: 1.0e-05 55 | mixup: 0.1 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: resnet101 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_prox: false 69 | no_resume_opt: false 70 | num_classes: null 71 | opt: adan 72 | opt_betas: 73 | - 0.98 74 | - 0.92 75 | - 0.99 76 | opt_eps: 1.0e-08 77 | output: ./exp_results/res101-300- 78 | patience_epochs: 10 79 | pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.0 88 | resplit: false 89 | resume: model_best.pth.tar 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 3407 96 | smoothing: 0.0 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: random 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 90 110 | warmup_lr: 1.0e-09 111 | weight_decay: 0.02 112 | workers: 8 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res101/summary_res101_100.csv: -------------------------------------------------------------------------------- 1 | epoch,train_loss,eval_loss,eval_top1,eval_top5 2 | 0,0.6999703517981938,6.9381575,0.102,0.4979999999666214 3 | 1,0.05811868847480842,6.87584375,0.3939999999332428,1.7039999963378907 4 | 2,0.007747739586713058,6.76303875,1.6039999908447267,5.338000015869141 5 | 3,0.0073236161510327035,5.3613025,6.767999984130859,18.46600002319336 6 | 4,0.006898723675736359,4.6371725,13.537999970092773,31.59400007080078 7 | 5,0.006467533497405904,4.0580925,20.71000001831055,43.00000006103516 8 | 6,0.0060907453298568726,3.52676375,27.20999994140625,52.32400007324219 9 | 7,0.005859104982976403,3.19898375,33.092000048828126,58.69000002685547 10 | 8,0.005392835708335042,2.749453125,39.679999995117186,66.31800005371093 11 | 9,0.005332435946911573,2.4970771875,44.67999998046875,70.7900000415039 12 | 10,0.004835383068504078,2.29522875,48.786000073242185,74.15399998535156 13 | 
11,0.004691791022196412,2.0821503125,52.47399998535156,77.29200000732422 14 | 12,0.004713796977219837,2.0056796875,54.215999931640624,78.56200026367188 15 | 13,0.0046068779192864895,1.965403125,55.21000003173828,79.31600004638672 16 | 14,0.004495224349999002,1.8351496875,57.713999921875,81.02600004638671 17 | 15,0.004356943453396005,1.8534640625,57.819999921875,81.29599999023438 18 | 16,0.004283945841182556,1.72975828125,59.72999997314453,82.90000000732422 19 | 17,0.0041249994620946905,1.6525403125,61.77600001220703,84.05799998779297 20 | 18,0.004143890575505793,1.61844859375,61.91799998779297,84.07199997802735 21 | 19,0.0041115750126274565,1.57048875,63.07800004394531,85.23199998291015 22 | 20,0.004011271621233651,1.53420265625,64.04999995849609,85.62200005859376 23 | 21,0.0038510982579152498,1.55004109375,63.43200009765625,85.19999993652344 24 | 22,0.0038439546312604633,1.5229715625,64.74600000976562,86.03400003173829 25 | 23,0.0038987634304378715,1.51144328125,65.10999988525391,86.39199997558593 26 | 24,0.0038393755676224828,1.46528921875,65.5220000024414,86.72600003173828 27 | 25,0.0039864167171929565,1.43677015625,65.95400012939453,86.9820000048828 28 | 26,0.003912502034966435,1.44618640625,65.75600003662109,86.69800005859375 29 | 27,0.003806904241043542,1.41935078125,66.46599997070312,87.32600012939453 30 | 28,0.0037424968489046606,1.427661875,66.50000010986328,87.29399997802734 31 | 29,0.003874790804859783,1.367300625,67.45599989746094,87.88599997558593 32 | 30,0.0038485014478542973,1.38268578125,67.30000010498047,87.80599994873047 33 | 31,0.00382271282640951,1.34994140625,67.810000078125,88.08400021240234 34 | 32,0.00378438870289496,1.394751875,66.83399997314453,87.6119999975586 35 | 33,0.0037495847625125733,1.35988171875,67.53400002197266,88.06600005371094 36 | 34,0.00387607079132327,1.39839203125,66.71800006103516,87.71599990234375 37 | 35,0.0036058901376756175,1.39179953125,66.84600000976563,87.66000002929688 38 | 36,0.0035983206471428275,1.40048484375,66.83400002929687,87.65799994873046 39 | 37,0.0035912682402080725,1.4286309375,66.94400002929687,87.706 40 | 38,0.00369265178285007,1.3891053125,67.31400002685547,87.800000078125 41 | 39,0.003823710572240608,1.369221875,67.4979999975586,87.82800002197266 42 | 40,0.003413957726609494,1.246991875,69.91399991210938,89.59400009521484 43 | 41,0.0034744960388966967,1.2547934375,70.14600015136719,89.61400020263672 44 | 42,0.00342402803445501,1.26230765625,69.94400004638672,89.19399997314453 45 | 43,0.0034545807700072017,1.2105721875,71.06400001953125,89.91600007080078 46 | 44,0.0033984247129410505,1.225816875,70.70999999267578,89.90400004638671 47 | 45,0.0035708143675167647,1.2220984375,71.30400004882813,90.18400009521484 48 | 46,0.0034103883330577184,1.21230796875,71.87999998779297,90.40800010009765 49 | 47,0.0033485869644209743,1.19272125,71.76399999267578,90.49800009765625 50 | 48,0.0034036182332783937,1.15048734375,72.35600006591797,90.75400014892578 51 | 49,0.0034680215176194906,1.163283125,72.40999996337891,90.73800007080078 52 | 50,0.003379080627512719,1.1675125,72.2219999633789,90.87000004394531 53 | 51,0.003313158971390554,1.1659403125,72.33799996826171,90.63599999267578 54 | 52,0.003398957579130573,1.12288484375,72.96600006591797,91.19399994140625 55 | 53,0.003229137593215065,1.10655890625,73.06600004150391,91.44800012451172 56 | 54,0.003297736668693168,1.12947734375,73.59000001708985,91.52800009521485 57 | 55,0.003102192488898124,1.13240875,72.63600001464843,91.07600004882812 58 | 
56,0.003310556390455791,1.07455796875,73.82199999023437,91.73399988525391 59 | 57,0.003254913375712931,1.070454375,74.12600001220703,91.82000006591797 60 | 58,0.003150251860331212,1.0846184375,73.90800006835937,91.64800017333984 61 | 59,0.0032559275361044066,1.0725778125,74.06800004150391,91.95000004394531 62 | 60,0.0031484989948304637,1.059663125,74.60599998291016,91.99600006835938 63 | 61,0.003113441352200295,1.05973453125,74.53799998535156,92.01800006835937 64 | 62,0.003175405037057187,1.04974859375,74.59600006347657,92.17199991455078 65 | 63,0.0030697412855390993,1.02565875,75.1760000390625,92.41599991455078 66 | 64,0.002913903827512903,1.0269609375,75.22199998535156,92.38000001464843 67 | 65,0.003126694039175553,1.025984375,75.27800006347657,92.49400011962891 68 | 66,0.0031302267452701926,1.01711203125,75.39400001220703,92.56000001708985 69 | 67,0.0030674594454467297,1.015896875,75.63200011962891,92.67800009277343 70 | 68,0.003019252243185682,0.9928059375,75.96400008544921,92.88200011474609 71 | 69,0.003121067354056452,1.00143140625,76.01400003173828,92.64000004150391 72 | 70,0.0028890574384214623,0.9874996875,76.20200000976563,92.8920000415039 73 | 71,0.0029056143913684146,0.98568921875,76.20599990478516,93.06000011474609 74 | 72,0.0029515030827107175,0.9696134375,76.52800013427735,93.0959999633789 75 | 73,0.0029652343031817247,0.9613771875,76.79200016357422,93.05400011962891 76 | 74,0.002942567242176405,0.9542409375,76.91399998291016,93.22800014160157 77 | 75,0.002948857095491673,0.9467534375,77.12999998291015,93.40400014160156 78 | 76,0.002977430753942047,0.9387584375,77.26999992675782,93.44200017089844 79 | 77,0.002892624304097678,0.9361546875,77.32999990234374,93.5340000366211 80 | 78,0.0028546288875596865,0.9220828125,77.7280000024414,93.61599998535156 81 | 79,0.0029401361742722137,0.92840078125,77.70400003173827,93.65800003662109 82 | 80,0.0027510516312239425,0.90395640625,78.03200008789062,93.8580001171875 83 | 81,0.0027062457354207125,0.9050746875,78.17399998291016,93.9360000390625 84 | 82,0.002744901110418141,0.8973440625,78.25999998046875,93.95200006347656 85 | 83,0.002718568014513169,0.8877671875,78.62400013427734,94.14600001220703 86 | 84,0.0028618485001581056,0.88632234375,78.51400000732421,94.10800006347657 87 | 85,0.0027493445834677133,0.876911015625,78.77400000976563,94.19799998535156 88 | 86,0.002734048092471702,0.8764725,78.82800005859374,94.22000011230469 89 | 87,0.0025374879062707934,0.86807703125,79.05199995361328,94.27600016601562 90 | 88,0.002690665889531374,0.8684296875,79.06000008300781,94.2960000366211 91 | 89,0.002699888893403113,0.8860525,79.10200010986328,94.32600001220703 92 | 90,0.002796581364236772,0.8614496875,79.20399995361328,94.4160000366211 93 | 91,0.002500709379091859,0.854776171875,79.29600013427735,94.47400006103516 94 | 92,0.0024909183183418854,0.852921328125,79.42600000488281,94.52999993164063 95 | 93,0.0024433880296003607,0.84558203125,79.54000005615234,94.55600016601562 96 | 94,0.002702345955185592,0.8445765625,79.57800015869141,94.6319999584961 97 | 95,0.002313682609902961,0.846581875,79.72599995361328,94.66400000976563 98 | 96,0.0025091321939336403,0.844203828125,79.76800008056641,94.64400016357422 99 | 97,0.0026143502577074935,0.842131484375,79.7340000805664,94.61999998535157 100 | 98,0.002516709908377379,0.841174921875,79.65399995361328,94.66400003662109 101 | 99,0.00261627570060747,0.838111484375,79.86200008300781,94.69399998291016 102 | 100,0.002597623238606112,0.8395615625,79.81399995361328,94.69400003662109 103 | 
101,0.002467484595919294,0.845680859375,79.71800000488281,94.59600008789063 104 | 102,0.002401946045990501,0.839708046875,79.89400000488281,94.76199993164063 105 | 103,0.0026869618200830053,0.835429453125,79.88600010986328,94.72400000976563 106 | 104,0.0025204311407703373,0.83972015625,79.74200003173829,94.67600011230469 107 | 105,0.0026361714283536586,0.84152625,79.95200013671875,94.73999993164063 108 | 106,0.0027149992362995234,0.839165859375,79.91000003173828,94.69000011230469 109 | 107,0.002398747401977224,0.836501796875,80.00600005615235,94.75400006103516 110 | 108,0.00262980888198529,0.852977421875,79.71200008300781,94.63800000976562 111 | 109,0.002572902212185519,0.838661171875,79.85999997802735,94.63199998291016 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res50/args_res50_100.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: 0.95 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.05 29 | epoch_repeats: 0.0 30 | epochs: 100 31 | eval_metric: top1 32 | experiment: e100-aug0-w60-minlr1e6-wrlr1e9-initRdm-bias-lr3e2 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.03 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 5.0 52 | mean: null 53 | min_lr: 1.0e-06 54 | mixup: 0.1 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: resnet50 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/res50-epoch- 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.0 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.0 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-09 110 | weight_decay: 0.02 111 | workers: 8 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res50/args_res50_200.yaml: 
-------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: 0.95 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.05 29 | epoch_repeats: 0.0 30 | epochs: 200 31 | eval_metric: top1 32 | experiment: e200-aug0-w60-minlr1e4-wrlr1e9-initRdm-bias 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 5.0 52 | mean: null 53 | min_lr: 0.0001 54 | mixup: 0.1 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: resnet50 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/res50-epoch- 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.0 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.0 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-09 110 | weight_decay: 0.02 111 | workers: 8 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res50/args_res50_300.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: 0.95 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.05 29 | epoch_repeats: 0.0 30 | epochs: 300 31 | eval_metric: top1 32 | experiment: res50-aug0-retrain 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | 
jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 5.0 52 | mean: null 53 | min_lr: 1.0e-05 54 | mixup: 0.1 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: resnet50 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_debug: 5 76 | opt_eps: 1.0e-08 77 | output: ./exp_results/res50-epoch- 78 | patience_epochs: 10 79 | pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.0 88 | resplit: false 89 | resume: null 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 42 96 | smoothing: 0.0 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: bicubic 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 60 110 | warmup_lr: 1.0e-06 111 | weight_decay: 0.02 112 | workers: 8 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ResNet/Res50/summary_res50_100.csv: -------------------------------------------------------------------------------- 1 | epoch,train_loss,eval_loss,eval_top1,eval_top5 2 | 0,0.7045409509113857,6.9416,0.064,0.418 3 | 1,0.058665430905031304,6.89746625,0.3079999999332428,1.2019999998664856 4 | 2,0.007796582133908357,6.2966525,1.7179999993896484,5.899999989013672 5 | 3,0.007212148014722126,5.116435,8.078000043945313,21.984000035400392 6 | 4,0.006597742538100907,4.30874625,16.604000009765624,37.32800003540039 7 | 5,0.006309278409129807,3.7494875,24.503999986572264,48.297999992675784 8 | 6,0.00587210977183921,3.23308,31.903999926757812,57.66999989746094 9 | 7,0.005444032173337681,2.87593875,38.16399994140625,63.99200002685547 10 | 8,0.0054282506462186575,2.59584875,43.517999924316406,69.46200001464844 11 | 9,0.005179691860186202,2.359841875,47.206000029296874,72.58200003417969 12 | 10,0.004889545729383826,2.1719675,50.609999997558596,75.55400000244141 13 | 11,0.00470197234036667,2.1567584375,51.69199992919922,76.44600010253906 14 | 12,0.004586202425083944,1.98930375,54.606000112304685,78.83600004394532 15 | 13,0.004271666053682566,1.8706825,56.328000031738284,80.30800010009766 16 | 14,0.004447908040934375,1.806950625,58.472000075683596,81.5399999633789 17 | 15,0.0041762767692229575,1.7647315625,58.741999968261716,82.09000006103516 18 | 16,0.004471837143812861,1.708065625,60.30200004394531,82.98200011230469 19 | 17,0.004270398956058281,1.67571921875,61.048000041503904,83.32200005859374 20 | 18,0.004100026030625615,1.65201375,61.26000004150391,83.77400021972656 21 | 19,0.0041242205105455855,1.63376078125,61.504000068359375,84.07800001220703 22 | 20,0.004059118734273527,1.67590984375,60.91800009765625,83.5019999584961 23 | 21,0.0041561292850279385,1.63649734375,61.82800004882812,84.22399995361329 24 | 
22,0.004249815163867814,1.5946559375,62.68000001220703,84.70800006347656 25 | 23,0.0039470667751239875,1.64520578125,61.93799990234375,84.07400013427734 26 | 24,0.003988273092545569,1.671076875,61.05600004394531,83.42199993164063 27 | 25,0.004096939311628895,1.7034496875,61.12399993652344,83.56399995605469 28 | 26,0.004087086118358586,1.60285265625,62.73200006347656,84.75999995605468 29 | 27,0.00399751916328179,1.61492046875,62.43800003662109,84.32400010742188 30 | 28,0.003949649166315794,1.701069375,60.77399994628906,83.2460001147461 31 | 29,0.004051400797574648,1.6202353125,62.64599990722656,84.67000006103515 32 | 30,0.004139024115699742,1.6344540625,62.20200006591797,84.12200026855469 33 | 31,0.003921386137205575,1.62690984375,62.05000011474609,84.17200008544921 34 | 32,0.00411509963617261,1.68366421875,61.46400011474609,83.86600005859376 35 | 33,0.003911659786743777,1.67565765625,60.84800007324219,83.32999993408202 36 | 34,0.00395727701418634,1.62554953125,62.0080000390625,84.16199998291016 37 | 35,0.004033969731868378,1.71603296875,60.70599999267578,83.0460000390625 38 | 36,0.004010531336202153,1.6436690625,62.05400001953125,84.18800013916015 39 | 37,0.0039575622982478565,1.67731390625,61.35800016845703,83.65600013671875 40 | 38,0.0039316649615232435,1.61552953125,62.22400010986328,84.39000005126952 41 | 39,0.003873389430477151,1.63947921875,61.81200003662109,84.1440000366211 42 | 40,0.004065845494291612,1.653141875,61.8460001147461,83.82200008789063 43 | 41,0.004109910373309893,1.714169375,60.308000017089846,83.24199985595703 44 | 42,0.003946930452782128,1.94490875,56.48200006103516,79.57000004638672 45 | 43,0.0041138056798705035,1.6267740625,61.803999931640625,84.29799997802735 46 | 44,0.004048073315061629,1.62808609375,62.09799998291015,84.28800000976563 47 | 45,0.0039734537546922055,1.784985625,59.12400004882812,82.1780000390625 48 | 46,0.0038987650768831372,1.713120625,60.78800010498047,83.26599994628906 49 | 47,0.0040997504090358105,1.88673,57.57800005615234,80.4180000415039 50 | 48,0.003935285162047616,1.6685634375,61.34400001220703,83.64799995605469 51 | 49,0.004107319034769067,1.7783765625,59.22000000244141,82.05199999023438 52 | 50,0.00387493397907487,1.6779953125,61.276,83.92200001464843 53 | 51,0.004015801890221026,1.847471875,58.37599998046875,81.37399998779297 54 | 52,0.003935897473378905,1.859410625,58.18199997802734,81.15000001708984 55 | 53,0.004190738429315388,1.821818125,58.34600005615234,81.56200009277343 56 | 54,0.004043174558319151,1.823231875,58.122000075683594,81.2140000390625 57 | 55,0.004158310043359441,1.86400625,57.84399987792969,81.45800022460938 58 | 56,0.003960915591700801,1.7923175,58.804000024414066,81.96200001220703 59 | 57,0.004142970977617162,1.7743928125,59.36600004394531,82.41600017333984 60 | 58,0.004029840646710779,1.7658021875,59.30400007080078,82.15200016845704 61 | 59,0.004218896684635963,1.88195375,56.881999975585934,80.56000011474609 62 | 60,0.0036925061971747448,1.3517940625,67.70000002197266,88.15399987304687 63 | 61,0.0035992927150800824,1.34404765625,68.08800004882812,88.23600020751954 64 | 62,0.003520481986925006,1.283674375,69.1300000805664,88.94400007568359 65 | 63,0.003616590718073504,1.3082865625,68.802000078125,88.65599994384766 66 | 64,0.0036838793894276023,1.27181484375,69.44200001953125,89.27800005126953 67 | 65,0.003572586092299649,1.29942640625,69.78399999267579,89.41400007324219 68 | 66,0.0036129531716661794,1.2370415625,70.27599992431641,89.516 69 | 67,0.0032376082381233573,1.2114928125,70.86000002197265,90.03600010009765 
70 | 68,0.0035054978714989765,1.224236875,70.44400004394531,89.89400004394531 71 | 69,0.0034192517466310945,1.23175109375,70.51399994628906,89.67800012451171 72 | 70,0.00328368427498,1.19048328125,71.48400014648438,90.30600015136719 73 | 71,0.00327613196402256,1.16209390625,71.9240000366211,90.69200007080079 74 | 72,0.0030484608806935804,1.16013578125,71.9080000390625,90.63800004394531 75 | 73,0.0034537422138133217,1.1457075,72.4540000390625,90.89400011962891 76 | 74,0.003460384572723082,1.13635015625,72.41000006835938,90.91400004638672 77 | 75,0.0033204310374068363,1.12647875,72.77400001464844,91.23800009521484 78 | 76,0.0032639388061527696,1.113355625,72.89800006347656,91.27400009521484 79 | 77,0.0032552302914804648,1.1143825,72.92800009033203,91.40200001708985 80 | 78,0.003150941720897598,1.0993584375,73.49799998779297,91.53000014892578 81 | 79,0.0031130987585389186,1.0650625,74.15800011474609,92.03200009521484 82 | 80,0.0032726521603763103,1.0721525,74.11400006591796,91.98600006591796 83 | 81,0.00320629304873624,1.0649465625,74.26599995605469,92.1619999633789 84 | 82,0.0029540062449606402,1.0372840625,74.79800008789063,92.30600001708984 85 | 83,0.003026906833318727,1.0280375,75.05400014160156,92.53800022460938 86 | 84,0.0029979831805186613,1.017864375,75.4720000366211,92.63999999267578 87 | 85,0.00299135923186051,0.99109765625,75.92600000732422,92.9679999633789 88 | 86,0.003011097732399191,0.99155703125,75.93799998291016,92.82800001464844 89 | 87,0.003033405419306031,0.970643125,76.38000008789062,93.0640001171875 90 | 88,0.0028323159287018435,0.9561534375,76.69000000976563,93.1739999633789 91 | 89,0.0030302958163831916,0.9529859375,76.86200008544922,93.20600006591796 92 | 90,0.0030514331634289454,0.9512065625,77.03400000976562,93.26600009033203 93 | 91,0.002754983675133969,0.9374346875,77.17200000488282,93.41999996337891 94 | 92,0.002925087830850056,0.92438484375,77.47400018554687,93.53399991210938 95 | 93,0.002743347780779004,0.9260734375,77.55600011230469,93.63999993652344 96 | 94,0.0028534684097394347,0.95646546875,77.48199992675781,93.63399998779298 97 | 95,0.0028282569421987447,0.91486703125,77.77999995361328,93.66999996337891 98 | 96,0.0026793425869462745,0.90815390625,77.85000003173828,93.7900001171875 99 | 97,0.002686592417636088,0.909225625,78.00000013427734,93.78399993652344 100 | 98,0.002937979913050575,0.90744421875,77.98200003173828,93.79199998779296 101 | 99,0.002853604283050767,0.90461453125,78.05800000488281,93.84199993652344 102 | 100,0.002864615060389042,0.9053496875,78.0300000830078,93.79799998779296 103 | 101,0.002886664870727275,0.9070628125,78.00999995361327,93.78799993652343 104 | 102,0.002906581253877708,0.91363046875,77.93799995361329,93.71799998779296 105 | 103,0.0030246374164042728,0.90368484375,78.14200010986328,93.83399998779296 106 | 104,0.0028219220860462102,0.906053125,78.03600005615235,93.78400006591797 107 | 105,0.002867467302296843,0.90486140625,78.06800013427734,93.80799998779297 108 | 106,0.002776414771298213,0.90622484375,78.1760000830078,93.84400001464844 109 | 107,0.0027404509518029435,0.90221796875,78.09400000488282,93.82399998779297 110 | 108,0.002886704235736813,0.90330140625,78.10999998046876,93.80999993652344 111 | 109,0.0028225835911663516,0.9019365625,78.07000000488281,93.81399993652344 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/base/args_vit-B_150.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m9-mstd0.5-inc1 2 | amp: 
true 3 | apex_amp: false 4 | aug_repeats: 3 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: false 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | bn_tf: false 12 | channels_last: false 13 | checkpoint_hist: 2 14 | clip_grad: null 15 | clip_mode: norm 16 | color_jitter: 0.4 17 | cooldown_epochs: 10 18 | crop_pct: null 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/common/imagenet-raw 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.1 30 | epoch_repeats: 0.0 31 | epochs: 150 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.015 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 5.0 53 | mean: null 54 | min_lr: 1.0e-08 55 | mixup: 0.8 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: deit_base_patch16_224 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_debug: 5 76 | opt_eps: 1.0e-08 77 | output: ./exp_results/deit-base-ori- 78 | patience_epochs: 10 79 | pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.25 88 | resplit: false 89 | resume: '' 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 42 96 | smoothing: 0.1 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: random 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 60 110 | warmup_lr: 1.0e-06 111 | weight_decay: 0.02 112 | workers: 10 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/base/args_vit-B_300.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m9-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 3 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: false 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | bn_tf: false 12 | channels_last: false 13 | checkpoint_hist: 2 14 | clip_grad: null 15 | clip_mode: norm 16 | color_jitter: 0.4 17 | cooldown_epochs: 10 18 | crop_pct: null 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/common/imagenet-raw 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.1 30 | epoch_repeats: 0.0 31 | epochs: 300 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.015 45 | lr_cycle_decay: 0.5 46 | 
lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 5.0 53 | mean: null 54 | min_lr: 1.0e-05 55 | mixup: 0.8 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: deit_base_patch16_224 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_debug: 5 76 | opt_eps: 1.0e-08 77 | output: ./exp_results/deit-base-ori- 78 | patience_epochs: 10 79 | pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.25 88 | resplit: false 89 | resume: '' 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 42 96 | smoothing: 0.1 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: random 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 60 110 | warmup_lr: 1.0e-08 111 | weight_decay: 0.02 112 | workers: 10 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/base/args_vit-B_300_T.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m9-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 3 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: false 8 | bias_decay: true 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | configure: job_base_adan.yaml 17 | cooldown_epochs: 10 18 | crop_pct: null 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/imagenet 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.2 30 | epoch_repeats: 0.0 31 | epochs: 300 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.015 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 5.0 53 | mean: null 54 | min_lr: 1.0e-06 55 | mixup: 0.8 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: deit_base_patch16_224 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_prox: false 69 | no_resume_opt: false 70 | num_classes: null 71 | opt: adan 72 | opt_betas: 73 | - 0.98 74 | - 0.92 75 | - 0.99 76 | opt_eps: 1.0e-08 77 | output: ./exp_res/vit-base-300 78 | patience_epochs: 10 79 | pin_mem: false 80 | pretrained: false 81 | ratio: 82 | - 0.75 83 | - 1.3333333333333333 84 | recount: 1 85 | recovery_interval: 0 86 | remode: pixel 87 | reprob: 0.25 88 | resplit: false 89 | resume: 
model_best.pth.tar 90 | save_images: false 91 | scale: 92 | - 0.08 93 | - 1.0 94 | sched: cosine 95 | seed: 3407 96 | smoothing: 0.1 97 | split_bn: false 98 | start_epoch: null 99 | std: null 100 | sync_bn: false 101 | torchscript: false 102 | train_interpolation: random 103 | train_split: train 104 | tta: 0 105 | use_multi_epochs_loader: false 106 | val_split: validation 107 | validation_batch_size: null 108 | vflip: 0.0 109 | warmup_epochs: 90 110 | warmup_lr: 1.0e-08 111 | weight_decay: 0.02 112 | workers: 10 113 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/small/args_vit-s_150-I.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: false 9 | bn_eps: null 10 | bn_momentum: null 11 | bn_tf: false 12 | channels_last: false 13 | checkpoint_hist: 2 14 | clip_grad: null 15 | clip_mode: norm 16 | color_jitter: 0.4 17 | cooldown_epochs: 10 18 | crop_pct: null 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/common/imagenet-raw 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.05 30 | epoch_repeats: 0.0 31 | epochs: 150 32 | eval_metric: top1 33 | experiment: '' 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.015 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | lr_noise_std: 1.0 52 | max_grad_norm: 0.0 53 | mean: null 54 | min_lr: 1.0e-08 55 | mixup: 0.2 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: deit_small_patch16_224 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_prox: false 69 | no_resume_opt: false 70 | num_classes: null 71 | opt: adan 72 | opt_betas: 73 | - 0.98 74 | - 0.92 75 | - 0.99 76 | opt_debug: 5 77 | opt_eps: 1.0e-08 78 | output: ./exp_results/deit-small-bs-test- 79 | patience_epochs: 10 80 | pin_mem: false 81 | pretrained: false 82 | ratio: 83 | - 0.75 84 | - 1.3333333333333333 85 | recount: 1 86 | recovery_interval: 0 87 | remode: pixel 88 | reprob: 0.0 89 | resplit: false 90 | resume: '' 91 | save_images: false 92 | scale: 93 | - 0.08 94 | - 1.0 95 | sched: cosine 96 | seed: 1005 97 | smoothing: 0.1 98 | split_bn: false 99 | start_epoch: null 100 | std: null 101 | sync_bn: false 102 | torchscript: false 103 | train_interpolation: random 104 | train_split: train 105 | tta: 0 106 | use_multi_epochs_loader: false 107 | val_split: validation 108 | validation_batch_size: null 109 | vflip: 0.0 110 | warmup_epochs: 60 111 | warmup_lr: 1.0e-08 112 | weight_decay: 0.02 113 | workers: 10 114 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/small/args_vit-s_150.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: false 9 | 
bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: null 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.1 29 | epoch_repeats: 0.0 30 | epochs: 150 31 | eval_metric: top1 32 | experiment: wrlr1e8-mlr1e5-lr1d5e2-dp01-mix08 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 0.0 52 | mean: null 53 | min_lr: 1.0e-05 54 | mixup: 0.8 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: deit_small_patch16_224 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/deit-small 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.25 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.1 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-08 110 | weight_decay: 0.02 111 | workers: 10 112 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/small/args_vit-s_300-I.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: false 9 | bn_eps: null 10 | bn_momentum: null 11 | bn_tf: false 12 | channels_last: false 13 | checkpoint_hist: 2 14 | clip_grad: null 15 | clip_mode: norm 16 | color_jitter: 0.4 17 | cooldown_epochs: 10 18 | crop_pct: null 19 | cutmix: 1.0 20 | cutmix_minmax: null 21 | data_dir: /dataset/common/imagenet-raw 22 | dataset: '' 23 | decay_epochs: 100 24 | decay_rate: 0.1 25 | dist_bn: reduce 26 | drop: 0.0 27 | drop_block: null 28 | drop_connect: null 29 | drop_path: 0.05 30 | epoch_repeats: 0.0 31 | epochs: 300 32 | eval_metric: top1 33 | experiment: bs4096 34 | gp: null 35 | hflip: 0.5 36 | img_size: null 37 | initial_checkpoint: '' 38 | input_size: null 39 | interpolation: '' 40 | jsd_loss: false 41 | local_rank: 0 42 | log_interval: 50 43 | log_wandb: false 44 | lr: 0.02121 45 | lr_cycle_decay: 0.5 46 | lr_cycle_limit: 1 47 | lr_cycle_mul: 1.0 48 | lr_k_decay: 1.0 49 | lr_noise: null 50 | lr_noise_pct: 0.67 51 | 
lr_noise_std: 1.0 52 | max_grad_norm: 0.0 53 | mean: null 54 | min_lr: 1.0e-08 55 | mixup: 0.2 56 | mixup_mode: batch 57 | mixup_off_epoch: 0 58 | mixup_prob: 1.0 59 | mixup_switch_prob: 0.5 60 | model: deit_small_patch16_224 61 | model_ema: false 62 | model_ema_decay: 0.9998 63 | model_ema_force_cpu: false 64 | momentum: 0.9 65 | native_amp: false 66 | no_aug: false 67 | no_prefetcher: false 68 | no_prox: false 69 | no_resume_opt: false 70 | num_classes: null 71 | opt: adan 72 | opt_betas: 73 | - 0.98 74 | - 0.92 75 | - 0.99 76 | opt_debug: 5 77 | opt_eps: 1.0e-08 78 | output: ./exp_results/deit-small-bs-test- 79 | patience_epochs: 10 80 | pin_mem: false 81 | pretrained: false 82 | ratio: 83 | - 0.75 84 | - 1.3333333333333333 85 | recount: 1 86 | recovery_interval: 0 87 | remode: pixel 88 | reprob: 0.0 89 | resplit: false 90 | resume: '' 91 | save_images: false 92 | scale: 93 | - 0.08 94 | - 1.0 95 | sched: cosine 96 | seed: 1005 97 | smoothing: 0.1 98 | split_bn: false 99 | start_epoch: null 100 | std: null 101 | sync_bn: false 102 | torchscript: false 103 | train_interpolation: random 104 | train_split: train 105 | tta: 0 106 | use_multi_epochs_loader: false 107 | val_split: validation 108 | validation_batch_size: null 109 | vflip: 0.0 110 | warmup_epochs: 80 111 | warmup_lr: 1.0e-08 112 | weight_decay: 0.02 113 | workers: 10 114 | -------------------------------------------------------------------------------- /CV/timm/exp_results/ViT/small/args_vit-s_300.yaml: -------------------------------------------------------------------------------- 1 | aa: rand-m7-mstd0.5-inc1 2 | amp: true 3 | apex_amp: false 4 | aug_repeats: 0 5 | aug_splits: 0 6 | batch_size: 256 7 | bce_loss: true 8 | bias_decay: false 9 | bn_eps: null 10 | bn_momentum: null 11 | channels_last: false 12 | checkpoint_hist: 2 13 | clip_grad: null 14 | clip_mode: norm 15 | color_jitter: 0.4 16 | cooldown_epochs: 10 17 | crop_pct: null 18 | cutmix: 1.0 19 | cutmix_minmax: null 20 | data_dir: /dataset/common/imagenet-raw 21 | dataset: '' 22 | decay_epochs: 100 23 | decay_rate: 0.1 24 | dist_bn: reduce 25 | drop: 0.0 26 | drop_block: null 27 | drop_connect: null 28 | drop_path: 0.1 29 | epoch_repeats: 0.0 30 | epochs: 300 31 | eval_metric: top1 32 | experiment: e300-wrlr1e8-mlr1e5-lr1d5e2-dp01-mix08-bce 33 | gp: null 34 | hflip: 0.5 35 | img_size: null 36 | initial_checkpoint: '' 37 | input_size: null 38 | interpolation: '' 39 | jsd_loss: false 40 | local_rank: 0 41 | log_interval: 50 42 | log_wandb: false 43 | lr: 0.015 44 | lr_cycle_decay: 0.5 45 | lr_cycle_limit: 1 46 | lr_cycle_mul: 1.0 47 | lr_k_decay: 1.0 48 | lr_noise: null 49 | lr_noise_pct: 0.67 50 | lr_noise_std: 1.0 51 | max_grad_norm: 0.0 52 | mean: null 53 | min_lr: 1.0e-05 54 | mixup: 0.8 55 | mixup_mode: batch 56 | mixup_off_epoch: 0 57 | mixup_prob: 1.0 58 | mixup_switch_prob: 0.5 59 | model: deit_small_patch16_224 60 | model_ema: false 61 | model_ema_decay: 0.9998 62 | model_ema_force_cpu: false 63 | momentum: 0.9 64 | native_amp: false 65 | no_aug: false 66 | no_prefetcher: false 67 | no_prox: false 68 | no_resume_opt: false 69 | num_classes: null 70 | opt: adan 71 | opt_betas: 72 | - 0.98 73 | - 0.92 74 | - 0.99 75 | opt_eps: 1.0e-08 76 | output: ./exp_results/deit-small 77 | patience_epochs: 10 78 | pin_mem: false 79 | pretrained: false 80 | ratio: 81 | - 0.75 82 | - 1.3333333333333333 83 | recount: 1 84 | recovery_interval: 0 85 | remode: pixel 86 | reprob: 0.25 87 | resplit: false 88 | resume: null 89 | save_images: false 90 | scale: 91 | - 0.08 92 | - 1.0 
93 | sched: cosine 94 | seed: 42 95 | smoothing: 0.1 96 | split_bn: false 97 | start_epoch: null 98 | std: null 99 | sync_bn: false 100 | torchscript: false 101 | train_interpolation: random 102 | train_split: train 103 | tta: 0 104 | use_multi_epochs_loader: false 105 | val_split: validation 106 | validation_batch_size: null 107 | vflip: 0.0 108 | warmup_epochs: 60 109 | warmup_lr: 1.0e-08 110 | weight_decay: 0.02 111 | workers: 10 112 | -------------------------------------------------------------------------------- /CV/timm/sam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class SAM(torch.optim.Optimizer): 5 | def __init__(self, params, base_optimizer, rho=0.05, adaptive=False, **kwargs): 6 | assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}" 7 | 8 | defaults = dict(rho=rho, adaptive=adaptive, **kwargs) 9 | super(SAM, self).__init__(params, defaults) 10 | 11 | self.base_optimizer = base_optimizer(self.param_groups, **kwargs) 12 | self.param_groups = self.base_optimizer.param_groups 13 | 14 | @torch.no_grad() 15 | def first_step(self, zero_grad=False): 16 | grad_norm = self._grad_norm() 17 | for group in self.param_groups: 18 | scale = group["rho"] / (grad_norm + 1e-12) 19 | 20 | for p in group["params"]: 21 | if p.grad is None: continue 22 | self.state[p]["old_p"] = p.data.clone() 23 | e_w = (torch.pow(p, 2) if group["adaptive"] else 1.0) * p.grad * scale.to(p) 24 | p.add_(e_w) # climb to the local maximum "w + e(w)" 25 | 26 | if zero_grad: self.zero_grad() 27 | 28 | @torch.no_grad() 29 | def second_step(self, zero_grad=False): 30 | for group in self.param_groups: 31 | for p in group["params"]: 32 | if p.grad is None: continue 33 | p.data = self.state[p]["old_p"] # get back to "w" from "w + e(w)" 34 | 35 | self.base_optimizer.step() # do the actual "sharpness-aware" update 36 | 37 | if zero_grad: self.zero_grad() 38 | 39 | @torch.no_grad() 40 | def step(self, closure=None): 41 | assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided" 42 | closure = torch.enable_grad()(closure) # the closure should do a full forward-backward pass 43 | 44 | self.first_step(zero_grad=True) 45 | closure() 46 | self.second_step() 47 | 48 | def _grad_norm(self): 49 | shared_device = self.param_groups[0]["params"][0].device # put everything on the same device, in case of model parallelism 50 | norm = torch.norm( 51 | torch.stack([ 52 | ((torch.abs(p) if group["adaptive"] else 1.0) * p.grad).norm(p=2).to(shared_device) 53 | for group in self.param_groups for p in group["params"] 54 | if p.grad is not None 55 | ]), 56 | p=2 57 | ) 58 | return norm 59 | 60 | def load_state_dict(self, state_dict): 61 | super().load_state_dict(state_dict) 62 | self.base_optimizer.param_groups = self.param_groups -------------------------------------------------------------------------------- /NLP/BERT/README.md: -------------------------------------------------------------------------------- 1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models 2 | 3 | 4 | 5 | ## Installation of Fairseq 6 | 7 | Our experiment is based on the repo [Fairseq](https://github.com/facebookresearch/fairseq). For the requirements and installation of [Fairseq](https://github.com/facebookresearch/fairseq) and Apex, please refer to that repo. 8 | 9 | 10 | 11 | ## Environment 12 | 13 | Our experiments for this task are based on the following pkg version. 
14 | 15 | ```python 16 | torch.__version__ = '1.10.1+cu111' 17 | torchvision.__version__ = '0.11.2+cu111' 18 | torchaudio.__version__ = '0.10.1+cu111' 19 | fairseq.__version__ = '0.12.2' 20 | ``` 21 | 22 | If you want to strictly follow our environment, please refer to our released docker image [xyxie/adan-image:fairseq](https://hub.docker.com/repository/docker/xyxie/adan-image). 23 | 24 | 25 | 26 | ## Usage of Adan in Fairseq 27 | 28 | ### One step to use Adan 29 | 30 | Please first put the file [`adan.py`](./adan.py) into the directory `path/to/fairseq/fairseq/optim`. Then you can choose Adan as the optimizer in the config file. See the following example for pre-training: 31 | 32 | ```yaml 33 | optimizer: 34 | _name: adan 35 | weight_decay: 0.02 36 | adan_betas: (0.98,0.92,0.99) 37 | adan_eps: 1e-08 38 | ``` 39 | 40 | 41 | 42 | ## Pretraining 43 | 44 | The following steps are modified from [Fairseq-Roberta](https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/README.pretraining.md). For completeness, we list some key steps here. 45 | 46 | 47 | ### 1) Preprocess the data 48 | 49 | Data should be preprocessed following the [language modeling format](https://github.com/facebookresearch/fairseq/tree/main/examples/language_model). That is, each document should be separated by an empty line (only useful with `--sample-break-mode complete_doc`), and all lines should be concatenated as a 1D text stream during training. 50 | 51 | 52 | 53 | In the following steps, we use the [Bookcorpus dataset](https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz) and [Wikipedia](https://en.wikipedia.org/wiki/Wikipedia:Database_download) to demonstrate how to preprocess raw text data with the GPT-2 BPE. 54 | 55 | #### i) Download the dataset: 56 | 57 | ```bash 58 | wget https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz 59 | tar -zxvf books1.tar.gz -C ./bert-corpus/ 60 | ``` 61 | 62 | ```python 63 | # pip install datasets 64 | from datasets import load_dataset 65 | 66 | dataset = load_dataset("wikipedia", "20220301.en") 67 | ``` 68 | 69 | #### ii) Generate raw data: 70 | 71 | - For the Wikipedia dataset, read each line of the JSON-lines file, replace the `\n` in the text field with a space, and write the line (with a `\n` appended) to the new file `all_data.raw`; a minimal sketch is given after this list. 72 | 73 | - For the Bookcorpus dataset, read out the contents of each book, replace each `\n` with a space, and write the content of the book as one line in `all_data.raw`, ending with `\n`. 74 | 75 | - Split `all_data.raw` into `wiki.train.raw` and `wiki.valid.raw` with a ratio of 99:1, and set `wiki.test.raw = wiki.valid.raw` for compatibility with fairseq (the BPE-encoding loop below expects the splits to be named `train`/`valid`/`test`).
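The bullets above can be implemented in a few lines of Python. Below is a minimal, illustrative sketch for the Wikipedia portion and the train/valid/test split; the file names follow the steps above, but the script itself is an assumption rather than the exact one used for our runs:

```python
from datasets import load_dataset  # pip install datasets

dataset = load_dataset("wikipedia", "20220301.en")

# One document per line; newlines inside a document become spaces.
with open("all_data.raw", "w", encoding="utf-8") as f:
    for article in dataset["train"]:
        text = article["text"].replace("\n", " ").strip()
        if text:
            f.write(text + "\n")

# Line-wise 99:1 split; the test split simply mirrors the validation split.
with open("all_data.raw", encoding="utf-8") as f:
    lines = f.readlines()
cut = int(0.99 * len(lines))
with open("wiki.train.raw", "w", encoding="utf-8") as f:
    f.writelines(lines[:cut])
for name in ("wiki.valid.raw", "wiki.test.raw"):
    with open(name, "w", encoding="utf-8") as f:
        f.writelines(lines[cut:])
```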
76 | 77 | 78 | 79 | #### iii) Encode data with the GPT-2 BPE: 80 | 81 | ```bash 82 | mkdir -p gpt2_bpe 83 | wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json 84 | wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe 85 | for SPLIT in train valid test; do \ 86 | python -m examples.roberta.multiprocessing_bpe_encoder \ 87 | --encoder-json gpt2_bpe/encoder.json \ 88 | --vocab-bpe gpt2_bpe/vocab.bpe \ 89 | --inputs bert-corpus/wiki.${SPLIT}.raw \ 90 | --outputs bert-corpus/wiki.${SPLIT}.bpe \ 91 | --keep-empty \ 92 | --workers 60; \ 93 | done 94 | ``` 95 | 96 | 97 | 98 | #### iv) Binarize the data using the GPT-2 fairseq dictionary: 99 | 100 | ```bash 101 | wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt 102 | fairseq-preprocess \ 103 | --only-source \ 104 | --srcdict gpt2_bpe/dict.txt \ 105 | --trainpref bert-corpus/wiki.train.bpe \ 106 | --validpref bert-corpus/wiki.valid.bpe \ 107 | --testpref bert-corpus/wiki.test.bpe \ 108 | --destdir data-bin/bert-corpus \ 109 | --workers 60 110 | ``` 111 | 112 | 113 | 114 | ### 2) Train BERT base 115 | 116 | Put the provided [config files](./config/pretraining) into the directory `path/to/fairseq/examples/roberta/config/pretraining`: 117 | 118 | ```bash 119 | DATA_DIR=/path/to/fairseq/bert-corpus 120 | 121 | fairseq-hydra-train -m --config-dir examples/roberta/config/pretraining \ 122 | --config-name ${NAME} task.data=$DATA_DIR \ 123 | checkpoint.save_dir=/path/to/save_dir/ 124 | 125 | ``` 126 | 127 | We can optionally resume the training of the released BERT-base model by adding `checkpoint.restore_file=/path/to/model.pt`. Note that in our experiments, we use Adan to train BERT-base from scratch. You can use the following config files to train BERT-base with Adam or Adan: 128 | 129 | | NAME | Optimizer | Config | Download | 130 | | :-------: | :-------: | :----------------------------------------------------: | :------------------------------------------------------: | 131 | | bert-base | Adam | [config](./exp_results/pretrain/full_config-adam.yaml) | [log](./exp_results/pretrain/hydra_train-adam.log)/model | 132 | | bert-adan | Adan | [config](./exp_results/pretrain/full_config-adan.yaml) | [log](./exp_results/pretrain/hydra_train-adan.log)/model | 133 | 134 | The above command assumes training on 8x40GB A100 GPUs. Each GPU uses a batch size of 32 sequences (`dataset.batch_size`), so the effective batch size is `num_gpus * dataset.batch_size * dataset.update_freq` sequences. If you have fewer GPUs or GPUs with less memory, you may need to reduce `dataset.batch_size` and increase `dataset.update_freq` to compensate; for example, 4 GPUs with `dataset.batch_size=32` and `dataset.update_freq=2` match the effective batch size (256 sequences) of the default 8-GPU setup. Alternatively, if you have more GPUs, you can decrease `dataset.update_freq` accordingly to improve training speed. 135 | 136 | 137 | ## Finetuning BERT-base on GLUE tasks 138 | 139 | ### 1) Download the data from the [GLUE website](https://gluebenchmark.com/tasks) using the following commands: 140 | ```bash 141 | wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py 142 | python download_glue_data.py --data_dir glue_data --tasks all 143 | ``` 144 | There are some problems with downloading `MRPC` and `MNLI`, hence we skip the `MRPC` task and download the `MNLI` data from unofficial sources. 145 | 146 | 147 | 148 | ### 2) Preprocess GLUE task data: 149 | 150 | ```bash 151 | ./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name> 152 | ``` 153 | - `glue_task_name` is one of the following: `{ALL, QQP, MNLI, QNLI, RTE, STS-B, SST-2, CoLA}`. Use `ALL` to preprocess all the GLUE tasks; an example invocation is given below.
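For instance, assuming the `glue_data` directory produced by `download_glue_data.py` above, binarizing only the `RTE` task used in the fine-tuning example below would look like this (a sketch following the fairseq RoBERTa GLUE instructions):

```bash
./examples/roberta/preprocess_GLUE_tasks.sh glue_data RTE
```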
154 | 155 | 156 | 157 | ### 3) Fine-tuning on a GLUE task: 158 | 159 | Example fine-tuning cmd for the `RTE` task: 160 | ```bash 161 | TASK=RTE; 162 | 163 | python path/to/fairseq/examples/roberta/config/finetuning/acc_test.py --avg_num 1 \ 164 | --data_path /path/to/fairseq/GLUE/glue_data/$TASK \ 165 | --bin_path /path/to/fairseq/GLUE/$TASK-bin \ 166 | --pre_path /path/to/fairseq/bert-adan/checkpoint_best.pt \ 167 | --finetune_path /path/to/fairseq/bert-fintune/adan/$TASK/ \ 168 | --task rte-adan 169 | ``` 170 | 171 | - `avg_num`: number of repetitions. 172 | 173 | - `data_path`: path to the data of the GLUE task, e.g., CoLA, MNLI, etc. 174 | 175 | - `bin_path`: similar to `data_path`, but the path to the binarized data after preprocessing. 176 | 177 | - `pre_path`: path to the pre-trained model. 178 | 179 | - `finetune_path`: path to save/load the fine-tuned model. 180 | 181 | - `task`: config name; please refer to the [fine-tuning](./config/finetuning) directory for the additional config files for each of the GLUE tasks. 182 | 183 | - These cmd-args and hyperparams were tested on one Nvidia `A100` GPU with `40GB` of memory for each task. Depending on the GPU memory resources available to you, you can increase `--update-freq` and reduce `--batch-size`. 184 | 185 | 186 | 187 | ### 4) Inference on a GLUE task 188 | After training the model in the previous step, we can perform inference with the checkpoints in the `finetune_path` directory using the following command: 189 | 190 | ```bash 191 | TASK=RTE; 192 | 193 | python path/to/fairseq/examples/roberta/config/finetuning/acc_test.py --inference \ 194 | --data_path /path/to/fairseq/GLUE/glue_data/$TASK \ 195 | --bin_path /path/to/fairseq/GLUE/$TASK-bin \ 196 | --pre_path /path/to/fairseq/bert-adan/checkpoint_best.pt \ 197 | --finetune_path /path/to/fairseq/bert-fintune/adan/$TASK/ \ 198 | --task rte-adan 199 | 200 | ``` 201 | 202 | This should give: 203 | 204 | | GLUE-Task | Metric | Result | Config | 205 | | --------- | :--------------------------- | :-------: | :-------------------------------------------: | 206 | | CoLA | Matthews corr. | 64.6 | [config](./config/finetuning/cola-adan.yaml) | 207 | | SST-2 | Accuracy | 93.2 | [config](./config/finetuning/sst_2-adan.yaml) | 208 | | STS-B | Pearson corr. | 89.3 | [config](./config/finetuning/sts_b-adan.yaml) | 209 | | QQP | Accuracy | 91.2 | [config](./config/finetuning/qqp-adan.yaml) | 210 | | MNLI | Matched acc./Mismatched acc. | 85.7/85.6 | [config](./config/finetuning/mnli-adan.yaml) | 211 | | QNLI | Accuracy | 91.3 | [config](./config/finetuning/qnli-adan.yaml) | 212 | | RTE | Accuracy | 73.3 | [config](./config/finetuning/rte-adan.yaml) | 213 | 214 | -------------------------------------------------------------------------------- /NLP/BERT/adan.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Garena Online Private Limited 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
-------------------------------------------------------------------------------- /NLP/BERT/adan.py: --------------------------------------------------------------------------------
1 | # Copyright 2022 Garena Online Private Limited
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import logging
17 | import math
18 | from collections.abc import Collection
19 | from dataclasses import dataclass, field
20 | from typing import Any, List
21 | 
22 | import torch
23 | import torch.distributed as dist
24 | import torch.optim
25 | from fairseq.dataclass import FairseqDataclass
26 | from fairseq.optim import FairseqOptimizer, register_optimizer
27 | from omegaconf import II, OmegaConf
28 | 
29 | 
30 | logger = logging.getLogger(__name__)
31 | 
32 | 
33 | @dataclass
34 | class FairseqAdanConfig(FairseqDataclass):
35 |     adan_betas: Any = field(
36 |         default=(0.98, 0.92, 0.99), metadata={"help": "betas for Adan optimizer"}
37 |     )
38 |     adan_eps: float = field(
39 |         default=1e-8, metadata={"help": "epsilon for Adan optimizer"}
40 |     )
41 |     weight_decay: float = field(default=0.0, metadata={"help": "weight decay"})
42 | 
43 |     no_prox: bool = field(
44 |         default=False, metadata={"help": "whether to skip the proximal operator for weight decay"}
45 |     )
46 |     fp16_adan_stats: bool = field(
47 |         default=False, metadata={"help": "use FP16 stats (with automatic scaling)"}
48 |     )
49 |     # TODO common vars below in parent
50 |     tpu: bool = II("common.tpu")
51 |     lr: List[float] = II("optimization.lr")
52 | 
53 | 
54 | @register_optimizer("adan", dataclass=FairseqAdanConfig)
55 | class FairseqAdan(FairseqOptimizer):
56 |     """
57 |     Adan optimizer for fairseq.
58 |     """
59 | 
60 |     def __init__(self, cfg: FairseqAdanConfig, params):
61 |         super().__init__(cfg)
62 |         fused_adan_cls = None  # no fused CUDA Adan kernel is wired into this fairseq port
63 |         use_fused_adan = (
64 |             fused_adan_cls is not None
65 |             and torch.cuda.is_available()
66 |         )
67 |         if getattr(cfg, "tpu", False):
68 |             if self.cfg.fp16_adan_stats:
69 |                 raise NotImplementedError("--fp16-adan-stats is only supported on GPU")
70 |             # on TPUs we use the pure-PyTorch Adan defined here, since it
71 |             # automatically casts gradients to FP32
72 |             self._optimizer = Adan(params, **self.optimizer_config)
73 |         elif use_fused_adan:
74 |             raise NotImplementedError("fused Adan is not available in this port")
75 |         else:
76 |             if self.cfg.fp16_adan_stats:
77 |                 raise NotImplementedError(
78 |                     "--fp16-adan-stats is only supported with FusedAdanV1"
79 |                 )
80 |             self._optimizer = Adan(params, **self.optimizer_config)
81 | 
82 |     @property
83 |     def optimizer_config(self):
84 |         """
85 |         Return a kwarg dictionary that will be used to override optimizer
86 |         args stored in checkpoints. This allows us to load a checkpoint and
87 |         resume training using a different set of optimizer args, e.g., with a
88 |         different learning rate.
89 |         """
90 |         return {
91 |             "lr": self.cfg.lr[0]
92 |             if isinstance(self.cfg.lr, Collection)
93 |             else self.cfg.lr,
94 |             "betas": eval(self.cfg.adan_betas)
95 |             if isinstance(self.cfg.adan_betas, str)
96 |             else OmegaConf.to_container(self.cfg.adan_betas),
97 |             "eps": self.cfg.adan_eps,
98 |             "weight_decay": self.cfg.weight_decay,
99 |         }
100 | 
101 |     def average_params(self):
102 |         """Reduce Params is only used during BMUF distributed training."""
103 |         state_dict = self.optimizer.state_dict()
104 |         total_gpus = float(dist.get_world_size())
105 | 
106 |         for _, value in state_dict["state"].items():
107 |             value["exp_avg"] /= total_gpus
108 |             value["exp_avg_sq"] /= total_gpus
109 |             value["exp_avg_diff"] /= total_gpus
110 |             dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM)
111 |             dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM)
112 |             dist.all_reduce(value["exp_avg_diff"], op=dist.ReduceOp.SUM)
113 | 
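# Illustrative cross-reference (not part of the original file): with the
# optimizer registered as "adan" above, a fairseq config selects it via,
# e.g. (cf. config/pretraining/bert-adan.yaml in this repo):
#
#   optimizer:
#     _name: adan
#     weight_decay: 0.02
#     adan_betas: (0.98,0.92,0.99)
#     adan_eps: 1e-08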
114 | 
115 | class Adan(torch.optim.Optimizer):
116 |     r"""Implements the Adan algorithm.
117 | 
118 |     Args:
119 |         params (iterable): iterable of parameters to optimize or dicts defining
120 |             parameter groups
121 |         lr (float, optional): learning rate (default: 1e-3)
122 |         betas (Tuple[float, float, float], optional): coefficients used for computing
123 |             running averages of the gradient, its difference, and its square (default: (0.98, 0.92, 0.99))
124 |         eps (float, optional): term added to the denominator to improve
125 |             numerical stability (default: 1e-8)
126 |         weight_decay (float, optional): decoupled weight decay (default: 0)
127 |     """
128 |     def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8,
129 |                  weight_decay=0.0, no_prox=False):
130 |         defaults = dict(lr=lr, betas=betas, eps=eps,
131 |                         weight_decay=weight_decay, no_prox=no_prox)
132 |         super(Adan, self).__init__(params, defaults)
133 | 
134 |     def __setstate__(self, state):
135 |         super(Adan, self).__setstate__(state)
136 |         for group in self.param_groups:
137 |             group.setdefault('no_prox', False)
138 | 
139 |     @property
140 |     def supports_memory_efficient_fp16(self):
141 |         return True
142 | 
143 |     @property
144 |     def supports_flat_params(self):
145 |         return True
146 | 
147 |     def step(self, closure=None):
148 |         """Performs a single optimization step.
149 |         Arguments:
150 |             closure (callable, optional): A closure that reevaluates the model
151 |                 and returns the loss.
152 |         """
153 |         loss = None
154 |         if closure is not None:
155 |             loss = closure()
156 | 
157 | 
158 |         for group in self.param_groups:
159 |             beta1, beta2, beta3 = group['betas']
160 |             # assume the same step count across the group for now, to simplify things;
161 |             # a per-parameter step could easily be supported by making it a tensor, or by passing a list into the kernel
162 |             if 'step' in group:
163 |                 group['step'] += 1
164 |             else:
165 |                 group['step'] = 1
166 | 
167 | 
168 |             bias_correction1 = 1.0 - beta1 ** group['step']
169 | 
170 |             bias_correction2 = 1.0 - beta2 ** group['step']
171 | 
172 |             bias_correction3 = 1.0 - beta3 ** group['step']
173 | 
174 |             for p in group['params']:
175 |                 if p.grad is None:
176 |                     continue
177 | 
178 |                 p_data_fp32 = p.data
179 |                 if p.data.dtype in {torch.float16, torch.bfloat16}:
180 |                     p_data_fp32 = p_data_fp32.float()
181 | 
182 |                 state = self.state[p]
183 |                 if len(state) == 0:
184 |                     state['exp_avg'] = torch.zeros_like(p_data_fp32)
185 |                     state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
186 |                     state['exp_avg_diff'] = torch.zeros_like(p_data_fp32)
187 |                 else:
188 |                     state["exp_avg"] = state["exp_avg"].to(p_data_fp32)
189 |                     state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32)
190 |                     state['exp_avg_diff'] = state['exp_avg_diff'].to(p_data_fp32)
191 | 
192 | 
193 |                 grad = p.grad.data
194 |                 if grad.dtype in {torch.float16, torch.bfloat16}:
195 |                     grad = grad.float()
196 |                 if grad.is_sparse:
197 |                     raise RuntimeError(
198 |                         "Adan does not support sparse gradients, please consider SparseAdam instead"
199 |                     )
200 | 
201 |                 if 'pre_grad' not in state or group['step'] == 1:
202 |                     state['pre_grad'] = grad  # at the first step the gradient difference below is zero
203 | 
204 | 
205 |                 copy_grad = grad.clone()  # saved as pre_grad after the update
206 | 
207 | 
208 |                 exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff']
209 |                 diff = grad - state['pre_grad']
210 | 
211 | 
212 |                 update = grad + beta2 * diff  # g_t + beta2 * (g_t - g_{t-1})
213 |                 exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)  # m_t
214 |                 exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2)  # diff_t
215 |                 exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3)  # v_t
216 | 
217 |                 denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps'])
218 |                 update = 
((exp_avg/bias_correction1+beta2*exp_avg_diff/bias_correction2) ).div_(denom) 219 | 220 | if group['no_prox']: 221 | p_data_fp32.mul_(1 - group['lr'] * group['weight_decay']) 222 | p_data_fp32.add_(update, alpha=-group['lr']) 223 | else: 224 | p_data_fp32.add_(update, alpha=-group['lr']) 225 | p_data_fp32.div_(1 + group['lr'] * group['weight_decay']) 226 | 227 | state['pre_grad'] = copy_grad 228 | 229 | if p.data.dtype in {torch.float16, torch.bfloat16}: 230 | p.data.copy_(p_data_fp32) 231 | return loss 232 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/acc_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from fairseq.models.roberta import RobertaModel 3 | import argparse 4 | from scipy.stats import pearsonr 5 | from sklearn.metrics import matthews_corrcoef 6 | 7 | 8 | def get_acc(model_path, data_path, bin_path, task='rte'): 9 | acc_list = [] 10 | gold, pred = [], [] 11 | roberta = RobertaModel.from_pretrained( 12 | model_path, 13 | checkpoint_file='checkpoint_best.pt', 14 | data_name_or_path=bin_path#'RTE-bin' 15 | ) 16 | 17 | label_fn = lambda label: roberta.task.label_dictionary.string( 18 | [label + roberta.task.label_dictionary.nspecial] 19 | ) 20 | ncorrect, nsamples = 0, 0 21 | roberta.cuda() 22 | roberta.eval() 23 | if 'mnli' not in task: 24 | dev_files = ['dev.tsv'] 25 | else: dev_files = ['dev_mismatched.tsv', 'dev_matched.tsv'] 26 | for dev_file in dev_files: 27 | with open(os.path.join(data_path, dev_file)) as fin: 28 | fin.readline() 29 | for index, line in enumerate(fin): 30 | tokens = line.strip().split('\t') 31 | if 'rte' in task or 'qnli' in task: 32 | sent1, sent2, target = tokens[1], tokens[2], tokens[3] 33 | tokens = roberta.encode(sent1, sent2) 34 | elif 'qqp' in task: 35 | sent1, sent2, target = tokens[3], tokens[4], tokens[5] 36 | tokens = roberta.encode(sent1, sent2) 37 | elif 'mnli' in task: 38 | sent1, sent2, target = tokens[8], tokens[9], tokens[11] 39 | tokens = roberta.encode(sent1, sent2) 40 | elif 'mrpc' in task: 41 | sent1, sent2, target = tokens[3], tokens[4], tokens[0] 42 | tokens = roberta.encode(sent1, sent2) 43 | elif 'sts_b' in task: 44 | sent1, sent2, target = tokens[7], tokens[8], float(tokens[9]) 45 | tokens = roberta.encode(sent1, sent2) 46 | elif 'sst_2' in task: 47 | sent, target = tokens[0], tokens[1] 48 | tokens = roberta.encode(sent) 49 | 50 | elif 'cola' in task: 51 | sent, target = tokens[3], tokens[1] 52 | tokens = roberta.encode(sent) 53 | if 'sts_b' not in task: 54 | prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() 55 | prediction_label = label_fn(prediction) 56 | ncorrect += int(prediction_label == target) 57 | 58 | nsamples += 1 59 | if 'cola' in task: 60 | target = int(target) 61 | prediction_label = int(prediction_label) 62 | pred.append(prediction_label) 63 | gold.append(target) 64 | 65 | else: 66 | features = roberta.extract_features(tokens) 67 | predictions = 5.0 * roberta.model.classification_heads['sentence_classification_head'](features) 68 | gold.append(target) 69 | pred.append(predictions.item()) 70 | if 'cola' in task: 71 | out = matthews_corrcoef(gold, pred) 72 | elif 'sts_b' in task: 73 | out = pearsonr(gold, pred)[0] 74 | else: out = float(ncorrect)/float(nsamples) 75 | 76 | acc_list.append(out) 77 | return acc_list 78 | 79 | 80 | parser = argparse.ArgumentParser(description='GLUE test for acc') 81 | parser.add_argument('--avg_num', type=int, default=1, 82 | 
help='number of try') 83 | parser.add_argument('--pre_path', type=str, default='./baseline/checkpoint_20_1000000.pt', 84 | help='path to pre-trained model') 85 | parser.add_argument('--data_path', type=str, default='./GLUE/glue_data/STS-B', 86 | help='path to data') 87 | parser.add_argument('--bin_path', type=str, default='./GLUE/STS-B-bin', 88 | help='path to -bin data') 89 | parser.add_argument('--finetune_path', type=str, default='./bert-fintune/adam/STS-B/', 90 | help='path to finetuned model') 91 | parser.add_argument('--task', type=str, default='sts_b', 92 | help='task of finetune') 93 | parser.add_argument('--inference', action='store_true', default=False, 94 | help='inference only') 95 | args = parser.parse_args() 96 | 97 | 98 | acc_avg = 0.0 99 | acc_avg2 = 0.0 100 | for _ in range(args.avg_num): 101 | if not args.inference: 102 | val = os.system(' fairseq-hydra-train --config-dir ./fairseq/examples/roberta/config/finetuning \ 103 | --config-name {} \ 104 | task.data={} checkpoint.restore_file={} \ 105 | checkpoint.save_dir={}'.format(args.task, args.bin_path, args.pre_path, args.finetune_path)) 106 | all_acc = get_acc(args.finetune_path, args.data_path, args.bin_path, args.task) 107 | acc_avg+=all_acc[0] 108 | if len(all_acc)>1: 109 | acc_avg2+=all_acc[1] 110 | 111 | if acc_avg2>0: 112 | print('Mismatched Accuracy1:{}, Matched Accuracy1:{}'.format(float(acc_avg)/float(args.avg_num), float(acc_avg2)/float(args.avg_num))) 113 | else: 114 | print('AVG Accuracy1:{}'.format(float(acc_avg)/float(args.avg_num))) 115 | 116 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/cola-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.01 43 | adan_betas: (0.98,0.99,0.99) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 320 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [4e-05] 53 | max_update: 5336 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/cola.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 320 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [1e-05] 53 | max_update: 5336 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/mnli-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 3 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.01 43 | adan_betas: (0.98,0.92,0.999) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 7432 49 | 50 | optimization: 51 | clip_norm: 1.0 52 | lr: [2.0e-05] 53 | max_update: 123873 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/mnli.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 3 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 32 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 7432 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [1e-05] 53 | max_update: 123873 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/qnli-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.001 43 | adan_betas: (0.98,0.99,0.99) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: cosine 48 | warmup_updates: 1986 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [2e-05] 53 | max_update: 33112 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/qnli.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 32 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 1986 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [1e-05] 53 | max_update: 33112 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/qqp-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.001 43 | adan_betas: (0.98,0.99,0.99) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 28318 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [4e-05] 53 | max_update: 113272 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/qqp.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 32 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 28318 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [1e-05] 53 | max_update: 113272 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/rte-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.01 43 | adan_betas: (0.98,0.99,0.99) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 122 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [2e-05] 53 | max_update: 2036 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/rte.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 16 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 122 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [2e-05] 53 | max_update: 2036 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/sst_2-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 32 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adan 42 | weight_decay: 0.01 43 | adan_betas: (0.98,0.92,0.99) 44 | adan_eps: 1e-08 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 1256 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [4e-05] 53 | max_update: 20935 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/sst_2.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 2 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | best_checkpoint_metric: accuracy 25 | maximize_best_checkpoint_metric: true 26 | no_epoch_checkpoints: true 27 | 28 | distributed_training: 29 | find_unused_parameters: true 30 | distributed_world_size: 1 31 | 32 | criterion: 33 | _name: sentence_prediction 34 | 35 | dataset: 36 | batch_size: 32 37 | required_batch_size_multiple: 1 38 | max_tokens: 4400 39 | 40 | optimizer: 41 | _name: adam 42 | weight_decay: 0.1 43 | adam_betas: (0.9,0.98) 44 | adam_eps: 1e-06 45 | 46 | lr_scheduler: 47 | _name: polynomial_decay 48 | warmup_updates: 1256 49 | 50 | optimization: 51 | clip_norm: 0.0 52 | lr: [1e-05] 53 | max_update: 20935 54 | max_epoch: 10 55 | 56 | model: 57 | _name: roberta 58 | dropout: 0.1 59 | attention_dropout: 0.1 60 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/sts_b-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 1 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | no_epoch_checkpoints: true 25 | 26 | distributed_training: 27 | find_unused_parameters: true 28 | distributed_world_size: 1 29 | 30 | criterion: 31 | _name: sentence_prediction 32 | regression_target: true 33 | 34 | dataset: 35 | batch_size: 16 36 | required_batch_size_multiple: 1 37 | max_tokens: 4400 38 | 39 | optimizer: 40 | _name: adan 41 | weight_decay: 0.01 42 | adan_betas: (0.98,0.99,0.99) 43 | adan_eps: 1e-8 44 | 45 | lr_scheduler: 46 | _name: cosine 47 | warmup_updates: 214 48 | 49 | optimization: 50 | clip_norm: 0.5 51 | lr: [4e-05] 52 | max_update: 3598 53 | max_epoch: 10 54 | 55 | model: 56 | _name: roberta 57 | dropout: 0.1 58 | attention_dropout: 0.1 59 | -------------------------------------------------------------------------------- /NLP/BERT/config/finetuning/sts_b.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | fp16_init_scale: 4 6 | threshold_loss_scale: 1 7 | fp16_scale_window: 128 8 | log_format: json 9 | log_interval: 200 10 | 11 | task: 12 | _name: sentence_prediction 13 | data: ??? 14 | init_token: 0 15 | separator_token: 2 16 | num_classes: 1 17 | max_positions: 512 18 | 19 | checkpoint: 20 | restore_file: ??? 
21 | reset_optimizer: true 22 | reset_dataloader: true 23 | reset_meters: true 24 | no_epoch_checkpoints: true 25 | 26 | distributed_training: 27 | find_unused_parameters: true 28 | distributed_world_size: 1 29 | 30 | criterion: 31 | _name: sentence_prediction 32 | regression_target: true 33 | 34 | dataset: 35 | batch_size: 16 36 | required_batch_size_multiple: 1 37 | max_tokens: 4400 38 | 39 | optimizer: 40 | _name: adam 41 | weight_decay: 0.1 42 | adam_betas: (0.9,0.98) 43 | adam_eps: 1e-06 44 | 45 | lr_scheduler: 46 | _name: polynomial_decay 47 | warmup_updates: 214 48 | 49 | optimization: 50 | clip_norm: 0.0 51 | lr: [2e-05] 52 | max_update: 3598 53 | max_epoch: 10 54 | 55 | model: 56 | _name: roberta 57 | dropout: 0.1 58 | attention_dropout: 0.1 59 | -------------------------------------------------------------------------------- /NLP/BERT/config/pretraining/base.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: true 4 | log_format: json 5 | log_interval: 200 6 | 7 | checkpoint: 8 | no_epoch_checkpoints: true 9 | 10 | task: 11 | _name: masked_lm 12 | data: ??? 13 | sample_break_mode: complete 14 | tokens_per_sample: 512 15 | 16 | criterion: masked_lm 17 | 18 | dataset: 19 | batch_size: 16 20 | ignore_unused_valid_subsets: true 21 | 22 | optimizer: 23 | _name: adam 24 | weight_decay: 0.01 25 | adam_betas: (0.9,0.98) 26 | adam_eps: 1e-06 27 | 28 | lr_scheduler: 29 | _name: polynomial_decay 30 | warmup_updates: 10000 31 | 32 | optimization: 33 | clip_norm: 0 34 | lr: [0.0005] 35 | max_update: 125000 36 | update_freq: [16] 37 | 38 | model: 39 | _name: roberta 40 | max_positions: 512 41 | dropout: 0.1 42 | attention_dropout: 0.1 43 | -------------------------------------------------------------------------------- /NLP/BERT/config/pretraining/bert-adan.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: true 4 | log_format: json 5 | log_interval: 200 6 | 7 | checkpoint: 8 | no_epoch_checkpoints: true 9 | save_interval: 5 10 | save_interval_updates: 50000 11 | 12 | task: 13 | _name: masked_lm 14 | data: ??? 15 | sample_break_mode: complete 16 | tokens_per_sample: 512 17 | 18 | criterion: masked_lm 19 | 20 | 21 | 22 | optimizer: 23 | _name: adan 24 | weight_decay: 0.02 25 | adan_betas: (0.98,0.92,0.99) 26 | adan_eps: 1e-08 27 | 28 | lr_scheduler: 29 | _name: polynomial_decay 30 | warmup_updates: 10000 31 | 32 | optimization: 33 | clip_norm: 5.0 34 | lr: [0.001] 35 | max_update: 1000000 36 | update_freq: [1] 37 | 38 | model: 39 | _name: roberta 40 | max_positions: 512 41 | dropout: 0.1 42 | attention_dropout: 0.1 43 | 44 | distributed_training: 45 | ddp_backend: no_c10d 46 | 47 | dataset: 48 | skip_invalid_size_inputs_valid_test: true 49 | validate_interval: 5 50 | validate_interval_updates: 50000 51 | batch_size: 32 52 | ignore_unused_valid_subsets: true 53 | -------------------------------------------------------------------------------- /NLP/BERT/config/pretraining/bert-base.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | common: 3 | fp16: true 4 | log_format: json 5 | log_interval: 200 6 | 7 | checkpoint: 8 | save_dir: 'bert/baseline/' 9 | no_epoch_checkpoints: true 10 | save_interval: 5 11 | save_interval_updates: 50000 12 | 13 | task: 14 | _name: masked_lm 15 | data: ??? 
16 |   sample_break_mode: complete
17 |   tokens_per_sample: 512
18 | 
19 | criterion: masked_lm
20 | 
21 | 
22 | 
23 | optimizer:
24 |   _name: adam
25 |   weight_decay: 0.01
26 |   adam_betas: (0.9,0.98)
27 |   adam_eps: 1e-06
28 | 
29 | lr_scheduler:
30 |   _name: polynomial_decay
31 |   warmup_updates: 10000
32 | 
33 | optimization:
34 |   clip_norm: 0
35 |   lr: [0.0001]
36 |   max_update: 1000000
37 |   update_freq: [1]
38 | 
39 | model:
40 |   _name: roberta
41 |   max_positions: 512
42 |   dropout: 0.1
43 |   attention_dropout: 0.1
44 | 
45 | distributed_training:
46 |   ddp_backend: no_c10d
47 | 
48 | dataset:
49 |   skip_invalid_size_inputs_valid_test: true
50 |   validate_interval: 5
51 |   validate_interval_updates: 50000
52 |   batch_size: 32
53 |   ignore_unused_valid_subsets: true
54 | 
55 | 
-------------------------------------------------------------------------------- /NLP/Transformer-XL/README.md: --------------------------------------------------------------------------------
1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
2 | 
3 | We first provide the instructions to modify the official training files from [Transformer-XL](https://github.com/kimiyoung/transformer-xl) to support Adan. **For data preparation, please follow that repo.**
4 | 
5 | ## Environment
6 | 
7 | As recommended by the official [Transformer-XL](https://github.com/kimiyoung/transformer-xl) repo, our experiments for this task are based on the following package version:
8 | 
9 | ```python
10 | torch.__version__ = '1.1.0'
11 | ```
12 | 
13 | ## Usage of Adan for Transformer-XL
14 | 
15 | ### Two steps to use Adan
16 | 
17 | **Step 1.** Add the following parameters to the file `train.py`.
18 | 
19 | ```python
20 | parser.add_argument('--optim', default='adam', type=str, choices=['adam', 'sgd', 'adagrad', 'adan'], help='optimizer to use.')
21 | parser.add_argument('--wd', type=float, default=0.02, help='weight decay (default: 0.02)')
22 | parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', help='Optimizer Betas (default: None, use opt default)')
23 | ```
24 | 
25 | - `optim`: the choice of optimizers. We add Adan to the choices.
26 | 
27 | - `wd`: decoupled weight decay.
28 | 
29 | - `opt-betas`: optimizer betas for Adan.
30 | 
31 | **Step 2.** Replace the original optimizer creation with the following; the `elif` extends the existing optimizer-selection chain in `train.py` (see the sketch after this code block):
32 | 
33 | ```python
34 | from adan import Adan
35 | 
36 | elif args.optim.lower() == 'adan':
37 |     if args.sample_softmax > 0:
38 |         dense_params, sparse_params = [], []
39 |         for param in model.parameters():
40 |             if param.size() == model.word_emb.weight.size():
41 |                 sparse_params.append(param)
42 |             else:
43 |                 dense_params.append(param)
44 |         optimizer_sparse = Adan(sparse_params, betas=args.opt_betas, lr=args.lr, weight_decay=args.wd)
45 |         optimizer = Adan(dense_params, lr=args.lr, betas=args.opt_betas, weight_decay=args.wd)
46 |     else:
47 |         optimizer = Adan(model.parameters(), lr=args.lr, betas=args.opt_betas, weight_decay=args.wd)
48 | 
49 | ```
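For context, here is how the added branch slots into `train.py`'s optimizer-selection chain. This is a minimal sketch; the `adam` branch shown is illustrative and the exact stock code may differ:

```python
import torch.optim as optim
from adan import Adan

if args.optim.lower() == 'adam':
    # stock branch (illustrative): sparse embedding params get SparseAdam
    if args.sample_softmax > 0:
        optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr)
        optimizer = optim.Adam(dense_params, lr=args.lr)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
elif args.optim.lower() == 'adan':
    # branch added in Step 2 above; dense_params / sparse_params
    # are built exactly as in the Step 2 snippet
    ...
```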
50 | 
51 | ## Data Preparation
52 | 
53 | See `bash getdata.sh` in the [Transformer-XL](https://github.com/kimiyoung/transformer-xl) repo.
54 | 
55 | ## Training and Evaluation
56 | 
57 | - #### Training
58 | 
59 |   `bash run_wt103_adan.sh train --work_dir PATH_TO_WORK_DIR`
60 | 
61 | - #### Evaluation
62 | 
63 |   `bash run_wt103_adan.sh eval --work_dir PATH_TO_WORK_DIR`
64 | 
65 | - #### Tips for Experiments
66 | 
67 |   - For Adan, we set `args.wd = 0.02` for all steps, which is consistent with the other experiments.
68 |   - For the experiment using `steps = 50k`, we choose a slightly larger `LR`.
69 | 
70 | ## Results and Logs
71 | 
72 | With different settings for `lr` and `max_step` in `run_wt103_adan.sh`, we obtain the following results:
73 | 
74 | |                     |   LR   | Steps | Test PPL |                 Download                 |
75 | | ------------------- | :----: | :---: | :------: | :--------------------------------------: |
76 | | Baseline (Adam)     | 2.5e-4 | 200k  |   24.2   | [log&config](./exp_results/log-adam.txt) |
77 | | Transformer-XL-base | 1.5e-3 |  50k  |   26.2   | [log&config](./exp_results/log-50k.txt)  |
78 | | Transformer-XL-base |  1e-3  | 100k  |   24.2   | [log&config](./exp_results/log-100k.txt) |
79 | | Transformer-XL-base |  1e-3  | 200k  |   23.5   | [log&config](./exp_results/log-200k.txt) |
-------------------------------------------------------------------------------- /NLP/Transformer-XL/eval.py: --------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import argparse
3 | import time
4 | import math
5 | import os, sys
6 | 
7 | import torch
8 | 
9 | from data_utils import get_lm_corpus
10 | from mem_transformer import MemTransformerLM
11 | from utils.exp_utils import get_logger
12 | 
13 | parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
14 | parser.add_argument('--data', type=str, default='../data/wikitext-103',
15 |                     help='location of the data corpus')
16 | parser.add_argument('--dataset', type=str, default='wt103',
17 |                     choices=['wt103', 'lm1b', 'enwik8', 'text8'],
18 |                     help='dataset name')
19 | parser.add_argument('--split', type=str, default='all',
20 |                     choices=['all', 'valid', 'test'],
21 |                     help='which split to evaluate')
22 | parser.add_argument('--batch_size', type=int, default=10,
23 |                     help='batch size')
24 | parser.add_argument('--tgt_len', type=int, default=5,
25 |                     help='number of tokens to predict')
26 | parser.add_argument('--ext_len', type=int, default=0,
27 |                     help='length of the extended context')
28 | parser.add_argument('--mem_len', type=int, default=0,
29 |                     help='length of the retained previous heads')
30 | parser.add_argument('--clamp_len', type=int, default=-1,
31 |                     help='max positional embedding index')
32 | parser.add_argument('--cuda', action='store_true',
33 |                     help='use CUDA')
34 | parser.add_argument('--work_dir', type=str, required=True,
35 |                     help='path to the work_dir')
36 | parser.add_argument('--no_log', action='store_true',
37 |                     help='do not log the eval result')
38 | parser.add_argument('--same_length', action='store_true',
39 |                     help='set same length attention with masking')
40 | args = parser.parse_args()
41 | assert args.ext_len >= 0, 'extended context length must be non-negative'
42 | 
43 | device = torch.device("cuda" if args.cuda else "cpu")
44 | 
45 | # Get logger
46 | logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
47 |                      log_=not args.no_log)
48 | 
49 | # Load dataset
50 | corpus = get_lm_corpus(args.data, args.dataset)
51 | ntokens = len(corpus.vocab)
52 | 
53 | va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
54 |                               device=device, ext_len=args.ext_len)
55 | te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
56 |                               device=device, ext_len=args.ext_len)
57 | 
58 | # Load the best saved model.
59 | with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f: 60 | model = torch.load(f) 61 | model.backward_compatible() 62 | model = model.to(device) 63 | 64 | logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( 65 | args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) 66 | 67 | model.reset_length(args.tgt_len, args.ext_len, args.mem_len) 68 | if args.clamp_len > 0: 69 | model.clamp_len = args.clamp_len 70 | if args.same_length: 71 | model.same_length = True 72 | 73 | ############################################################################### 74 | # Evaluation code 75 | ############################################################################### 76 | def evaluate(eval_iter): 77 | # Turn on evaluation mode which disables dropout. 78 | model.eval() 79 | total_len, total_loss = 0, 0. 80 | start_time = time.time() 81 | with torch.no_grad(): 82 | mems = tuple() 83 | for idx, (data, target, seq_len) in enumerate(eval_iter): 84 | ret = model(data, target, *mems) 85 | loss, mems = ret[0], ret[1:] 86 | loss = loss.mean() 87 | total_loss += seq_len * loss.item() 88 | total_len += seq_len 89 | total_time = time.time() - start_time 90 | logging('Time : {:.2f}s, {:.2f}ms/segment'.format( 91 | total_time, 1000 * total_time / (idx+1))) 92 | return total_loss / total_len 93 | 94 | # Run on test data. 95 | if args.split == 'all': 96 | test_loss = evaluate(te_iter) 97 | valid_loss = evaluate(va_iter) 98 | elif args.split == 'valid': 99 | valid_loss = evaluate(va_iter) 100 | test_loss = None 101 | elif args.split == 'test': 102 | test_loss = evaluate(te_iter) 103 | valid_loss = None 104 | 105 | def format_log(loss, split): 106 | if args.dataset in ['enwik8', 'text8']: 107 | log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format( 108 | split, loss, loss / math.log(2)) 109 | else: 110 | log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( 111 | split, loss, math.exp(loss)) 112 | return log_str 113 | 114 | log_str = '' 115 | if valid_loss is not None: 116 | log_str += format_log(valid_loss, 'valid') 117 | if test_loss is not None: 118 | log_str += format_log(test_loss, 'test') 119 | 120 | logging('=' * 100) 121 | logging(log_str) 122 | logging('=' * 100) 123 | -------------------------------------------------------------------------------- /NLP/Transformer-XL/run_wt103_adan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $1 == 'train' ]]; then 4 | echo 'Run training...' 5 | python train.py \ 6 | --cuda \ 7 | --data /root/autodl-tmp/data/wikitext-103/ \ 8 | --dataset wt103 \ 9 | --adaptive \ 10 | --n_layer 16 \ 11 | --d_model 410 \ 12 | --n_head 10 \ 13 | --d_head 41 \ 14 | --d_inner 2100 \ 15 | --dropout 0.1 \ 16 | --dropatt 0.0 \ 17 | --optim adan \ 18 | --wd 0.02 \ 19 | --lr 0.0015 \ 20 | --opt-betas 0.9 0.9 0.999 \ 21 | --clip 0.25 \ 22 | --lr_min 1e-6 \ 23 | --warmup_step 5000 \ 24 | --max_step 200000 \ 25 | --tgt_len 150 \ 26 | --mem_len 150 \ 27 | --eval_tgt_len 150 \ 28 | --batch_size 60 \ 29 | --multi_gpu \ 30 | --gpu0_bsz 4 \ 31 | ${@:2} 32 | elif [[ $1 == 'eval' ]]; then 33 | echo 'Run evaluation...' 
34 |     python eval.py \
35 |         --cuda \
36 |         --data /root/autodl-tmp/data/wikitext-103/ \
37 |         --dataset wt103 \
38 |         --tgt_len 64 \
39 |         --mem_len 640 \
40 |         --clamp_len 400 \
41 |         --same_length \
42 |         --split test \
43 |         ${@:2}
44 | else
45 |     echo 'unknown argument 1'
46 | fi
47 | 
-------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/adaptive_softmax.py: --------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | import numpy as np
4 | 
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | 
9 | class AdaptiveLogSoftmax(nn.Module):
10 |     def __init__(self, in_features, n_classes, cutoffs, keep_order=False):
11 |         super(AdaptiveLogSoftmax, self).__init__()
12 | 
13 |         cutoffs = list(cutoffs)
14 | 
15 |         if (cutoffs != sorted(cutoffs)) \
16 |                 or (min(cutoffs) <= 0) \
17 |                 or (max(cutoffs) >= (n_classes - 1)) \
18 |                 or (len(set(cutoffs)) != len(cutoffs)) \
19 |                 or any([int(c) != c for c in cutoffs]):
20 | 
21 |             raise ValueError("cutoffs should be a sequence of unique, positive "
22 |                              "integers sorted in an increasing order, where "
23 |                              "each value is between 1 and n_classes-1")
24 | 
25 |         self.in_features = in_features
26 |         self.n_classes = n_classes
27 |         self.cutoffs = cutoffs + [n_classes]
28 | 
29 |         self.shortlist_size = self.cutoffs[0]
30 |         self.n_clusters = len(self.cutoffs) - 1
31 |         self.head_size = self.shortlist_size + self.n_clusters
32 | 
33 |         self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.in_features))
34 |         self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
35 | 
36 |         self.keep_order = keep_order
37 | 
38 | 
39 |     def forward(self, hidden, target, weight, bias, keep_order=False):
40 |         if hidden.size(0) != target.size(0):
41 |             raise RuntimeError('Input and target should have the same size '
42 |                                'in the batch dimension.')
43 | 
44 |         head_weight = torch.cat(
45 |             [weight[:self.shortlist_size], self.cluster_weight], dim=0)
46 |         head_bias = torch.cat(
47 |             [bias[:self.shortlist_size], self.cluster_bias], dim=0)
48 | 
49 |         head_logit = F.linear(hidden, head_weight, bias=head_bias)
50 |         head_logprob = F.log_softmax(head_logit, dim=1)
51 | 
52 |         nll = torch.zeros_like(target,
53 |                                dtype=hidden.dtype, device=hidden.device)
54 | 
55 |         offset = 0
56 |         cutoff_values = [0] + self.cutoffs
57 |         for i in range(len(cutoff_values) - 1):
58 |             l_idx, h_idx = cutoff_values[i], cutoff_values[i + 1]
59 | 
60 |             mask_i = (target >= l_idx) & (target < h_idx)
61 |             indices_i = mask_i.nonzero().squeeze()
62 | 
63 |             if indices_i.numel() == 0:
64 |                 continue
65 | 
66 |             target_i = target.index_select(0, indices_i) - l_idx
67 |             head_logprob_i = head_logprob.index_select(0, indices_i)
68 | 
69 |             if i == 0:
70 |                 logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1)
71 |             else:
72 |                 weight_i = weight[l_idx:h_idx]
73 |                 bias_i = bias[l_idx:h_idx]
74 | 
75 |                 hidden_i = hidden.index_select(0, indices_i)
76 | 
77 |                 tail_logit_i = F.linear(hidden_i, weight_i, bias=bias_i)
78 |                 tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
79 | 
80 |                 logprob_i = head_logprob_i[:, -i] \
81 |                     + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1)
82 | 
83 |             if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
84 |                 nll.index_copy_(0, indices_i, -logprob_i)
85 |             else:
86 |                 nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
87 | 
88 |             offset += logprob_i.size(0)
89 | 
90 |         return nll
91 | 
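A minimal usage sketch for `AdaptiveLogSoftmax` above (hypothetical sizes, not part of the repo): the module owns only the cluster head, so the output embedding `weight`/`bias` are passed in at call time.

```python
import torch
import torch.nn as nn

# assumes the AdaptiveLogSoftmax class above is in scope;
# toy vocabulary of 10 tokens, cutoffs=[5] puts tokens 0-4 in the
# shortlist and tokens 5-9 in one tail cluster
crit = AdaptiveLogSoftmax(in_features=8, n_classes=10, cutoffs=[5])
weight = nn.Parameter(torch.randn(10, 8))   # tied output embedding matrix
bias = nn.Parameter(torch.zeros(10))

hidden = torch.randn(4, 8)                  # [batch, in_features]
target = torch.tensor([0, 3, 7, 9])         # two shortlist and two tail tokens
nll = crit(hidden, target, weight, bias)    # per-token negative log-likelihood
print(nll.shape)                            # torch.Size([4])
```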
-------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/data_parallel.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.nn.parallel import DataParallel 3 | import torch 4 | from torch.nn.parallel._functions import Scatter 5 | from torch.nn.parallel.parallel_apply import parallel_apply 6 | 7 | def scatter(inputs, target_gpus, chunk_sizes, dim=0): 8 | r""" 9 | Slices tensors into approximately equal chunks and 10 | distributes them across given GPUs. Duplicates 11 | references to objects that are not tensors. 12 | """ 13 | def scatter_map(obj): 14 | if isinstance(obj, torch.Tensor): 15 | try: 16 | return Scatter.apply(target_gpus, chunk_sizes, dim, obj) 17 | except: 18 | print('obj', obj.size()) 19 | print('dim', dim) 20 | print('chunk_sizes', chunk_sizes) 21 | quit() 22 | if isinstance(obj, tuple) and len(obj) > 0: 23 | return list(zip(*map(scatter_map, obj))) 24 | if isinstance(obj, list) and len(obj) > 0: 25 | return list(map(list, zip(*map(scatter_map, obj)))) 26 | if isinstance(obj, dict) and len(obj) > 0: 27 | return list(map(type(obj), zip(*map(scatter_map, obj.items())))) 28 | return [obj for targets in target_gpus] 29 | 30 | # After scatter_map is called, a scatter_map cell will exist. This cell 31 | # has a reference to the actual function scatter_map, which has references 32 | # to a closure that has a reference to the scatter_map cell (because the 33 | # fn is recursive). To avoid this reference cycle, we set the function to 34 | # None, clearing the cell 35 | try: 36 | return scatter_map(inputs) 37 | finally: 38 | scatter_map = None 39 | 40 | def scatter_kwargs(inputs, kwargs, target_gpus, chunk_sizes, dim=0): 41 | r"""Scatter with support for kwargs dictionary""" 42 | inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else [] 43 | kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else [] 44 | if len(inputs) < len(kwargs): 45 | inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) 46 | elif len(kwargs) < len(inputs): 47 | kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) 48 | inputs = tuple(inputs) 49 | kwargs = tuple(kwargs) 50 | return inputs, kwargs 51 | 52 | class BalancedDataParallel(DataParallel): 53 | def __init__(self, gpu0_bsz, *args, **kwargs): 54 | self.gpu0_bsz = gpu0_bsz 55 | super().__init__(*args, **kwargs) 56 | 57 | def forward(self, *inputs, **kwargs): 58 | if not self.device_ids: 59 | return self.module(*inputs, **kwargs) 60 | if self.gpu0_bsz == 0: 61 | device_ids = self.device_ids[1:] 62 | else: 63 | device_ids = self.device_ids 64 | inputs, kwargs = self.scatter(inputs, kwargs, device_ids) 65 | if len(self.device_ids) == 1: 66 | return self.module(*inputs[0], **kwargs[0]) 67 | replicas = self.replicate(self.module, self.device_ids) 68 | if self.gpu0_bsz == 0: 69 | replicas = replicas[1:] 70 | outputs = self.parallel_apply(replicas, device_ids, inputs, kwargs) 71 | return self.gather(outputs, self.output_device) 72 | 73 | def parallel_apply(self, replicas, device_ids, inputs, kwargs): 74 | return parallel_apply(replicas, inputs, kwargs, device_ids) 75 | 76 | def scatter(self, inputs, kwargs, device_ids): 77 | bsz = inputs[0].size(self.dim) 78 | num_dev = len(self.device_ids) 79 | gpu0_bsz = self.gpu0_bsz 80 | bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1) 81 | if gpu0_bsz < bsz_unit: 82 | chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1) 83 | delta = bsz - sum(chunk_sizes) 84 | for i in range(delta): 85 
| chunk_sizes[i + 1] += 1 86 | if gpu0_bsz == 0: 87 | chunk_sizes = chunk_sizes[1:] 88 | else: 89 | return super().scatter(inputs, kwargs, device_ids) 90 | return scatter_kwargs(inputs, kwargs, device_ids, chunk_sizes, dim=self.dim) 91 | 92 | -------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/exp_utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os, shutil 3 | 4 | import numpy as np 5 | 6 | import torch 7 | 8 | 9 | def logging(s, log_path, print_=True, log_=True): 10 | if print_: 11 | print(s) 12 | if log_: 13 | with open(log_path, 'a+') as f_log: 14 | f_log.write(s + '\n') 15 | 16 | def get_logger(log_path, **kwargs): 17 | return functools.partial(logging, log_path=log_path, **kwargs) 18 | 19 | def create_exp_dir(dir_path, scripts_to_save=None, debug=False): 20 | if debug: 21 | print('Debug Mode : no experiment dir created') 22 | return functools.partial(logging, log_path=None, log_=False) 23 | 24 | if not os.path.exists(dir_path): 25 | os.makedirs(dir_path) 26 | 27 | print('Experiment dir : {}'.format(dir_path)) 28 | if scripts_to_save is not None: 29 | script_path = os.path.join(dir_path, 'scripts') 30 | if not os.path.exists(script_path): 31 | os.makedirs(script_path) 32 | for script in scripts_to_save: 33 | dst_file = os.path.join(dir_path, 'scripts', os.path.basename(script)) 34 | shutil.copyfile(script, dst_file) 35 | 36 | return get_logger(log_path=os.path.join(dir_path, 'log.txt')) 37 | 38 | def save_checkpoint(model, optimizer, path, epoch): 39 | torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch))) 40 | torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch))) 41 | -------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/log_uniform_sampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import numpy as np 4 | 5 | class LogUniformSampler(object): 6 | def __init__(self, range_max, n_sample): 7 | """ 8 | Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py 9 | `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` 10 | 11 | expected count can be approximated by 1 - (1 - p)^n 12 | and we use a numerically stable version -expm1(num_tries * log1p(-p)) 13 | 14 | Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run 15 | """ 16 | with torch.no_grad(): 17 | self.range_max = range_max 18 | log_indices = torch.arange(1., range_max+2., 1.).log_() 19 | self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] 20 | # print('P', self.dist.numpy().tolist()[-30:]) 21 | 22 | self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() 23 | 24 | self.n_sample = n_sample 25 | 26 | def sample(self, labels): 27 | """ 28 | labels: [b1, b2] 29 | Return 30 | true_log_probs: [b1, b2] 31 | samp_log_probs: [n_sample] 32 | neg_samples: [n_sample] 33 | """ 34 | 35 | # neg_samples = torch.empty(0).long() 36 | n_sample = self.n_sample 37 | n_tries = 2 * n_sample 38 | 39 | with torch.no_grad(): 40 | neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique() 41 | device = labels.device 42 | neg_samples = neg_samples.to(device) 43 | true_log_probs = self.log_q[labels].to(device) 44 | samp_log_probs = self.log_q[neg_samples].to(device) 45 | 
return true_log_probs, samp_log_probs, neg_samples 46 | 47 | def sample_logits(embedding, bias, labels, inputs, sampler): 48 | """ 49 | embedding: an nn.Embedding layer 50 | bias: [n_vocab] 51 | labels: [b1, b2] 52 | inputs: [b1, b2, n_emb] 53 | sampler: you may use a LogUniformSampler 54 | Return 55 | logits: [b1, b2, 1 + n_sample] 56 | """ 57 | true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) 58 | n_sample = neg_samples.size(0) 59 | b1, b2 = labels.size(0), labels.size(1) 60 | all_ids = torch.cat([labels.view(-1), neg_samples]) 61 | all_w = embedding(all_ids) 62 | true_w = all_w[: -n_sample].view(b1, b2, -1) 63 | sample_w = all_w[- n_sample:].view(n_sample, -1) 64 | 65 | all_b = bias[all_ids] 66 | true_b = all_b[: -n_sample].view(b1, b2) 67 | sample_b = all_b[- n_sample:] 68 | 69 | hit = (labels[:, :, None] == neg_samples).detach() 70 | 71 | true_logits = torch.einsum('ijk,ijk->ij', 72 | [true_w, inputs]) + true_b - true_log_probs 73 | sample_logits = torch.einsum('lk,ijk->ijl', 74 | [sample_w, inputs]) + sample_b - samp_log_probs 75 | sample_logits.masked_fill_(hit, -1e30) 76 | logits = torch.cat([true_logits[:, :, None], sample_logits], -1) 77 | 78 | return logits 79 | 80 | 81 | # class LogUniformSampler(object): 82 | # def __init__(self, range_max, unique=False): 83 | # """ 84 | # Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py 85 | # `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` 86 | # """ 87 | # self.range_max = range_max 88 | # log_indices = torch.arange(1., range_max+2., 1.).log_() 89 | # self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] 90 | 91 | # self.unique = unique 92 | 93 | # if self.unique: 94 | # self.exclude_mask = torch.ByteTensor(range_max).fill_(0) 95 | 96 | # def sample(self, n_sample, labels): 97 | # pos_sample, new_labels = labels.unique(return_inverse=True) 98 | # n_pos_sample = pos_sample.size(0) 99 | # n_neg_sample = n_sample - n_pos_sample 100 | 101 | # if self.unique: 102 | # self.exclude_mask.index_fill_(0, pos_sample, 1) 103 | # sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0) 104 | # self.exclude_mask.index_fill_(0, pos_sample, 0) 105 | # else: 106 | # sample_dist = self.dist 107 | 108 | # neg_sample = torch.multinomial(sample_dist, n_neg_sample) 109 | 110 | # sample = torch.cat([pos_sample, neg_sample]) 111 | # sample_prob = self.dist[sample] 112 | 113 | # return new_labels, sample, sample_prob 114 | 115 | 116 | if __name__ == '__main__': 117 | S, B = 3, 4 118 | n_vocab = 10000 119 | n_sample = 5 120 | H = 32 121 | 122 | labels = torch.LongTensor(S, B).random_(0, n_vocab) 123 | 124 | # sampler = LogUniformSampler(n_vocab, unique=False) 125 | # new_labels, sample, sample_prob = sampler.sample(n_sample, labels) 126 | 127 | sampler = LogUniformSampler(n_vocab, unique=True) 128 | # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels) 129 | 130 | # print('true_probs', true_probs.numpy().tolist()) 131 | # print('samp_probs', samp_probs.numpy().tolist()) 132 | # print('neg_samples', neg_samples.numpy().tolist()) 133 | 134 | # print('sum', torch.sum(sampler.dist).item()) 135 | 136 | # assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item() 137 | 138 | embedding = nn.Embedding(n_vocab, H) 139 | bias = torch.zeros(n_vocab) 140 | inputs = torch.Tensor(S, B, H).normal_() 141 | 142 | logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample) 143 | 
143 | print('logits', logits.detach().numpy().tolist())
144 | print('logits shape', logits.size())
145 | # sample_logits returns a single tensor: the true-token scores sit in
146 | # logits[:, :, 0], so there is no separate out_labels tensor to print.
147 | 
148 | 
-------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/proj_adaptive_softmax.py: --------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | import numpy as np
4 | 
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | 
9 | CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) if torch.version.cuda else 0
10 | CUDA_MINOR = int(torch.version.cuda.split('.')[1]) if torch.version.cuda else 0
11 | 
12 | class ProjectedAdaptiveLogSoftmax(nn.Module):
13 | def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
14 | keep_order=False):
15 | super(ProjectedAdaptiveLogSoftmax, self).__init__()
16 | 
17 | self.n_token = n_token
18 | self.d_embed = d_embed
19 | self.d_proj = d_proj
20 | 
21 | self.cutoffs = cutoffs + [n_token]
22 | self.cutoff_ends = [0] + self.cutoffs
23 | self.div_val = div_val
24 | 
25 | self.shortlist_size = self.cutoffs[0]
26 | self.n_clusters = len(self.cutoffs) - 1
27 | self.head_size = self.shortlist_size + self.n_clusters
28 | 
29 | if self.n_clusters > 0:
30 | self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
31 | self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
32 | 
33 | self.out_layers = nn.ModuleList()
34 | self.out_projs = nn.ParameterList()
35 | 
36 | if div_val == 1:
37 | for i in range(len(self.cutoffs)):
38 | if d_proj != d_embed:
39 | self.out_projs.append(
40 | nn.Parameter(torch.Tensor(d_proj, d_embed))
41 | )
42 | else:
43 | self.out_projs.append(None)
44 | 
45 | self.out_layers.append(nn.Linear(d_embed, n_token))
46 | else:
47 | for i in range(len(self.cutoffs)):
48 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
49 | d_emb_i = d_embed // (div_val ** i)
50 | 
51 | self.out_projs.append(
52 | nn.Parameter(torch.Tensor(d_proj, d_emb_i))
53 | )
54 | 
55 | self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))
56 | 
57 | self.keep_order = keep_order
58 | 
59 | def _compute_logit(self, hidden, weight, bias, proj):
60 | if proj is None:
61 | logit = F.linear(hidden, weight, bias=bias)
62 | else:
63 | # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
64 | proj_hid = F.linear(hidden, proj.t().contiguous())
65 | logit = F.linear(proj_hid, weight, bias=bias)
66 | # else:
67 | # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
68 | # if bias is not None:
69 | # logit = logit + bias
70 | 
71 | return logit
72 | 
73 | def forward(self, hidden, target, keep_order=False):
74 | '''
75 | hidden :: [len*bsz x d_proj]
76 | target :: [len*bsz]
77 | '''
78 | 
79 | if hidden.size(0) != target.size(0):
80 | raise RuntimeError('Input and target should have the same size '
81 | 'in the batch dimension.')
82 | 
83 | if self.n_clusters == 0:
84 | logit = self._compute_logit(hidden, self.out_layers[0].weight,
85 | self.out_layers[0].bias, self.out_projs[0])
86 | nll = -F.log_softmax(logit, dim=-1) \
87 | .gather(1, target.unsqueeze(1)).squeeze(1)
88 | else:
89 | # construct weights and biases
90 | weights, biases = [], []
91 | for i in range(len(self.cutoffs)):
92 | if self.div_val == 1:
93 | l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
94 | weight_i = self.out_layers[0].weight[l_idx:r_idx]
95 | bias_i = self.out_layers[0].bias[l_idx:r_idx]
96 | else:
97 | weight_i = self.out_layers[i].weight
98 | bias_i = self.out_layers[i].bias
99 | 
100 | if i == 0:
101 | weight_i = torch.cat(
102 | [weight_i, self.cluster_weight], dim=0)
103 | bias_i = torch.cat(
104 | [bias_i, self.cluster_bias], dim=0)
105 | 
106 | weights.append(weight_i)
107 | biases.append(bias_i)
108 | 
109 | head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
110 | 
111 | head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
112 | head_logprob = F.log_softmax(head_logit, dim=1)
113 | 
114 | nll = torch.zeros_like(target,
115 | dtype=hidden.dtype, device=hidden.device)
116 | 
117 | offset = 0
118 | cutoff_values = [0] + self.cutoffs
119 | for i in range(len(cutoff_values) - 1):
120 | l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
121 | 
122 | mask_i = (target >= l_idx) & (target < r_idx)
123 | indices_i = mask_i.nonzero().squeeze()
124 | 
125 | if indices_i.numel() == 0:
126 | continue
127 | 
128 | target_i = target.index_select(0, indices_i) - l_idx
129 | head_logprob_i = head_logprob.index_select(0, indices_i)
130 | 
131 | if i == 0:
132 | logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1)
133 | else:
134 | weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
135 | 
136 | hidden_i = hidden.index_select(0, indices_i)
137 | 
138 | tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
139 | tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
140 | # NOTE: cluster scores occupy the tail of the head softmax; -i indexes them in reversed order (kept as in the original implementation)
141 | logprob_i = head_logprob_i[:, -i] \
142 | + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1)
143 | 
144 | if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
145 | nll.index_copy_(0, indices_i, -logprob_i)
146 | else:
147 | nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
148 | 
149 | offset += logprob_i.size(0)
150 | 
151 | return nll
152 | 
-------------------------------------------------------------------------------- /NLP/Transformer-XL/utils/vocabulary.py: --------------------------------------------------------------------------------
1 | import os
2 | from collections import Counter, OrderedDict
3 | 
4 | import torch
5 | 
6 | class Vocab(object):
7 | def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True,
8 | delimiter=None, vocab_file=None):
9 | self.counter = Counter()
10 | self.special = special
11 | self.min_freq = min_freq
12 | self.max_size = max_size
13 | self.lower_case = lower_case
14 | self.delimiter = delimiter
15 | self.vocab_file = vocab_file
16 | 
17 | def tokenize(self, line, add_eos=False, add_double_eos=False):
18 | line = line.strip()
19 | # convert to lower case
20 | if self.lower_case:
21 | line = line.lower()
22 | 
23 | # empty delimiter '' will evaluate False
24 | if self.delimiter == '':
25 | symbols = line
26 | else:
27 | symbols = line.split(self.delimiter)
28 | 
29 | if add_double_eos: # lm1b
30 | return ['<S>'] + symbols + ['<S>']
31 | elif add_eos:
32 | return symbols + ['<eos>']
33 | else:
34 | return symbols
35 | 
36 | def count_file(self, path, verbose=False, add_eos=False):
37 | if verbose: print('counting file {} ...'.format(path))
38 | assert os.path.exists(path)
39 | 
40 | sents = []
41 | with open(path, 'r', encoding='utf-8') as f:
42 | for idx, line in enumerate(f):
43 | if verbose and idx > 0 and idx % 500000 == 0:
44 | print(' line {}'.format(idx))
45 | symbols = self.tokenize(line, add_eos=add_eos)
46 | self.counter.update(symbols)
47 | sents.append(symbols)
48 | 
49 | return sents
50 | 
51 | def count_sents(self, sents, verbose=False):
52 | """
53 | sents : a list of sentences, each a list of tokenized symbols
54 | """
55 | if verbose: print('counting {} sents ...'.format(len(sents)))
56 | for idx, symbols in enumerate(sents):
57 | if verbose and idx > 0 and idx % 500000 == 0:
58 | print(' line {}'.format(idx))
59 | self.counter.update(symbols)
60 | 
61 | def _build_from_file(self, vocab_file):
62 | self.idx2sym = []
63 | self.sym2idx = OrderedDict()
64 | 
65 | with open(vocab_file, 'r', encoding='utf-8') as f:
66 | for line in f:
67 | symb = line.strip().split()[0]
68 | self.add_symbol(symb)
69 | self.unk_idx = self.sym2idx['<UNK>']
70 | 
71 | def build_vocab(self):
72 | if self.vocab_file:
73 | print('building vocab from {}'.format(self.vocab_file))
74 | self._build_from_file(self.vocab_file)
75 | print('final vocab size {}'.format(len(self)))
76 | else:
77 | print('building vocab with min_freq={}, max_size={}'.format(
78 | self.min_freq, self.max_size))
79 | self.idx2sym = []
80 | self.sym2idx = OrderedDict()
81 | 
82 | for sym in self.special:
83 | self.add_special(sym)
84 | 
85 | for sym, cnt in self.counter.most_common(self.max_size):
86 | if cnt < self.min_freq: break
87 | self.add_symbol(sym)
88 | 
89 | print('final vocab size {} from {} unique tokens'.format(
90 | len(self), len(self.counter)))
91 | 
92 | def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
93 | add_double_eos=False):
94 | if verbose: print('encoding file {} ...'.format(path))
95 | assert os.path.exists(path)
96 | encoded = []
97 | with open(path, 'r', encoding='utf-8') as f:
98 | for idx, line in enumerate(f):
99 | if verbose and idx > 0 and idx % 500000 == 0:
100 | print(' line {}'.format(idx))
101 | symbols = self.tokenize(line, add_eos=add_eos,
102 | add_double_eos=add_double_eos)
103 | encoded.append(self.convert_to_tensor(symbols))
104 | 
105 | if ordered:
106 | encoded = torch.cat(encoded)
107 | 
108 | return encoded
109 | 
110 | def encode_sents(self, sents, ordered=False, verbose=False):
111 | if verbose: print('encoding {} sents ...'.format(len(sents)))
112 | encoded = []
113 | for idx, symbols in enumerate(sents):
114 | if verbose and idx > 0 and idx % 500000 == 0:
115 | print(' line {}'.format(idx))
116 | encoded.append(self.convert_to_tensor(symbols))
117 | 
118 | if ordered:
119 | encoded = torch.cat(encoded)
120 | 
121 | return encoded
122 | 
123 | def add_special(self, sym):
124 | if sym not in self.sym2idx:
125 | self.idx2sym.append(sym)
126 | self.sym2idx[sym] = len(self.idx2sym) - 1
127 | setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
128 | 
129 | def add_symbol(self, sym):
130 | if sym not in self.sym2idx:
131 | self.idx2sym.append(sym)
132 | self.sym2idx[sym] = len(self.idx2sym) - 1
133 | 
134 | def get_sym(self, idx):
135 | assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
136 | return self.idx2sym[idx]
137 | 
138 | def get_idx(self, sym):
139 | if sym in self.sym2idx:
140 | return self.sym2idx[sym]
141 | else:
142 | # print('encounter unk {}'.format(sym))
143 | assert '<eos>' not in sym
144 | assert hasattr(self, 'unk_idx')
145 | return self.sym2idx.get(sym, self.unk_idx)
146 | 
147 | def get_symbols(self, indices):
148 | return [self.get_sym(idx) for idx in indices]
149 | 
150 | def get_indices(self, symbols):
151 | return [self.get_idx(sym) for sym in symbols]
152 | 
153 | def convert_to_tensor(self, symbols):
154 | return torch.LongTensor(self.get_indices(symbols))
155 | 
156 | def convert_to_sent(self, indices, exclude=None):
157 | if exclude is None:
158 | return ' '.join([self.get_sym(idx) for idx in indices])
159 | else:
160 | return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
161 | 
162 | def __len__(self):
163 | return len(self.idx2sym)
164 | 
-------------------------------------------------------------------------------- /dreamfusion/README.md: --------------------------------------------------------------------------------
1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
2 | 
3 | We show the results of the text-to-3D task supported by the [DreamFusion Project](https://github.com/ashawkey/stable-dreamfusion).
4 | 
5 | ## Usage of Adan for DreamFusion
6 | 
7 | Adan is the default optimizer for the [DreamFusion Project](https://github.com/ashawkey/stable-dreamfusion); please refer to its repo to run these experiments.
8 | 
9 | The project invokes Adan as follows:
10 | 
11 | ```
12 | optimizer = lambda model: Adan(model.get_params(5 * opt.lr), eps=1e-8, weight_decay=2e-5, max_grad_norm=5.0, foreach=False)
13 | ```
14 | 
15 | The learning rate `opt.lr` and the maximal gradient norm `max_grad_norm` can be tuned to refine the results for certain text prompts.
16 | 
17 | ## Training and Evaluation
18 | 
19 | - #### Training
20 | 
21 | `python main.py --text $PROMPT --workspace $SAVE_PATH -O`
22 | 
23 | - #### Evaluation
24 | 
25 | `python main.py --workspace $SAVE_PATH -O --test`
26 | 
27 | ## Results
28 | 
29 | **prompt:** `a DSLR photo of the leaning tower of Pisa, aerial view`. Adan's model shows more refined details.
30 | 
31 | https://user-images.githubusercontent.com/10042844/211014605-3860b816-cc1c-4367-b96e-406cd375240a.mp4
32 | 
33 | https://user-images.githubusercontent.com/10042844/211014603-82564238-cf5b-4ffa-b7a3-175bd565e5ce.mp4
34 | 
35 | **prompt:** `Sydney opera house, aerial view`. Adan provides better details.
36 | 
37 | https://user-images.githubusercontent.com/10042844/211014601-da430196-021d-4f6b-962b-8441feff5d02.mp4
38 | 
39 | https://user-images.githubusercontent.com/10042844/211014594-3b5c05e3-9018-4a39-b5db-d6f2fc111cce.mp4
40 | 
41 | **prompt:** `the Statue of Liberty, aerial view`. Adan yields a cleaner reconstruction for this prompt.
42 | 
43 | https://user-images.githubusercontent.com/10042844/211014579-4db62a55-fd05-4616-9793-5af5fea81c62.mp4
44 | 
45 | https://user-images.githubusercontent.com/10042844/211014575-db8b9b1b-7e81-4a27-ba36-2ef74c00f0bc.mp4
46 | 
47 | **prompt:** `the Imperial State Crown of England`
48 | 
49 | https://user-images.githubusercontent.com/10042844/211014561-7a943df3-ed8f-4c1a-b51f-8ca5bccf1819.mp4
50 | 
51 | https://user-images.githubusercontent.com/10042844/211014554-b7f696dd-8635-4d75-81c3-218dd0231c76.mp4
52 | 
53 | **prompt:** `a candelabra with many candles`. Adam's model leaves some candles suspended in the air, while Adan's result is cleaner.
54 | 
55 | https://user-images.githubusercontent.com/10042844/211014542-47f19116-9fb9-4e65-ad08-522d1c97ba11.mp4
56 | 
57 | https://user-images.githubusercontent.com/10042844/211014532-6dec1554-c552-4fc5-92c4-cf9954d844cb.mp4
58 | 
59 | **prompt:** `an extravagant mansion, aerial view`. Adan's result is more coherent.
60 | 
61 | https://user-images.githubusercontent.com/10042844/211014591-82d6e57e-bc9f-4b38-8d23-9b156a35334c.mp4
62 | 
63 | https://user-images.githubusercontent.com/10042844/211014584-aa038ea9-58ae-422f-a128-e885d7d7ab08.mp4
64 | 
65 | **prompt:** `Neuschwanstein Castle, aerial view`
66 | 
67 | https://user-images.githubusercontent.com/10042844/211014548-160c7416-d74f-48aa-b3dc-bfd55e809b62.mp4
68 | 
69 | https://user-images.githubusercontent.com/10042844/211014545-2515b2be-bff8-4e7c-9718-0ee0210c98e9.mp4
70 | 
71 | **prompt:** `a delicious hamburger`
72 | 
73 | https://user-images.githubusercontent.com/10042844/211014566-ae9c6f72-2bbf-4e4b-8f15-27851464a620.mp4
74 | 
75 | https://user-images.githubusercontent.com/10042844/211014571-af207d24-1119-4b34-a31d-5250046cc426.mp4
76 | 
77 | **prompt:** `a palm tree, low poly 3d model`. Adan's model renders the shadowed side better.
78 | 
79 | https://user-images.githubusercontent.com/10042844/211014613-6373253d-7a37-4b66-ac1b-d04bb7819c01.mp4
80 | 
81 | https://user-images.githubusercontent.com/10042844/211014610-67817157-fe9e-4ace-a188-e84d88bf0f66.mp4
-------------------------------------------------------------------------------- /fused_adan/README.md: --------------------------------------------------------------------------------
1 | # Adan Optimizer fused kernel
2 | 
3 | ## Dependencies
4 | 
5 | 1. Libtorch/PyTorch (ATen is required; compilation tested with PyTorch 1.13.1)
6 | 2. CUDA Toolkit (compilation tested with CUDA 11.6+)
7 | 3. ninja
8 | 
9 | ## Usage
10 | 
11 | Using `Adan(..., foreach=False, fused=True)` enables the fused Adan kernel with single-tensor access.
12 | Using `Adan(..., foreach=True, fused=True)` enables the fused Adan kernel with multi-tensor access.
13 | 
14 | `foreach=True` is recommended for better performance.
15 | 
16 | **Single-tensor access**
17 | A *for loop* traverses the layers when applying the update to each layer's parameters, so every step requires multiple kernel launches. In theory, accessing only one layer's parameters at a time helps reduce peak memory usage, but it introduces kernel-launch overhead.
18 | 
19 | **Multi-tensor access**
20 | The parameters of all layers are passed into the kernel at once, and the kernel internally uses a for loop to traverse each layer, so only one kernel launch is required. In theory this increases peak memory usage but reduces the kernel-launch overhead. In actual tests, the increase in memory usage is not significant, while the kernel-launch overhead drops noticeably.
21 | 
22 | ## Benchmarking Results
23 | 
24 | We benchmark the peak memory and wall duration of two optimizers: Adam vs. FusedAdan. The benchmark uses GPT-2 with different numbers of heads, layers, and Emb. Dim on a single NVIDIA A100 GPU (40G). A minimal timing sketch is given after the Conclusion below.
25 | 
26 | The benchmark is conducted with the following config:
27 | 
28 | - vocab size: 49280
29 | - batch size: 1
30 | - sequence length: 2048
31 | 
32 | #### Memory Comparison
33 | 
34 | | Head | Layers | Emb.
Dim | Model Size (MB) | Adam Peak (MB) | FusedAdan Peak (MB) | Δ (%) | 35 | | :--: | :----: | :------: | :-------------: | :------------: | :-----------------: | :---: | 36 | | 6 | 6 | 768 | 81 | 4490 | 4490 | 0.00 | 37 | | 12 | 6 | 768 | 81 | 5848 | 5848 | 0.00 | 38 | | 16 | 6 | 768 | 81 | 6775 | 6775 | 0.00 | 39 | | 6 | 12 | 768 | 124 | 7151 | 7153 | 0.03 | 40 | | 12 | 12 | 768 | 124 | 9869 | 9871 | 0.02 | 41 | | 16 | 12 | 768 | 124 | 11733 | 11735 | 0.02 | 42 | | 16 | 6 | 1024 | 128 | 7302 | 7302 | 0.00 | 43 | | 16 | 12 | 1024 | 203 | 12719 | 12719 | 0.00 | 44 | | 6 | 24 | 768 | 209 | 12471 | 12473 | 0.02 | 45 | | 12 | 24 | 768 | 209 | 17907 | 17909 | 0.01 | 46 | | 16 | 24 | 768 | 209 | 21596 | 21598 | 0.01 | 47 | | 6 | 6 | 1536 | 248 | 6880 | 7308 | 6.22 | 48 | | 12 | 6 | 1536 | 248 | 8235 | 8235 | 0.00 | 49 | | 16 | 6 | 1536 | 248 | 9141 | 9141 | 0.00 | 50 | | 16 | 24 | 1024 | 354 | 23530 | 23532 | 0.01 | 51 | | 16 | 6 | 2048 | 407 | 11098 | 11098 | 0.00 | 52 | | 6 | 12 | 1536 | 418 | 11137 | 12213 | 9.66 | 53 | | 12 | 12 | 1536 | 418 | 13855 | 13857 | 0.01 | 54 | | 16 | 12 | 1536 | 418 | 15667 | 15669 | 0.01 | 55 | | 16 | 6 | 2560 | 603 | 13967 | 15965 | 14.30 | 56 | | 16 | 12 | 2048 | 709 | 18851 | 18853 | 0.01 | 57 | | 6 | 24 | 1536 | 758 | 19660 | 21997 | 11.88 | 58 | | 12 | 24 | 1536 | 758 | 25096 | 25100 | 0.02 | 59 | | 16 | 24 | 1536 | 758 | 28720 | 28724 | 0.01 | 60 | | 16 | 24 | 2048 | 1313 | 34357 | 34363 | 0.02 | 61 | 62 | #### Time Comparison 63 | 64 | The duration time is the total time of 200 `optimizer.step()`. 65 | 66 | | Head | Layers | Emb. Dim | Model Size (MB) | Adam Time (ms) | FusedAdan Time (ms) | FusedAdan/Adam (%) | 67 | | :--: | :----: | :------: | :-------------: | :------------: | :-----------------: | :----------------: | 68 | | 6 | 6 | 768 | 81 | 5.40 | 4.07 | 81.6 | 69 | | 12 | 6 | 768 | 81 | 5.41 | 4.16 | 76.9 | 70 | | 16 | 6 | 768 | 81 | 5.41 | 4.11 | 76.0 | 71 | | 6 | 12 | 768 | 124 | 8.47 | 6.25 | 73.8 | 72 | | 12 | 12 | 768 | 124 | 8.46 | 6.18 | 73.0 | 73 | | 16 | 12 | 768 | 124 | 8.48 | 6.20 | 73.1 | 74 | | 16 | 6 | 1024 | 128 | 7.57 | 6.28 | 83.0 | 75 | | 16 | 12 | 1024 | 203 | 12.10 | 10.25 | 84.7 | 76 | | 6 | 24 | 768 | 209 | 16.40 | 10.56 | 64.4 | 77 | | 12 | 24 | 768 | 209 | 16.40 | 10.47 | 63.8 | 78 | | 16 | 24 | 768 | 209 | 16.35 | 10.56 | 64.6 | 79 | | 6 | 6 | 1536 | 248 | 15.92 | 12.29 | 77.2 | 80 | | 12 | 6 | 1536 | 248 | 15.94 | 12.35 | 77.5 | 81 | | 16 | 6 | 1536 | 248 | 15.94 | 12.36 | 77.5 | 82 | | 16 | 24 | 1024 | 354 | 21.05 | 17.51 | 83.2 | 83 | | 16 | 6 | 2048 | 407 | 25.05 | 19.84 | 79.2 | 84 | | 6 | 12 | 1536 | 418 | 27.24 | 20.58 | 75.6 | 85 | | 12 | 12 | 1536 | 418 | 27.25 | 20.54 | 75.4 | 86 | | 16 | 12 | 1536 | 418 | 27.25 | 20.46 | 75.1 | 87 | | 16 | 6 | 2560 | 603 | 36.86 | 29.55 | 80.1 | 88 | | 16 | 12 | 2048 | 709 | 44.00 | 34.89 | 79.3 | 89 | | 6 | 24 | 1536 | 758 | 49.87 | 37.52 | 75.2 | 90 | | 12 | 24 | 1536 | 758 | 49.87 | 37.42 | 75.0 | 91 | | 16 | 24 | 1536 | 758 | 49.92 | 37.56 | 75.2 | 92 | | 16 | 24 | 2048 | 1313 | 81.81 | 64.48 | 77.9 | 93 | 94 | ## Conclusion 95 | 96 | - The extra memory consumption does not increase linearly with the model's size. 97 | 98 | - In most cases, FusedAdan has no additional memory footprint and the time consumption is only 80% of Adam's. 99 | 100 | - In the extreme case, FusedAdan's additional memory footprint does not exceed 15%. 
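As a reference for the timing protocol above, here is a minimal sketch, assuming the `adan` package was built with the fused kernel and a CUDA device is available; the stack of linear layers is only an illustrative stand-in for GPT-2:

```python
import torch
import torch.nn as nn
from adan import Adan

model = nn.Sequential(*[nn.Linear(1024, 1024) for _ in range(24)]).cuda()
optimizer = Adan(model.parameters(), lr=1e-3, fused=True, foreach=True)

# One forward/backward pass so that every parameter has a gradient.
model(torch.randn(8, 1024, device='cuda')).sum().backward()

torch.cuda.reset_peak_memory_stats()
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(200):  # same protocol as the tables: 200 optimizer steps
    optimizer.step()
end.record()
torch.cuda.synchronize()
print('wall time: {:.1f} ms'.format(start.elapsed_time(end)))
print('peak memory: {:.0f} MiB'.format(torch.cuda.max_memory_allocated() / 2**20))
```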
101 | 
-------------------------------------------------------------------------------- /fused_adan/include/fused_adan_kernel.cuh: --------------------------------------------------------------------------------
1 | /* Copyright 2021 The LightSeq Team
2 | Copyright NVIDIA/apex
3 | Copyright AlexwellChen
4 | This kernel is adapted from NVIDIA/apex and LightSeq Team
5 | */
6 | #include <ATen/ATen.h>
7 | #include <vector>
8 | 
9 | // CUDA forward declaration
10 | void fused_adan_cuda(
11 | at::Tensor& p, at::Tensor& p_copy, at::Tensor& g, at::Tensor& exp_avg,
12 | at::Tensor& exp_avg_sq, at::Tensor& exp_avg_diff,
13 | at::Tensor& neg_grad, float beta1, float beta2, float beta3,
14 | float bias_correction1, float bias_correction2, float bias_correction3_sqrt,
15 | float lr, float decay, float eps, bool no_prox, float clip_global_grad_norm);
16 | 
17 | void multi_tensor_adan_cuda(
18 | int chunk_size,
19 | at::Tensor noop_flag,
20 | std::vector<std::vector<at::Tensor>> tensor_lists,
21 | const float beta1,
22 | const float beta2,
23 | const float beta3,
24 | const float bias_correction1,
25 | const float bias_correction2,
26 | const float bias_correction3_sqrt,
27 | const float lr,
28 | const float decay,
29 | const float epsilon,
30 | const bool no_prox,
31 | const float clip_global_grad_norm);
-------------------------------------------------------------------------------- /fused_adan/include/multi_tensor_apply.cuh: --------------------------------------------------------------------------------
1 | /* Copyright 2021 The LightSeq Team
2 | Copyright NVIDIA/apex
3 | This file is adapted from NVIDIA/apex
4 | */
5 | #include <ATen/ATen.h>
6 | #include <ATen/AccumulateType.h>
7 | #include <ATen/cuda/CUDAContext.h>
8 | #include <ATen/cuda/Exceptions.h>
9 | #include <c10/cuda/CUDAGuard.h>
10 | #include <assert.h>
11 | 
12 | // #include <iostream>
13 | 
14 | // This header is the one-stop shop for all your multi-tensor apply needs.
15 | 
16 | // TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson)
17 | constexpr int depth_to_max_tensors[6] = {110, 64, 48, 36, 30, 24};
18 | constexpr int depth_to_max_blocks[6] = {320, 320, 320, 320, 320, 320};
19 | 
20 | #ifndef TORCH_CHECK
21 | #define TORCH_CHECK AT_CHECK
22 | #endif
23 | 
24 | #ifdef VERSION_GE_1_3
25 | #define DATA_PTR data_ptr
26 | #else
27 | #define DATA_PTR data
28 | #endif
29 | 
30 | template <int n>
31 | struct TensorListMetadata {
32 | void* addresses[n][depth_to_max_tensors[n - 1]];
33 | int sizes[depth_to_max_tensors[n - 1]];
34 | unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
35 | int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a
36 | // full int.
37 | int start_tensor_this_launch;
38 | };
39 | 
40 | template <typename T, typename U, typename... ArgTypes>
41 | __global__ void multi_tensor_apply_kernel(int chunk_size,
42 | volatile int* noop_flag, T tl,
43 | U callable, ArgTypes... args) {
44 | // Hand the chunk information to the user-supplied functor to process however
45 | // it likes.
46 | callable(chunk_size, noop_flag, tl, args...);
47 | }
48 | 
49 | template <int depth, typename T, typename... ArgTypes>
50 | void multi_tensor_apply(
51 | int block_size, int chunk_size, const at::Tensor& noop_flag,
52 | const std::vector<std::vector<at::Tensor>>& tensor_lists, T callable,
53 | ArgTypes... args) {
54 | TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
55 | int len0 = tensor_lists[0].size();
56 | TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
57 | auto ref_device = tensor_lists[0][0].device();
58 | TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
59 | for (int l = 0; l < tensor_lists.size();
60 | l++) // No range-based for because I need indices
61 | {
62 | TORCH_CHECK(tensor_lists[l].size() == len0,
63 | "Size mismatch among tensor lists");
64 | for (int t = 0; t < tensor_lists[l].size(); t++) {
65 | // TODO: Print which tensor fails.
66 | bool contiguous_memory = tensor_lists[l][t].is_contiguous();
67 | #ifdef VERSION_GE_1_5
68 | contiguous_memory =
69 | (contiguous_memory ||
70 | tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
71 | #endif
72 | TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
73 | TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
74 | "A tensor was not on the same device as the first tensor");
75 | TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(),
76 | "Size mismatch");
77 | }
78 | }
79 | 
80 | int ntensors = tensor_lists[0].size();
81 | 
82 | TensorListMetadata<depth> tl;
83 | 
84 | const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
85 | auto stream = at::cuda::getCurrentCUDAStream();
86 | 
87 | tl.start_tensor_this_launch = 0;
88 | int loc_block_info = 0;
89 | int loc_tensor_info = 0;
90 | for (int t = 0; t < ntensors; t++) {
91 | tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
92 | for (int d = 0; d < depth; d++)
93 | tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
94 | loc_tensor_info++;
95 | 
96 | int chunks_this_tensor =
97 | (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
98 | 
99 | for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
100 | // std::cout << chunks_this_tensor << std::endl;
101 | tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
102 | tl.block_to_chunk[loc_block_info] = chunk;
103 | loc_block_info++;
104 | 
105 | bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
106 | chunk == chunks_this_tensor - 1);
107 | bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
108 | bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
109 | if (tensors_full || blocks_full || last_chunk) {
110 | // using accscalar_t = acc_type<scalar_t, true>;
111 | multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
112 | chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);
113 | 
114 | AT_CUDA_CHECK(cudaGetLastError());
115 | 
116 | // Reset. The control flow possibilities here make my brain hurt.
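// (In plain terms: if the tensor just flushed has no chunks left, the next
// launch starts from empty metadata; otherwise its size/address entries are
// copied down to slot 0 so its remaining chunks go into the next launch.)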
117 | loc_block_info = 0;
118 | if (chunk == chunks_this_tensor - 1) {
119 | // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3
120 | // << std::endl;
121 | loc_tensor_info = 0;
122 | tl.start_tensor_this_launch = t + 1;
123 | } else {
124 | // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3
125 | // << std::endl;
126 | tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
127 | for (int d = 0; d < depth; d++)
128 | tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
129 | loc_tensor_info = 1;
130 | tl.start_tensor_this_launch = t;
131 | }
132 | }
133 | }
134 | }
135 | }
136 | 
-------------------------------------------------------------------------------- /fused_adan/multi_tensor_adan_kernel.cu: --------------------------------------------------------------------------------
1 | /* Copyright NVIDIA/apex
2 | Copyright AlexwellChen
3 | This kernel is adapted from NVIDIA/apex.
4 | */
5 | #include <ATen/ATen.h>
6 | #include <ATen/AccumulateType.h>
7 | #include <ATen/cuda/CUDAContext.h>
8 | #include <ATen/cuda/Exceptions.h>
9 | // Another possibility:
10 | // #include <torch/all.h>
11 | 
12 | #include <assert.h>
13 | 
14 | #include "include/type_shim.h" // Used for DISPATCH
15 | #include "include/multi_tensor_apply.cuh"
16 | #include "include/fused_adan_kernel.cuh"
17 | 
18 | #define BLOCK_SIZE 512
19 | #define ILP 4
20 | 
21 | using MATH_T = float;
22 | 
23 | template <typename T>
24 | struct AdanFunctor
25 | {
26 | __device__ __forceinline__ void operator()(
27 | int chunk_size,
28 | volatile int* noop_gmem,
29 | TensorListMetadata<6>& tl,
30 | const float beta1,
31 | const float beta2,
32 | const float beta3,
33 | const float bias_correction1,
34 | const float bias_correction2,
35 | const float bias_correction3_sqrt,
36 | const float lr,
37 | const float decay,
38 | const float epsilon,
39 | const bool no_prox,
40 | const float clip_global_grad_norm
41 | )
42 | {
43 | // I'd like this kernel to propagate infs/nans.
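// Each thread block handles one (tensor, chunk) pair; block_to_tensor and
// block_to_chunk were filled on the host in multi_tensor_apply, so the functor
// only needs to offset the six state pointers by chunk_idx * chunk_size.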
44 | // if(*noop_gmem == 1) 45 | // return; 46 | 47 | int tensor_loc = tl.block_to_tensor[blockIdx.x]; 48 | 49 | // potentially use to pass in list of scalar 50 | // int tensor_num = tl.start_tensor_this_launch + tensor_loc; 51 | 52 | int chunk_idx = tl.block_to_chunk[blockIdx.x]; 53 | int n = tl.sizes[tensor_loc]; 54 | 55 | T* p = (T*)tl.addresses[0][tensor_loc]; 56 | p += chunk_idx*chunk_size; 57 | 58 | T* g = (T*)tl.addresses[1][tensor_loc]; 59 | g += chunk_idx*chunk_size; 60 | 61 | T* exp_avg = (T*)tl.addresses[2][tensor_loc]; 62 | exp_avg += chunk_idx*chunk_size; 63 | 64 | T* exp_avg_sq = (T*)tl.addresses[3][tensor_loc]; 65 | exp_avg_sq += chunk_idx*chunk_size; 66 | 67 | T* exp_avg_diff = (T*)tl.addresses[4][tensor_loc]; 68 | exp_avg_diff += chunk_idx*chunk_size; 69 | 70 | T* neg_grad = (T*)tl.addresses[5][tensor_loc]; 71 | neg_grad += chunk_idx*chunk_size; 72 | 73 | n -= chunk_idx*chunk_size; 74 | 75 | for(int i_start = 0; 76 | i_start < n && i_start < chunk_size; 77 | i_start += blockDim.x*ILP) 78 | { 79 | MATH_T r_p[ILP]; 80 | MATH_T r_g[ILP]; 81 | MATH_T r_exp_avg[ILP]; 82 | MATH_T r_exp_avg_sq[ILP]; 83 | MATH_T r_exp_avg_diff[ILP]; 84 | MATH_T r_neg_grad_diff[ILP]; 85 | #pragma unroll 86 | for(int ii = 0; ii < ILP; ii++) 87 | { 88 | int i = i_start + threadIdx.x + ii*blockDim.x; 89 | if(i < n && i < chunk_size) 90 | { 91 | r_p[ii] = p[i]; 92 | r_g[ii] = g[i]; 93 | r_exp_avg[ii] = exp_avg[i]; 94 | r_exp_avg_sq[ii] = exp_avg_sq[i]; 95 | r_exp_avg_diff[ii] = exp_avg_diff[i]; 96 | r_neg_grad_diff[ii] = neg_grad[i]; 97 | } else { 98 | r_p[ii] = MATH_T(0); 99 | r_g[ii] = MATH_T(0); 100 | r_exp_avg[ii] = MATH_T(0); 101 | r_exp_avg_sq[ii] = MATH_T(0); 102 | r_exp_avg_diff[ii] = MATH_T(0); 103 | r_neg_grad_diff[ii] = MATH_T(0); 104 | } 105 | } 106 | #pragma unroll 107 | for(int ii = 0; ii < ILP; ii++) 108 | { 109 | r_g[ii] *= clip_global_grad_norm; //scaled_grad 110 | MATH_T update; 111 | r_neg_grad_diff[ii] = r_g[ii] + r_neg_grad_diff[ii]; 112 | update = r_g[ii] + beta2 * r_neg_grad_diff[ii]; // 1 MAC, reused twice 113 | 114 | r_exp_avg[ii] = beta1 * r_exp_avg[ii] + (1 - beta1) * r_g[ii]; 115 | r_exp_avg_diff[ii] = beta2 * r_exp_avg_diff[ii] + (1 - beta2) * r_neg_grad_diff[ii]; 116 | 117 | r_exp_avg_sq[ii] = beta3 * r_exp_avg_sq[ii] + (1 - beta3) * update * update; 118 | 119 | MATH_T denom; 120 | denom = sqrtf(r_exp_avg_sq[ii]) / bias_correction3_sqrt + epsilon; 121 | MATH_T step_size_diff = lr * beta2 / bias_correction2; 122 | MATH_T step_size = lr / bias_correction1; 123 | 124 | if(no_prox){ 125 | r_p[ii] = r_p[ii] * (1 - lr * decay); 126 | r_p[ii] = r_p[ii] - step_size * r_exp_avg[ii] / denom; 127 | r_p[ii] = r_p[ii] - step_size_diff * r_exp_avg_diff[ii] / denom; 128 | } else { 129 | r_p[ii] = r_p[ii] - step_size * r_exp_avg[ii] / denom; 130 | r_p[ii] = r_p[ii] - step_size_diff * r_exp_avg_diff[ii] / denom; 131 | r_p[ii] = r_p[ii] / (1 + lr * decay); 132 | } 133 | } 134 | #pragma unroll 135 | for(int ii = 0; ii < ILP; ii++) 136 | { 137 | int i = i_start + threadIdx.x + ii*blockDim.x; 138 | if(i < n && i < chunk_size) 139 | { 140 | g[i] = r_g[ii]; 141 | p[i] = r_p[ii]; 142 | exp_avg[i] = r_exp_avg[ii]; 143 | exp_avg_sq[i] = r_exp_avg_sq[ii]; 144 | exp_avg_diff[i] = r_exp_avg_diff[ii]; 145 | } 146 | } 147 | } 148 | } 149 | }; 150 | 151 | void multi_tensor_adan_cuda( 152 | int chunk_size, 153 | at::Tensor noop_flag, 154 | std::vector> tensor_lists, 155 | const float beta1, 156 | const float beta2, 157 | const float beta3, 158 | const float bias_correction1, 159 | const float 
bias_correction2,
160 | const float bias_correction3_sqrt,
161 | const float lr,
162 | const float decay,
163 | const float epsilon,
164 | const bool no_prox,
165 | const float clip_global_grad_norm)
166 | {
167 | using namespace at;
168 | TORCH_CHECK(!tensor_lists.empty(), "tensor list cannot be empty")
169 | if (tensor_lists[0].empty()) {
170 | return;
171 | }
172 | 
173 | // Assume single type across p,g,m1,m2 now
174 | DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(
175 | tensor_lists[0][0].scalar_type(), 0, "adan",
176 | multi_tensor_apply<6>(
177 | BLOCK_SIZE,
178 | chunk_size,
179 | noop_flag,
180 | tensor_lists,
181 | AdanFunctor<scalar_t_0>(),
182 | beta1,
183 | beta2,
184 | beta3,
185 | bias_correction1,
186 | bias_correction2,
187 | bias_correction3_sqrt,
188 | lr,
189 | decay,
190 | epsilon,
191 | no_prox,
192 | clip_global_grad_norm
193 | ); )
194 | 
195 | AT_CUDA_CHECK(cudaGetLastError());
196 | 
197 | }
198 | 
-------------------------------------------------------------------------------- /fused_adan/pybind_adan.cpp: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | 
3 | #include "include/fused_adan_kernel.cuh"
4 | 
5 | // x is torch::Tensor
6 | #define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
7 | #define CHECK_CONTIGUOUS(x) \
8 | AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
9 | #define CHECK_INPUT(x) \
10 | CHECK_CUDA(x); \
11 | CHECK_CONTIGUOUS(x)
12 | 
13 | // C++ interface
14 | 
15 | void adan_single_tensor(at::Tensor& p,
16 | at::Tensor& p_copy,
17 | at::Tensor& g,
18 | at::Tensor& exp_avg,
19 | at::Tensor& exp_avg_sq,
20 | at::Tensor& exp_avg_diff,
21 | at::Tensor& pre_g,
22 | float beta1, float beta2, float beta3,
23 | float bias_correction1, float bias_correction2, float bias_correction3_sqrt,
24 | float lr, float decay, float eps, bool no_prox, float grad_scale) {
25 | CHECK_INPUT(p);
26 | if (p_copy.numel() > 0) CHECK_INPUT(p_copy);
27 | CHECK_INPUT(exp_avg);
28 | CHECK_INPUT(exp_avg_sq);
29 | CHECK_INPUT(exp_avg_diff);
30 | CHECK_INPUT(g);
31 | CHECK_INPUT(pre_g);
32 | int64_t num_elem = p.numel();
33 | AT_ASSERTM(exp_avg.numel() == num_elem,
34 | "number of elements in exp_avg and p tensors should be equal");
35 | AT_ASSERTM(exp_avg_sq.numel() == num_elem,
36 | "number of elements in exp_avg_sq and p tensors should be equal");
37 | AT_ASSERTM(exp_avg_diff.numel() == num_elem,
38 | "number of elements in exp_avg_diff and p tensors should be equal");
39 | AT_ASSERTM(g.numel() == num_elem,
40 | "number of elements in g and p tensors should be equal");
41 | AT_ASSERTM(pre_g.numel() == num_elem,
42 | "number of elements in pre_g and p tensors should be equal");
43 | AT_ASSERTM(p_copy.numel() == num_elem || p_copy.numel() == 0,
44 | "number of elements in p_copy and p tensors should be equal, or "
45 | "p_copy should be empty");
46 | 
47 | fused_adan_cuda(p, p_copy, g,
48 | exp_avg, exp_avg_sq, exp_avg_diff,
49 | pre_g, beta1, beta2, beta3,
50 | bias_correction1, bias_correction2, bias_correction3_sqrt,
51 | lr, decay, eps, no_prox, grad_scale);
52 | }
53 | 
54 | void adan_multi_tensor(
55 | int chunk_size,
56 | at::Tensor noop_flag,
57 | std::vector<std::vector<at::Tensor>> tensor_lists,
58 | const float beta1,
59 | const float beta2,
60 | const float beta3,
61 | const float bias_correction1,
62 | const float bias_correction2,
63 | const float bias_correction3_sqrt,
64 | const float lr,
65 | const float decay,
66 | const float epsilon,
67 | const bool no_prox,
68 | const float clip_global_grad_norm){
69 | multi_tensor_adan_cuda(
70 | chunk_size,
71 | noop_flag,
72 | tensor_lists,
73 | beta1,
74 | beta2,
75 | beta3,
76 | bias_correction1,
77 | bias_correction2,
78 | bias_correction3_sqrt,
79 | lr,
80 | decay,
81 | epsilon,
82 | no_prox,
83 | clip_global_grad_norm
84 | );
85 | }
86 | 
87 | 
88 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
89 | m.def("adan_single_tensor", &adan_single_tensor, "Adan optimized CUDA single tensor implementation.");
90 | m.def("adan_multi_tensor", &adan_multi_tensor, "Adan optimized CUDA multi tensor implementation.");
91 | }
92 | 
-------------------------------------------------------------------------------- /gpt2/README.md: --------------------------------------------------------------------------------
1 | # Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
2 | 
3 | This experiment is based on a wrapped repo of [Megatron-LM](https://github.com/bigcode-project/Megatron-LM), provided by [BigCode](https://www.bigcode-project.org/). The task of this experiment is code generation.
4 | 
5 | ## Usage of Adan in Megatron-LM
6 | 
7 | ### Two steps to use Adan
8 | 
9 | **Step 1.** put `adan.py` in the path `Megatron-LM/megatron/optimizer/adan.py` and import it in the `Megatron-LM/megatron/optimizer/__init__.py`:
10 | 
11 | ```python
12 | from .adan import Adan
13 | # ... then add this branch to the existing optimizer selection:
14 | elif args.optimizer == 'adan':
15 | optimizer = Adan(param_groups, lr=args.lr, weight_decay=args.weight_decay,
16 | betas=(args.adan_beta1, args.adan_beta2, args.adan_beta3),
17 | eps=args.adan_eps)
18 | ```
19 | 
20 | **Step 2.** add the following parameters to the file `Megatron-LM/megatron/arguments.py`.
21 | 
22 | ```python
23 | # beta3 is used by Adan but not by Adam.
24 | group.add_argument('--adan-beta1', type=float, default=0.98,
25 | help='First coefficient for computing the running '
26 | 'average of the gradient')
27 | group.add_argument('--adan-beta2', type=float, default=0.92,
28 | help='Second coefficient for computing the running '
29 | 'average of the gradient difference')
30 | group.add_argument('--adan-beta3', type=float, default=0.99,
31 | help='Third coefficient for computing the running '
32 | 'average of the squared gradient estimate')
33 | group.add_argument('--adan-eps', type=float, default=1e-08,
34 | help='Term added to the denominator to improve '
35 | 'numerical stability')
36 | group.add_argument('--optimizer', type=str, default='adam',
37 | choices=['adam', 'sgd', 'adan'],
38 | ```
39 | 
40 | - `adan-beta1,2,3`: optimizer betas for Adan.
41 | 
42 | - `adan-eps`: stabilizing parameter.
43 | 
44 | - `optimizer`: choices of optimizers.
45 | 
46 | ## Data Preparation
47 | 
48 | **Step 1.** download the dataset used for pre-training.
The dataset is collected and released by the [BigCode](https://www.bigcode-project.org/) project:
49 | 
50 | ```bash
51 | python ./download_dataset.py
52 | ```
53 | 
54 | **Step 2.** binarize the downloaded dataset:
55 | 
56 | ```bash
57 | python tools/preprocess_data.py \
58 | --input stack_python.json \
59 | --output-prefix codegpt \
60 | --vocab checkpoints/gpt2-adan/tokenizer/vocab.json \
61 | --json-key content \
62 | --dataset-impl mmap \
63 | --workers 16 \
64 | --chunk-size 25 \
65 | --tokenizer-type GPT2BPETokenizer \
66 | --merge-file checkpoints/gpt2-adan/tokenizer/merges.txt \
67 | --append-eod
68 | ```
69 | 
70 | ## Pre-training
71 | 
72 | - #### Installation and Export
73 | 
74 | ```bash
75 | pip install wandb; \
76 | pip install regex; \
77 | pip install pybind11; \
78 | pip install nltk; \
79 | export MASTER_NODE=localhost; \
80 | export NUM_NODES=8; \
81 | export NODE_RANK=0; \
82 | export WANDB_API_KEY=$YOUR_API; \
83 | export WANDB_NAME=$PROJECT_NAME; \
84 | export WANDB_NOTES=$NOTES; \
85 | ```
86 | 
87 | - #### Training
88 | 
89 | `bash ./pretrain.sh`
90 | 
91 | ## Results and Logs on GPT2-345m
92 | 
93 | We provide the config and log for GPT2-345m pre-trained on the dataset from [BigCode](https://www.bigcode-project.org/) and evaluated on the [HumanEval](https://github.com/openai/human-eval) benchmark in a zero-shot setting. [HumanEval](https://github.com/openai/human-eval) is used to measure functional correctness for synthesizing programs from docstrings. It consists of 164 original programming problems, assessing language comprehension, algorithms, and simple mathematics, with some comparable to simple software interview questions. We set `temperature = 0.8` during evaluation.
94 | 
95 | | | Steps | pass@1 | pass@10 | pass@100 | Download |
96 | | ---------------- | :---: | :----: | :-----: | :------: | :------------------------------------------------------------------------: |
97 | | GPT2-345m (Adam) | 300k | 0.0840 | 0.209 | 0.360 | [log&config](https://github.com/sail-sg/Adan/files/10362486/gpt2-adam.log) |
98 | | GPT2-345m (Adan) | 150k | 0.0843 | 0.221 | 0.377 | [log&config](https://github.com/sail-sg/Adan/files/10362485/gpt2-adan.log) |
99 | 
-------------------------------------------------------------------------------- /gpt2/checkpoints/gpt2-adan/gpt_args: --------------------------------------------------------------------------------
1 | --num-layers 24
2 | --hidden-size 1024
3 | --num-attention-heads 16
4 | --seq-length 2048
5 | --max-position-embeddings 2048
6 | --micro-batch-size 10
7 | --global-batch-size 400
8 | --lr 0.0025
9 | --optimizer adan
10 | --train-iters 150000
11 | --lr-decay-iters 150000
12 | --lr-decay-style cosine
13 | --lr-warmup-iters 4000
14 | --weight-decay .05
15 | --adan-beta3 .95
16 | --fp16
17 | --log-interval 10
18 | --save-interval 5000
19 | --eval-interval 200
20 | --eval-iters 10
21 | --clip-grad 1.0
22 | 
-------------------------------------------------------------------------------- /gpt2/download_dataset.py: --------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | 
3 | dataset = load_dataset('bigcode/the-stack-dedup',
4 | use_auth_token=True,
5 | split='train',
6 | cache_dir='stack_dedup_python',
7 | data_dir='data/python')
8 | 
9 | dataset.to_json('stack_python.json', num_proc=16)
10 | 
-------------------------------------------------------------------------------- /gpt2/pretrain.sh:
--------------------------------------------------------------------------------
1 | set -u # stop on unset variables
2 | 
3 | GPUS_PER_NODE=8
4 | MASTER_ADDR=${MASTER_NODE}
5 | MASTER_PORT=6000
6 | NNODES=${NUM_NODES}
7 | # NODE_RANK=0 # set via the environment
8 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
9 | 
10 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
11 | 
12 | CHECKPOINT_NAME=gpt2-adan
13 | CHECKPOINT_PATH=checkpoints/$CHECKPOINT_NAME # Directory to store the checkpoints
14 | PREPROCESSED_DATA=preprocessed # Directory containing the preprocessed dataset. To preprocess a dataset, see https://github.com/bigcode-project/Megatron-LM#data-preprocessing
15 | VOCAB_FILE=${CHECKPOINT_PATH}/tokenizer/vocab.json
16 | MERGE_FILE=${CHECKPOINT_PATH}/tokenizer/merges.txt
17 | DATA_PATH=${PREPROCESSED_DATA}/codegpt_content_document
18 | 
19 | GPT_ARGS=$(cat ${CHECKPOINT_PATH}/gpt_args)
20 | 
21 | TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"
22 | 
23 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \
24 | pretrain_gpt.py \
25 | --tensor-model-parallel-size 1 \
26 | --pipeline-model-parallel-size 1 \
27 | --recompute-activations \
28 | $GPT_ARGS \
29 | --vocab-file $VOCAB_FILE \
30 | --merge-file $MERGE_FILE \
31 | --save $CHECKPOINT_PATH \
32 | --load $CHECKPOINT_PATH \
33 | --data-path $DATA_PATH \
34 | --wandb-entity-name xyxie \
35 | --wandb-project-name $WANDB_NAME \
36 | $TENSORBOARD_ARGS
37 | # To finetune from a pretrained model, append a trailing \ to the line above and uncomment:
38 | # --finetune \
39 | # --finetune-from /directory/containing/pretrained/model
40 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from setuptools import setup
4 | 
5 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
6 | from torch.cuda import is_available
7 | 
8 | build_cuda_ext = is_available() or os.getenv('FORCE_CUDA', '0') == '1'
9 | 
10 | cuda_extension = None
11 | if "--unfused" in sys.argv:
12 | print("Building unfused version of adan")
13 | sys.argv.remove("--unfused")
14 | elif build_cuda_ext:
15 | cuda_extension = CUDAExtension(
16 | 'fused_adan',
17 | sources=['fused_adan/pybind_adan.cpp', './fused_adan/fused_adan_kernel.cu', './fused_adan/multi_tensor_adan_kernel.cu']
18 | )
19 | 
20 | setup(
21 | name='adan',
22 | python_requires='>=3.8',
23 | version='0.0.2',
24 | install_requires=['torch'],
25 | py_modules=['adan'],
26 | description=(
27 | 'Adan: Adaptive Nesterov Momentum Algorithm for '
28 | 'Faster Optimizing Deep Models'
29 | ),
30 | author=(
31 | 'Xie, Xingyu and Zhou, Pan and Li, Huan and '
32 | 'Lin, Zhouchen and Yan, Shuicheng'
33 | ),
34 | ext_modules=[cuda_extension] if cuda_extension is not None else [],
35 | cmdclass={'build_ext': BuildExtension} if build_cuda_ext else {},
36 | )
--------------------------------------------------------------------------------