├── cl ├── modules │ ├── __init__.py │ └── criterions.py ├── utils │ ├── __init__.py │ └── metric.py ├── dataloaders │ ├── __init__.py │ ├── base.py │ ├── datasetGen.py │ └── wrapper.py ├── models │ ├── __init__.py │ ├── controller.py │ ├── senet.py │ ├── mlp.py │ ├── lenet.py │ ├── resnet.py │ ├── nsa.py │ └── nsm.py ├── agents │ ├── __init__.py │ ├── customization.py │ ├── exp_replay.py │ ├── default.py │ └── regularization.py ├── scripts │ └── split_MNIST_incremental_domain.sh ├── iBatchLearn.py └── outputs │ └── split_MNIST_incremental_domain │ └── Offline.log ├── README.md ├── LICENSE ├── requirements.txt └── .gitignore /cl/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cl/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cl/dataloaders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cl/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import mlp 2 | from . import lenet 3 | from . import resnet 4 | from . import senet -------------------------------------------------------------------------------- /cl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from . import default 2 | from . import regularization 3 | from . import customization 4 | from . 
class BCEauto(torch.nn.BCEWithLogitsLoss):
    """
    BCE with logits loss + automatically convert the target from class label to one-hot vector

    The integer label vector ``y`` is expanded into a one-hot matrix shaped
    like the logits ``x`` and then passed to ``BCEWithLogitsLoss.forward``.
    """

    def forward(self, x, y):
        """Return the BCE-with-logits loss between logits and class labels.

        :param x: 2D tensor of logits, one row per sample.
        :param y: tensor holding one class index per row of ``x``.
        :return: the loss produced by ``BCEWithLogitsLoss.forward``.
        """
        assert x.ndimension() == 2, 'Input size must be 2D'
        assert y.numel() == x.size(0), 'The size of input and target doesnt match. Number of input:' + str(x.size(0)) + ' Number of target:' + str(y.numel())
        # scatter_ only accepts an int64 index living on the same device as
        # the destination, so normalise the labels before using them.
        index = y.reshape(-1, 1).to(device=x.device, dtype=torch.long)
        # Build the target from scratch so it carries no autograd history
        # from the logits.
        y_onehot = torch.zeros_like(x).scatter_(1, index, 1)

        return super(BCEauto, self).forward(x, y_onehot)
Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k

    :param output: 2D tensor of scores, one row per sample.
    :param target: tensor with one class index per sample.
    :param topk: iterable of k values to evaluate.
    :return: a single percentage when one k is requested, otherwise a list
        of percentages in the order of ``topk``.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # The slice of the transposed comparison is not contiguous for
            # k > 1, so view(-1) raises; reshape copies when it has to.
            correct_k = correct[:k].reshape(-1).float().sum().item()
            res.append(correct_k * 100.0 / batch_size)

        return res[0] if len(res) == 1 else res


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the last value, running sum, sample count and average."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted by ``n`` samples."""
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard against a zero total weight (e.g. update(x, n=0) right after
        # reset) instead of raising ZeroDivisionError.
        self.avg = float(self.sum) / self.count if self.count else 0


class Timer(object):
    """Wall-clock stopwatch based on ``time.time()``."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last interval and restart from the current time."""
        self.interval = 0
        self.time = time.time()

    def value(self):
        """Return the seconds elapsed since the last reset/tic/toc."""
        return time.time() - self.time

    def tic(self):
        """Restart the reference time."""
        self.time = time.time()

    def toc(self):
        """Store and return the elapsed seconds, then restart the reference time."""
        self.interval = time.time() - self.time
        self.time = time.time()
        return self.interval
--model_name MLP400 --lr 0.01 --mode $1 | tee ${OUTDIR}/Adagrad_${MODE}.log 9 | #python -u iBatchLearn.py --gpuid $GPUID --repeat $REPEAT --optimizer Adam --force_out_dim 2 --first_split_size 2 --other_split_size 2 --schedule 4 --batch_size 128 --model_name MLP400 --agent_type customization --agent_name EWC_online_mnist --lr 0.001 --reg_coef 700 --mode $1 | tee ${OUTDIR}/EWC_online_${MODE}.log 10 | #python -u iBatchLearn.py --gpuid $GPUID --repeat $REPEAT --optimizer Adam --force_out_dim 2 --first_split_size 2 --other_split_size 2 --schedule 4 --batch_size 128 --model_name MLP400 --agent_type customization --agent_name EWC_mnist --lr 0.001 --reg_coef 100 --mode $1 | tee ${OUTDIR}/EWC_${MODE}.log 11 | python -u iBatchLearn.py --gpuid $GPUID --repeat $REPEAT --optimizer Adam --force_out_dim 2 --first_split_size 2 --other_split_size 2 --schedule 4 --batch_size 128 --model_name MLP400 --agent_type regularization --agent_name L2 --lr 0.001 --reg_coef 0.5 --mode $1 | tee ${OUTDIR}/L2_${MODE}.log -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
class FFWController(nn.Module):
    """Parameter-free pass-through controller.

    Exposes the same methods as LSTMController (create_new_state,
    reset_parameters, size, forward) but forward returns its input and the
    previous state untouched.
    """
    def __init__(self, num_inputs, num_outputs, num_layers):
        super(FFWController, self).__init__()

        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.num_layers = num_layers

    def create_new_state(self, batch_size, device=None):
        """Return a zero state of shape (batch_size, num_outputs).

        :param device: optional target device. When omitted the historical
            behaviour is kept: the state goes to CUDA whenever it is available.
        """
        h = torch.zeros(batch_size, self.num_outputs, device=device)
        # The module owns no parameters to infer a device from, so the
        # default still follows CUDA availability.
        if device is None and torch.cuda.is_available():
            h = h.cuda()
        return h

    def reset_parameters(self):
        """Nothing to initialise: the controller has no parameters."""
        pass

    def size(self):
        """Return (num_inputs, num_outputs)."""
        return self.num_inputs, self.num_outputs

    def forward(self, x, prev_state):
        """Return the input and the previous state unchanged."""
        return x, prev_state


class LSTMController(nn.Module):
    """An NTM controller based on LSTM."""
    def __init__(self, num_inputs, num_outputs, num_layers):
        super(LSTMController, self).__init__()

        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size=num_inputs,
                            hidden_size=num_outputs,
                            num_layers=num_layers)

        # The hidden state is a learned parameter.
        # It is created on the default device, exactly like the LSTM weights,
        # so the module is never split across devices; being registered
        # Parameters, both follow the module on .to()/.cuda().
        self.lstm_h_bias = Parameter(torch.randn(self.num_layers, 1, self.num_outputs) * 0.05)
        self.lstm_c_bias = Parameter(torch.randn(self.num_layers, 1, self.num_outputs) * 0.05)

        self.reset_parameters()

    def create_new_state(self, batch_size):
        """Return (h, c), the learned initial state repeated along the batch axis."""
        # Dimension: (num_layers * num_directions, batch, hidden_size)
        lstm_h = self.lstm_h_bias.clone().repeat(1, batch_size, 1)
        lstm_c = self.lstm_c_bias.clone().repeat(1, batch_size, 1)
        return lstm_h, lstm_c

    def reset_parameters(self):
        """Zero the 1-D LSTM parameters and draw the others uniformly."""
        stdev = 5 / (np.sqrt(self.num_inputs + self.num_outputs))
        for p in self.lstm.parameters():
            if p.dim() == 1:
                nn.init.constant_(p, 0)
            else:
                nn.init.uniform_(p, -stdev, stdev)

    def size(self):
        """Return (num_inputs, num_outputs)."""
        return self.num_inputs, self.num_outputs

    def forward(self, x, prev_state):
        """Run one LSTM step.

        :param x: input without a sequence axis; a leading axis of length 1
            is added before the LSTM call and removed from its output.
        :param prev_state: (h, c) tuple as returned by create_new_state.
        :return: (output, new_state)
        """
        x = x.unsqueeze(0)
        outp, state = self.lstm(x, prev_state)
        return outp.squeeze(0), state
def _cifar_datasets(dataset_cls, dataroot, train_aug, mean, std):
    """Build the (train, val) pair shared by the CIFAR10 and CIFAR100 loaders.

    :param dataset_cls: torchvision dataset class to instantiate.
    :param dataroot: root directory handed to the dataset class.
    :param train_aug: when true, the training split gets a random 32x32 crop
        (padding 4) and a random horizontal flip before normalisation.
    :param mean: per-channel mean passed to transforms.Normalize.
    :param std: per-channel std passed to transforms.Normalize.
    :return: (train_dataset, val_dataset), each wrapped in CacheClassLabel.
    """
    normalize = transforms.Normalize(mean=mean, std=std)

    val_transform = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])
    train_transform = val_transform
    if train_aug:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])

    train_dataset = dataset_cls(
        root=dataroot,
        train=True,
        download=True,
        transform=train_transform
    )
    train_dataset = CacheClassLabel(train_dataset)

    val_dataset = dataset_cls(
        root=dataroot,
        train=False,
        download=True,
        transform=val_transform
    )
    val_dataset = CacheClassLabel(val_dataset)

    return train_dataset, val_dataset


def CIFAR10(dataroot, train_aug=False):
    """Return the (train, val) CIFAR10 datasets wrapped in CacheClassLabel."""
    return _cifar_datasets(torchvision.datasets.CIFAR10, dataroot, train_aug,
                           mean=[0.491, 0.482, 0.447], std=[0.247, 0.243, 0.262])


def CIFAR100(dataroot, train_aug=False):
    """Return the (train, val) CIFAR100 datasets wrapped in CacheClassLabel."""
    return _cifar_datasets(torchvision.datasets.CIFAR100, dataroot, train_aug,
                           mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276])
class PreActBottleneck(nn.Module):
    '''Pre-activation version of the original Bottleneck module.

    ``droprate`` is accepted so the constructor can be called like the other
    block type, but this block does not use it.
    '''
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, droprate=None):
        super(PreActBottleneck, self).__init__()
        out_planes = self.expansion * planes
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)

        # A projection shortcut exists only when the residual branch changes
        # the spatial size or the channel count.
        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out = self.conv3(F.relu(self.bn3(out)))
        out += shortcut
        return out


class PreActResNet(nn.Module):
    """Four-stage pre-activation ResNet with global average pooling."""

    def __init__(self, block, num_blocks, num_classes=10, in_channels=3):
        super(PreActResNet, self).__init__()
        self.in_planes = 64
        last_planes = 512*block.expansion

        self.conv1 = conv3x3(in_channels, 64)
        self.stage1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.stage2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.stage3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.stage4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.bn_last = nn.BatchNorm2d(last_planes)
        self.last = nn.Linear(last_planes, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``."""
        layers = []
        for block_stride in [stride] + [1]*(num_blocks-1):
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def features(self, x):
        out = self.conv1(x)
        out = self.stage1(out)
        out = self.stage2(out)
        out = self.stage3(out)
        out = self.stage4(out)
        return out

    def logits(self, x):
        x = self.last(x)
        return x

    def forward(self, x):
        x = self.features(x)
        x = F.relu(self.bn_last(x))
        x = F.adaptive_avg_pool2d(x, 1)
        x = self.logits(x.view(x.size(0), -1))
        return x


class PreActResNet_cifar(nn.Module):
    """Three-stage pre-activation ResNet.

    :param block: block class; must expose ``expansion`` and accept
        (in_planes, planes, stride, droprate).
    :param num_blocks: number of blocks in each of the three stages.
    :param filters: ``planes`` value for each of the three stages.
    :param num_classes: size of the final linear layer.
    :param droprate: forwarded to every block.
    :param in_channels: channel count of the input images (default 3, the
        previously hard-coded value).
    """

    def __init__(self, block, num_blocks, filters, num_classes=10, droprate=0, in_channels=3):
        super(PreActResNet_cifar, self).__init__()
        self.in_planes = 16
        last_planes = filters[2]*block.expansion

        self.conv1 = conv3x3(in_channels, self.in_planes)
        self.stage1 = self._make_layer(block, filters[0], num_blocks[0], stride=1, droprate=droprate)
        self.stage2 = self._make_layer(block, filters[1], num_blocks[1], stride=2, droprate=droprate)
        self.stage3 = self._make_layer(block, filters[2], num_blocks[2], stride=2, droprate=droprate)
        self.bn_last = nn.BatchNorm2d(last_planes)
        self.last = nn.Linear(last_planes, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride, droprate):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``."""
        layers = []
        for block_stride in [stride] + [1]*(num_blocks-1):
            layers.append(block(self.in_planes, planes, block_stride, droprate))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def features(self, x):
        out = self.conv1(x)
        out = self.stage1(out)
        out = self.stage2(out)
        out = self.stage3(out)
        return out

    def logits(self, x):
        x = self.last(x)
        return x

    def forward(self, x):
        out = self.features(x)
        out = F.relu(self.bn_last(out))
        # Global average pooling. The previous fixed 8x8 window produced a
        # 1x1 map only for 8x8 feature maps; adaptive pooling gives the same
        # result there and also works for other input sizes.
        out = F.adaptive_avg_pool2d(out, 1)
        out = self.logits(out.view(out.size(0), -1))
        return out


# ResNet for Cifar10/100 or the dataset with image size 32x32

def ResNet20_cifar(out_dim=10):
    return PreActResNet_cifar(PreActBlock, [3, 3, 3], [16, 32, 64], num_classes=out_dim)


def ResNet56_cifar(out_dim=10):
    return PreActResNet_cifar(PreActBlock, [9, 9, 9], [16, 32, 64], num_classes=out_dim)


def ResNet110_cifar(out_dim=10):
    return PreActResNet_cifar(PreActBlock, [18, 18, 18], [16, 32, 64], num_classes=out_dim)


def ResNet29_cifar(out_dim=10):
    return PreActResNet_cifar(PreActBottleneck, [3, 3, 3], [16, 32, 64], num_classes=out_dim)


def ResNet164_cifar(out_dim=10):
    return PreActResNet_cifar(PreActBottleneck, [18, 18, 18], [16, 32, 64], num_classes=out_dim)


def WideResNet_28_2_cifar(out_dim=10):
    return PreActResNet_cifar(PreActBlock, [4, 4, 4], [32, 64, 128], num_classes=out_dim)


def WideResNet_28_2_drop_cifar(out_dim=10):
    return PreActResNet_cifar(PreActBlock, [4, 4, 4], [32, 64, 128], num_classes=out_dim, droprate=0.3)


def WideResNet_28_10_cifar(out_dim=10):
    return PreActResNet_cifar(PreActBlock, [4, 4, 4], [160, 320, 640], num_classes=out_dim)


# ResNet for general purpose. Ex:ImageNet

def ResNet10(out_dim=10):
    return PreActResNet(PreActBlock, [1, 1, 1, 1], num_classes=out_dim)


def ResNet18S(out_dim=10):
    return PreActResNet(PreActBlock, [2, 2, 2, 2], num_classes=out_dim, in_channels=1)


def ResNet18(out_dim=10):
    return PreActResNet(PreActBlock, [2, 2, 2, 2], num_classes=out_dim)


def ResNet34(out_dim=10):
    return PreActResNet(PreActBlock, [3, 4, 6, 3], num_classes=out_dim)


def ResNet50(out_dim=10):
    return PreActResNet(PreActBottleneck, [3, 4, 6, 3], num_classes=out_dim)


def ResNet101(out_dim=10):
    return PreActResNet(PreActBottleneck, [3, 4, 23, 3], num_classes=out_dim)


def ResNet152(out_dim=10):
    return PreActResNet(PreActBottleneck, [3, 8, 36, 3], num_classes=out_dim)
Storage 7 | 8 | 9 | class Memory(Storage): 10 | def reduce(self, m): 11 | self.storage = self.storage[:m] 12 | 13 | 14 | class Naive_Rehearsal(NormalNN): 15 | 16 | def __init__(self, agent_config): 17 | super(Naive_Rehearsal, self).__init__(agent_config) 18 | self.task_count = 0 19 | self.memory_size = 1000 20 | self.task_memory = {} 21 | 22 | def learn_batch(self, train_loader, val_loader=None): 23 | # 1.Combine training set 24 | dataset_list = [] 25 | for storage in self.task_memory.values(): 26 | dataset_list.append(storage) 27 | dataset_list *= max(len(train_loader.dataset)//self.memory_size,1) # Let old data: new data = 1:1 28 | dataset_list.append(train_loader.dataset) 29 | dataset = torch.utils.data.ConcatDataset(dataset_list) 30 | new_train_loader = torch.utils.data.DataLoader(dataset, 31 | batch_size=train_loader.batch_size, 32 | shuffle=True, 33 | num_workers=train_loader.num_workers) 34 | 35 | # 2.Update model as normal 36 | super(Naive_Rehearsal, self).learn_batch(new_train_loader, val_loader) 37 | 38 | # 3.Randomly decide the images to stay in the memory 39 | self.task_count += 1 40 | # (a) Decide the number of samples for being saved 41 | num_sample_per_task = self.memory_size // self.task_count 42 | num_sample_per_task = min(len(train_loader.dataset),num_sample_per_task) 43 | # (b) Reduce current exemplar set to reserve the space for the new dataset 44 | for storage in self.task_memory.values(): 45 | storage.reduce(num_sample_per_task) 46 | # (c) Randomly choose some samples from new task and save them to the memory 47 | self.task_memory[self.task_count] = Memory() # Initialize the memory slot 48 | randind = torch.randperm(len(train_loader.dataset))[:num_sample_per_task] # randomly sample some data 49 | for ind in randind: # save it to the memory 50 | self.task_memory[self.task_count].append(train_loader.dataset[ind]) 51 | 52 | 53 | class Naive_Rehearsal_SI(Naive_Rehearsal, SI): 54 | 55 | def __init__(self, agent_config): 56 | 
super(Naive_Rehearsal_SI, self).__init__(agent_config) 57 | 58 | 59 | class Naive_Rehearsal_L2(Naive_Rehearsal, L2): 60 | 61 | def __init__(self, agent_config): 62 | super(Naive_Rehearsal_L2, self).__init__(agent_config) 63 | 64 | 65 | class Naive_Rehearsal_EWC(Naive_Rehearsal, EWC): 66 | 67 | def __init__(self, agent_config): 68 | super(Naive_Rehearsal_EWC, self).__init__(agent_config) 69 | self.online_reg = True # Online EWC 70 | 71 | 72 | class Naive_Rehearsal_MAS(Naive_Rehearsal, MAS): 73 | 74 | def __init__(self, agent_config): 75 | super(Naive_Rehearsal_MAS, self).__init__(agent_config) 76 | 77 | 78 | class GEM(Naive_Rehearsal): 79 | """ 80 | @inproceedings{GradientEpisodicMemory, 81 | title={Gradient Episodic Memory for Continual Learning}, 82 | author={Lopez-Paz, David and Ranzato, Marc'Aurelio}, 83 | booktitle={NIPS}, 84 | year={2017}, 85 | url={https://arxiv.org/abs/1706.08840} 86 | } 87 | """ 88 | 89 | def __init__(self, agent_config): 90 | super(GEM, self).__init__(agent_config) 91 | self.params = {n: p for n, p in self.model.named_parameters() if p.requires_grad} # For convenience 92 | self.task_grads = {} 93 | self.quadprog = import_module('quadprog') 94 | self.task_mem_cache = {} 95 | 96 | def grad_to_vector(self): 97 | vec = [] 98 | for n,p in self.params.items(): 99 | if p.grad is not None: 100 | vec.append(p.grad.view(-1)) 101 | else: 102 | # Part of the network might has no grad, fill zero for those terms 103 | vec.append(p.data.clone().fill_(0).view(-1)) 104 | return torch.cat(vec) 105 | 106 | def vector_to_grad(self, vec): 107 | # Overwrite current param.grad by slicing the values in vec (flatten grad) 108 | pointer = 0 109 | for n, p in self.params.items(): 110 | # The length of the parameter 111 | num_param = p.numel() 112 | if p.grad is not None: 113 | # Slice the vector, reshape it, and replace the old data of the grad 114 | p.grad.copy_(vec[pointer:pointer + num_param].view_as(p)) 115 | # Part of the network might has no grad, ignore those 
terms 116 | # Increment the pointer 117 | pointer += num_param 118 | 119 | def project2cone2(self, gradient, memories): 120 | """ 121 | Solves the GEM dual QP described in the paper given a proposed 122 | gradient "gradient", and a memory of task gradients "memories". 123 | Overwrites "gradient" with the final projected update. 124 | 125 | input: gradient, p-vector 126 | input: memories, (t * p)-vector 127 | output: x, p-vector 128 | 129 | Modified from: https://github.com/facebookresearch/GradientEpisodicMemory/blob/master/model/gem.py#L70 130 | """ 131 | margin = self.config['reg_coef'] 132 | memories_np = memories.cpu().contiguous().double().numpy() 133 | gradient_np = gradient.cpu().contiguous().view(-1).double().numpy() 134 | t = memories_np.shape[0] 135 | #print(memories_np.shape, gradient_np.shape) 136 | P = np.dot(memories_np, memories_np.transpose()) 137 | P = 0.5 * (P + P.transpose()) 138 | q = np.dot(memories_np, gradient_np) * -1 139 | G = np.eye(t) 140 | P = P + G * 0.001 141 | h = np.zeros(t) + margin 142 | v = self.quadprog.solve_qp(P, q, G, h)[0] 143 | x = np.dot(v, memories_np) + gradient_np 144 | new_grad = torch.Tensor(x).view(-1) 145 | if self.gpu: 146 | new_grad = new_grad.cuda() 147 | return new_grad 148 | 149 | def learn_batch(self, train_loader, val_loader=None): 150 | 151 | # 1.Update model as normal 152 | super(GEM, self).learn_batch(train_loader, val_loader) 153 | 154 | # 2.Randomly decide the images to stay in the memory 155 | self.task_count += 1 156 | # (a) Decide the number of samples for being saved 157 | num_sample_per_task = self.memory_size // self.task_count 158 | num_sample_per_task = min(len(train_loader.dataset),num_sample_per_task) 159 | # (b) Reduce current exemplar set to reserve the space for the new dataset 160 | for storage in self.task_memory.values(): 161 | storage.reduce(num_sample_per_task) 162 | # (c) Randomly choose some samples from new task and save them to the memory 163 | self.task_memory[self.task_count] = 
def update_model(self, inputs, targets, tasks):
    """Take one optimizer step with the GEM gradient constraint.

    When at least one task has been learned, the gradient of every stored
    task memory is recomputed first. The mini-batch gradient is then
    compared with them and, if any dot product is negative, replaced by the
    projection returned by ``project2cone2`` before stepping.

    Returns:
        tuple: (detached loss of the mini-batch, raw model output).
    """
    has_memory = self.task_count > 0

    # Gradients on the memories of earlier tasks.
    if has_memory:
        for task_id in self.task_memory:
            self.zero_grad()
            cached = self.task_mem_cache[task_id]
            replay_out = self.forward(cached['data'])
            replay_loss = self.criterion(replay_out, cached['target'], cached['task'])
            replay_loss.backward()
            self.task_grads[task_id] = self.grad_to_vector()

    # Gradient on the current mini-batch.
    out = self.forward(inputs)
    loss = self.criterion(out, targets, tasks)
    self.optimizer.zero_grad()
    loss.backward()

    # Project only if the new gradient conflicts with a memory gradient.
    if has_memory:
        batch_grad = self.grad_to_vector()
        memory_grads = torch.stack(list(self.task_grads.values()))
        alignment = (batch_grad * memory_grads).sum(dim=1)
        if (alignment < 0).sum() != 0:
            projected = self.project2cone2(batch_grad, memory_grads)
            self.vector_to_grad(projected)

    self.optimizer.step()
    return loss.detach(), out
model: ',agent.count_parameter()) 41 | 42 | num_params = 0 43 | for p in agent.model.parameters(): 44 | if p.requires_grad: 45 | num_params += p.data.view(-1).size(0) 46 | 47 | print("no learnable params: ", num_params) 48 | 49 | # Decide split ordering 50 | task_names = sorted(list(task_output_space.keys()), key=int) 51 | print('Task order:',task_names) 52 | if args.rand_split_order: 53 | shuffle(task_names) 54 | print('Shuffled task order:', task_names) 55 | 56 | acc_table = OrderedDict() 57 | if args.offline_training: # Non-incremental learning / offline_training / measure the upper-bound performance 58 | task_names = ['All'] 59 | train_dataset_all = torch.utils.data.ConcatDataset(train_dataset_splits.values()) 60 | val_dataset_all = torch.utils.data.ConcatDataset(val_dataset_splits.values()) 61 | train_loader = torch.utils.data.DataLoader(train_dataset_all, 62 | batch_size=args.batch_size, shuffle=True, num_workers=args.workers) 63 | val_loader = torch.utils.data.DataLoader(val_dataset_all, 64 | batch_size=args.batch_size, shuffle=False, num_workers=args.workers) 65 | 66 | agent.learn_batch(train_loader, val_loader) 67 | 68 | acc_table['All'] = {} 69 | acc_table['All']['All'] = agent.validation(val_loader) 70 | 71 | else: # Incremental learning 72 | # Feed data to agent and evaluate agent's performance 73 | for i in range(len(task_names)): 74 | train_name = task_names[i] 75 | print('======================',train_name,'=======================') 76 | train_loader = torch.utils.data.DataLoader(train_dataset_splits[train_name], 77 | batch_size=args.batch_size, shuffle=True, num_workers=args.workers) 78 | val_loader = torch.utils.data.DataLoader(val_dataset_splits[train_name], 79 | batch_size=args.batch_size, shuffle=False, num_workers=args.workers) 80 | 81 | if args.incremental_class: 82 | agent.add_valid_output_dim(task_output_space[train_name]) 83 | 84 | # Learn 85 | agent.learn_batch(train_loader, val_loader) 86 | 87 | # Evaluate 88 | acc_table[train_name] = 
def get_args(argv):
    """Parse the command-line options of the experiment runner.

    Args:
        argv: list of argument strings (without the program name).

    Returns:
        argparse.Namespace with one attribute per option declared below.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    def flag(name, description):
        # Boolean switch: False unless the option is given on the command line.
        add('--' + name, dest=name, default=False, action='store_true', help=description)

    add('--gpuid', nargs="+", type=int, default=[0],
        help="The list of gpuid, ex:--gpuid 3 1. Negative value means cpu-only")
    add('--model_type', type=str, default='mlp',
        help="The type (mlp|lenet|vgg|resnet) of backbone network")
    add('--model_name', type=str, default='MLP',
        help="The name of actual model for the backbone")
    add('--force_out_dim', type=int, default=2,
        help="Set 0 to let the task decide the required output dimension")
    add('--agent_type', type=str, default='default', help="The type (filename) of agent")
    add('--agent_name', type=str, default='NormalNN', help="The class name of agent")
    add('--optimizer', type=str, default='SGD',
        help="SGD|Adam|RMSprop|amsgrad|Adadelta|Adagrad|Adamax ...")
    add('--dataroot', type=str, default='data',
        help="The root folder of dataset or downloaded data")
    add('--dataset', type=str, default='MNIST', help="MNIST(default)|CIFAR10|CIFAR100")
    add('--n_permutation', type=int, default=0, help="Enable permuted tests when >0")
    add('--first_split_size', type=int, default=2)
    add('--other_split_size', type=int, default=2)
    flag('no_class_remap',
         "Avoid the dataset with a subset of classes doing the remapping. Ex: [2,5,6 ...] -> [0,1,2 ...]")
    flag('train_aug', "Allow data augmentation during training")
    flag('rand_split', "Randomize the classes in splits")
    flag('rand_split_order', "Randomize the order of splits")
    add('--workers', type=int, default=3, help="#Thread for dataloader")
    add('--mode', type=str, default="mlp")
    add('--batch_size', type=int, default=100)
    add('--lr', type=float, default=0.01, help="Learning rate")
    add('--momentum', type=float, default=0)
    add('--weight_decay', type=float, default=0)
    add('--schedule', nargs="+", type=int, default=[2],
        help="The list of epoch numbers to reduce learning rate by factor of 0.1. Last number is the end epoch")
    add('--print_freq', type=float, default=100, help="Print the log at every x iteration")
    add('--model_weights', type=str, default=None,
        help="The path to the file for the model weights (*.pth).")
    add('--reg_coef', nargs="+", type=float, default=[0.],
        help="The coefficient for regularization. Larger means less plasilicity. Give a list for hyperparameter search.")
    flag('eval_on_train_set', "Force the evaluation on train set")
    flag('offline_training',
         "Non-incremental learning by make all data available in one batch. For measuring the upperbound performance.")
    add('--repeat', type=int, default=1, help="Repeat the experiment N times")
    flag('incremental_class',
         "The number of output node in the single-headed model increases along with new categories.")

    return parser.parse_args(argv)
args.reg_coef) 179 | print('The last avg acc of all repeats:', avg_final_acc[reg_coef]) 180 | print('mean:', avg_final_acc[reg_coef].mean(), 'std:', avg_final_acc[reg_coef].std()) 181 | for reg_coef,v in avg_final_acc.items(): 182 | print('reg_coef:', reg_coef,'mean:', avg_final_acc[reg_coef].mean(), 'std:', avg_final_acc[reg_coef].std()) 183 | -------------------------------------------------------------------------------- /cl/agents/default.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | import torch.nn as nn 4 | from types import MethodType 5 | import models 6 | from utils.metric import accuracy, AverageMeter, Timer 7 | 8 | class NormalNN(nn.Module): 9 | ''' 10 | Normal Neural Network with SGD for classification 11 | ''' 12 | def __init__(self, agent_config): 13 | ''' 14 | :param agent_config (dict): lr=float,momentum=float,weight_decay=float, 15 | schedule=[int], # The last number in the list is the end of epoch 16 | model_type=str,model_name=str,out_dim={task:dim},model_weights=str 17 | force_single_head=bool 18 | print_freq=int 19 | gpuid=[int] 20 | ''' 21 | super(NormalNN, self).__init__() 22 | self.log = print if agent_config['print_freq'] > 0 else lambda \ 23 | *args: None # Use a void function to replace the print 24 | self.config = agent_config 25 | # If out_dim is a dict, there is a list of tasks. The model will have a head for each task. 
def init_optimizer(self):
    """(Re)create ``self.optimizer`` and ``self.scheduler`` from ``self.config``.

    The optimizer class is looked up by name in ``torch.optim``:
      * 'SGD' and 'RMSprop' additionally receive ``momentum``;
      * 'Rprop' is built without ``weight_decay``;
      * 'amsgrad' is an alias for ``Adam`` with ``amsgrad=True``.

    The scheduler is a ``MultiStepLR`` with ``config['schedule']`` as
    milestones and a decay factor of 0.1.
    """
    optimizer_name = self.config['optimizer']
    optimizer_arg = {'params': self.model.parameters(),
                     'lr': self.config['lr'],
                     'weight_decay': self.config['weight_decay']}
    if optimizer_name in ['SGD', 'RMSprop']:
        optimizer_arg['momentum'] = self.config['momentum']
    elif optimizer_name in ['Rprop']:
        optimizer_arg.pop('weight_decay')
    elif optimizer_name == 'amsgrad':
        optimizer_arg['amsgrad'] = True
        # Resolve the alias locally only. Writing 'Adam' back into the config
        # would make the next call (e.g. when the optimizer is reset before a
        # new task) build plain Adam without the amsgrad flag.
        optimizer_name = 'Adam'

    self.optimizer = torch.optim.__dict__[optimizer_name](**optimizer_arg)
    self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer,
                                                          milestones=self.config['schedule'],
                                                          gamma=0.1)
def predict(self, inputs):
    """Run the model in eval mode and return detached outputs.

    Args:
        inputs: batch passed unchanged to ``self.forward``.

    Returns:
        dict: the mapping returned by ``self.forward`` with every value
        detached from the autograd graph.

    Note:
        ``self.model`` is switched to eval mode and left there; callers that
        need training mode afterwards must restore it themselves.
    """
    self.model.eval()
    # The outputs are detached below anyway, so there is no reason to record
    # an autograd graph during the forward pass.
    with torch.no_grad():
        out = self.forward(inputs)
    for t in out.keys():
        out[t] = out[t].detach()
    return out
def criterion(self, preds, targets, tasks, **kwargs):
    """Compute the classification loss for a (possibly mixed-task) batch.

    Args:
        preds: dict mapping head name to that head's logits for the batch.
        targets: class indices for the batch.
        tasks: per-sample task names; only used by multi-head models.

    Returns:
        Single-head: ``criterion_fn`` on the 'All' head, restricted to the
        first ``valid_out_dim`` columns when that attribute is an int.
        Multi-head: each head's loss on its own samples, weighted by the
        number of those samples and divided by the batch size.
    """
    if not self.multihead:
        logits = preds['All']
        # (Not 'ALL') Mask out unseen classes in the incremental class scenario.
        if isinstance(self.valid_out_dim, int):
            logits = preds['All'][:, :self.valid_out_dim]
        return self.criterion_fn(logits, targets)

    total = 0
    for head, head_logits in preds.items():
        # Positions of the samples that belong to this head's task.
        rows = [pos for pos in range(len(tasks)) if tasks[pos] == head]
        if len(rows) == 0:
            continue
        # criterion_fn averages, so scale back to a sum over the samples.
        total += self.criterion_fn(head_logits[rows], targets[rows]) * len(rows)
    total /= len(targets)  # average over the whole mini-batch
    return total
def learn_stream(self, data, label):
    """Placeholder for learning from a data stream; not implemented.

    Raises:
        AssertionError: always, with the message 'No implementation yet'.
    """
    # Raise explicitly instead of `assert False`: assert statements are
    # removed under `python -O`, which would turn this stub into a silent
    # no-op. The exception type and message are unchanged.
    raise AssertionError('No implementation yet')
δ = 1e-6


def θ(a, b, dimA=2, dimB=2, normBy=2):
    """Batchwise cosine-style similarity between the rows of two 3D tensors.

    Arguments:
        a {Tensor} -- A 3D Tensor (b * m * w)
        b {Tensor} -- A 3D Tensor (b * r * w)

    Keyword Arguments:
        dimA {number} -- dimension along which the norm of `a` is taken (default: {2})
        dimB {number} -- dimension along which the norm of `b` is taken (default: {2})
        normBy {number} -- order of the norm (default: {2})

    Returns:
        Tensor -- Batchwise similarity (b * r * m)

    Note:
        The norms are expanded to the full shape of `a` / `b` before being
        multiplied, so the denominator sums the norm product over the last
        dimension. δ is added to both norms and to the denominator to avoid
        a division by zero.
    """
    norms_a = torch.norm(a, normBy, dimA, keepdim=True).expand_as(a) + δ
    norms_b = torch.norm(b, normBy, dimB, keepdim=True).expand_as(b) + δ

    dot_products = torch.bmm(a, b.transpose(1, 2)).transpose(1, 2)
    norm_products = torch.bmm(norms_a, norms_b.transpose(1, 2)).transpose(1, 2) + δ
    return dot_products / norm_products
def initialize(self):
    """Randomly initialise the program memories and the key generators.

    The three program memories (and the residual weight when
    ``has_res_w == "y"``) get Xavier-uniform values; the bias gets small
    normal noise. With ``rnn_step == 0`` the four linear key/scale layers
    are initialised the same way (Xavier weight, normal bias); otherwise the
    recurrent controller resets its own parameters.
    """
    for memory in (self.PM_U, self.PM_V, self.PM_S):
        nn.init.xavier_uniform_(memory, gain=1)
    if self.has_res_w == "y":
        nn.init.xavier_uniform_(self.res_weight, gain=1)
    nn.init.normal_(self.bias, std=0.01)

    if self.rnn_step != 0:
        self.rnn_program_controller.reset_parameters()
        return

    # Same order as the attribute declarations: u, v, s, then the scale.
    for layer in (self.program_key_u, self.program_key_v,
                  self.program_key_s, self.res_s):
        nn.init.xavier_uniform_(layer.weight, gain=1)
        nn.init.normal_(layer.bias, std=0.01)
def get_reg_loss(self):
    """Return the orthogonality penalty on the U and V program memories.

    Computes ``||PM_U PM_U^T - I|| + ||PM_V PM_V^T - I||`` using the default
    ``torch.norm``, where ``I`` is the identity of size ``program_size``.

    Returns:
        torch.Tensor: scalar penalty on the device of the parameters.
    """
    def off_identity(memory):
        # Build the identity next to the parameter itself. Choosing the
        # device from torch.cuda.is_available() breaks with a device
        # mismatch when the model is kept on the CPU of a GPU machine.
        eye = torch.eye(self.program_size, device=memory.device, dtype=memory.dtype)
        return torch.norm(torch.matmul(memory, memory.t()) - eye)

    ploss1 = off_identity(self.PM_U)
    ploss2 = off_identity(self.PM_V)
    return ploss1 + ploss2
def read_component(self, MP, MK, k, luw=None, rm=0):
    """Read from one program memory by attention over its keys.

    Args:
        MP: program contents the attention weights are applied to.
        MK: program keys compared against ``k`` with ``θ``.
        k: query keys.
        luw: weights blended into the attention when ``self.top_lu > 0``.
        rm: blend factor between the attention (1 - rm) and ``luw`` (rm).

    Returns:
        tuple: (``matmul(attention, MP)``, attention weights).
    """
    attention = F.softmax(θ(MK, k) * 10, dim=-1)

    if self.top_lu > 0:
        # Rescale every row so that its largest weight becomes 1, then mix
        # with the weights supplied by the caller.
        row_peak, _ = torch.max(attention, dim=-1)
        attention = attention * 1 / row_peak.unsqueeze(2)
        attention = attention * (1 - rm) + luw * rm

    return torch.matmul(attention, MP), attention
interface[:,self.svd_num_features*self.pkey_dim*3:] 227 | 228 | if self.top_lu==0: 229 | Ut, dU = self.read_component(MPu, MKu, key_u.view(x.shape[0],self.svd_num_features,-1)) 230 | Vt, dV = self.read_component(MPv, MKv, key_v.view(x.shape[0],self.svd_num_features,-1)) 231 | St, dS = self.read_component(MPs, MKs, key_s.view(x.shape[0],self.svd_num_features,-1)) 232 | else: 233 | rm = F.sigmoid(self.read_mode(x)) 234 | rm_u = rm[:,:self.program_size].unsqueeze(1) 235 | rm_v = rm[:, self.program_size:self.program_size*2].unsqueeze(1) 236 | rm_s = rm[:, self.program_size*2:].unsqueeze(1) 237 | Ut, dU = self.read_component(MPu, MKu, key_u.view(x.shape[0], self.svd_num_features, -1), 238 | luw_u, rm_u) 239 | Vt, dV = self.read_component(MPv, MKv, key_v.view(x.shape[0], self.svd_num_features, -1), 240 | luw_v, rm_v) 241 | St, dS = self.read_component(MPs, MKs, key_s.view(x.shape[0], self.svd_num_features, -1), 242 | luw_s, rm_s) 243 | 244 | U.append(Ut) 245 | V.append(Vt) 246 | S.append(St) 247 | 248 | dUs.append(dU) 249 | dVs.append(dV) 250 | dSs.append(dS) 251 | 252 | dU = torch.cat(dUs, dim=1) 253 | dV = torch.cat(dVs, dim=1) 254 | dS = torch.cat(dSs, dim=1) 255 | 256 | if self.top_lu>0: 257 | max_useu, _ = torch.max(dU, dim=1) 258 | upperu, _ = torch.max(max_useu, dim=-1) 259 | luw_u = 1 -max_useu 260 | luw_u_sort, _ = luw_u.sort(dim=-1, descending=True) 261 | th = luw_u_sort[:,self.top_lu].unsqueeze(1) 262 | luw_u = (luw_u*(luw_u>th).float()).unsqueeze(1) 263 | max_usev, _ = torch.max(dV, dim=1) 264 | upperv, _ = torch.max(max_usev, dim=-1) 265 | luw_v = 1-max_usev 266 | luw_v_sort, _ = luw_v.sort(dim=-1, descending=True) 267 | th = luw_v_sort[:, self.top_lu].unsqueeze(1) 268 | luw_v = (luw_v*(luw_v > th).float()).unsqueeze(1) 269 | max_uses, _ = torch.max(dS, dim=1) 270 | uppers, _ = torch.max(max_uses, dim=-1) 271 | luw_s = 1 - max_uses 272 | luw_s_sort, _ = luw_s.sort(dim=-1, descending=True) 273 | th = luw_s_sort[:, self.top_lu].unsqueeze(1) 274 | luw_s = 
(luw_s*(luw_s > th).float()).unsqueeze(1) 275 | 276 | 277 | 278 | U = torch.cat(U, dim=1) 279 | V = torch.cat(V, dim=1) 280 | S = torch.cat(S, dim=1) 281 | 282 | 283 | 284 | self.record_Ua.append(dU) 285 | self.record_Va.append(dV) 286 | self.record_Sa.append(dS) 287 | 288 | 289 | S = F.softplus(S) 290 | S = torch.cumsum(S, dim=1).squeeze(-1) 291 | S = torch.flip(S, dims=[1]) 292 | S = torch.diag_embed(S, offset=0, dim1=-2, dim2=-1) 293 | W = self.composeSVD(U, V, S) 294 | rs = F.sigmoid(rs) 295 | return W, rs, S[:, -1, -1] 296 | 297 | 298 | def composeSVD(self, U, V, S): 299 | US = torch.matmul(U.permute(0, 2, 1), S) 300 | USV = torch.matmul(US, V) 301 | return USV 302 | 303 | def forward(self, x, res_weight=None): 304 | self.updateMK() 305 | self.init_seq() 306 | if self.has_res_w == "y": 307 | res_weight = self.res_weight 308 | 309 | if self.rnn_step == 0: 310 | U = self.read_Us(x) 311 | V = self.read_Vs(x) 312 | S = self.read_Ss(x) 313 | W = self.composeSVD(U, V, S) 314 | rs = F.sigmoid(self.res_s(x)) 315 | s = S[:, 0, 0] 316 | else: 317 | W, rs, s = self.readPM_recurrent(x) 318 | 319 | 320 | if self.has_res_w == "y" or res_weight is not None: 321 | a = s.unsqueeze(1).unsqueeze(2) * rs.unsqueeze(2) 322 | W = W + a*res_weight.repeat(x.shape[0], 1, 1) 323 | 324 | y = torch.matmul(x.unsqueeze(1), W).squeeze(1) 325 | if self.has_bias: 326 | y = y + self.bias 327 | return y -------------------------------------------------------------------------------- /cl/agents/regularization.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | from .default import NormalNN 4 | 5 | 6 | class L2(NormalNN): 7 | """ 8 | @article{kirkpatrick2017overcoming, 9 | title={Overcoming catastrophic forgetting in neural networks}, 10 | author={Kirkpatrick, James and Pascanu, Razvan and Rabinowitz, Neil and Veness, Joel and Desjardins, Guillaume and Rusu, Andrei A and Milan, Kieran and Quan, John and Ramalho, Tiago and 
# /cl/agents/regularization.py
# NOTE(review): reconstructed from a listing that had lost its indentation;
# block nesting was inferred from the data flow -- confirm against the
# repository copy before merging.
import random

import torch

from .default import NormalNN


class L2(NormalNN):
    """
    Quadratic-penalty agent: after each task the weights are stored and later
    updates are penalised for moving away from them (identity importance).

    @article{kirkpatrick2017overcoming,
        title={Overcoming catastrophic forgetting in neural networks},
        author={Kirkpatrick, James and Pascanu, Razvan and Rabinowitz, Neil and Veness, Joel and Desjardins, Guillaume and Rusu, Andrei A and Milan, Kieran and Quan, John and Ramalho, Tiago and Grabska-Barwinska, Agnieszka and others},
        journal={Proceedings of the national academy of sciences},
        year={2017},
        url={https://arxiv.org/abs/1612.00796}
    }
    """

    def __init__(self, agent_config):
        super(L2, self).__init__(agent_config)
        # Trainable parameters by name, for convenience.
        self.params = {n: p for n, p in self.model.named_parameters() if p.requires_grad}
        self.regularization_terms = {}
        self.task_count = 0
        # True: a single importance matrix / parameter snapshot is kept.
        # False: every task gets its own importance matrix and snapshot.
        self.online_reg = True

    def calculate_importance(self, dataloader):
        """Return an all-ones importance per parameter (plain L2 penalty)."""
        return {n: p.clone().detach().fill_(1) for n, p in self.params.items()}

    def learn_batch(self, train_loader, val_loader=None):
        """Train on one task, then store its weights and their importance."""
        self.log('#reg_term:', len(self.regularization_terms))

        # 1. Learn the parameters for the current task.
        super(L2, self).learn_batch(train_loader, val_loader)

        # 2. Back up the weights of the current task.
        task_param = {n: p.clone().detach() for n, p in self.params.items()}

        # 3. Calculate the importance of the weights for the current task.
        importance = self.calculate_importance(train_loader)

        # 4. Store both. Online mode keeps reusing slot 1 once a term exists;
        #    otherwise every task gets a new slot.
        self.task_count += 1
        if self.online_reg and len(self.regularization_terms) > 0:
            slot = 1
        else:
            slot = self.task_count
        self.regularization_terms[slot] = {'importance': importance, 'task_param': task_param}

    def criterion(self, inputs, targets, tasks, regularization=True, **kwargs):
        """Return the task loss, plus the importance-weighted penalty if enabled."""
        loss = super(L2, self).criterion(inputs, targets, tasks, **kwargs)

        if regularization and len(self.regularization_terms) > 0:
            # The penalty only exists once at least one task has been stored.
            reg_loss = 0
            for reg_term in self.regularization_terms.values():
                importance = reg_term['importance']
                task_param = reg_term['task_param']
                task_reg_loss = 0
                for n, p in self.params.items():
                    task_reg_loss += (importance[n] * (p - task_param[n]) ** 2).sum()
                reg_loss += task_reg_loss
            loss += self.config['reg_coef'] * reg_loss
        # NOTE(review): reconstructed as unconditional (outside the ``if``).
        # Callers below that pass regularization=False add get_ploss() again,
        # which would then count it twice -- confirm which is intended.
        loss = loss + self.model.get_ploss()
        return loss


class EWC(L2):
    """
    Elastic Weight Consolidation: importance is a diagonal Fisher estimate.

    @article{kirkpatrick2017overcoming,
        title={Overcoming catastrophic forgetting in neural networks},
        author={Kirkpatrick, James and Pascanu, Razvan and Rabinowitz, Neil and Veness, Joel and Desjardins, Guillaume and Rusu, Andrei A and Milan, Kieran and Quan, John and Ramalho, Tiago and Grabska-Barwinska, Agnieszka and others},
        journal={Proceedings of the national academy of sciences},
        year={2017},
        url={https://arxiv.org/abs/1612.00796}
    }
    """

    def __init__(self, agent_config):
        super(EWC, self).__init__(agent_config)
        self.online_reg = False
        self.n_fisher_sample = None
        self.empFI = False

    def calculate_importance(self, dataloader):
        """Estimate the diagonal Fisher information from squared gradients.

        There are several ways to estimate the F matrix; this keeps the
        implementation simple while performing similarly to the literature.
        """
        self.log('Computing EWC')

        # Initialize the importance matrix (online mode keeps accumulating).
        if self.online_reg and len(self.regularization_terms) > 0:
            importance = self.regularization_terms[1]['importance']
        else:
            importance = {n: p.clone().detach().fill_(0) for n, p in self.params.items()}

        # Optionally estimate on a random subset with batch_size=1; otherwise
        # mini-batches are used, which is much faster with similar results.
        if self.n_fisher_sample is not None:
            n_sample = min(self.n_fisher_sample, len(dataloader.dataset))
            # Report the number actually drawn, which can be smaller than the
            # requested n_fisher_sample on a small dataset.
            self.log('Sample', n_sample, 'for estimating the F matrix.')
            rand_ind = random.sample(range(len(dataloader.dataset)), n_sample)
            subdata = torch.utils.data.Subset(dataloader.dataset, rand_ind)
            dataloader = torch.utils.data.DataLoader(subdata, shuffle=True, num_workers=2, batch_size=1)

        mode = self.training
        self.eval()

        # Accumulate the square of the gradients.
        for inputs, target, task in dataloader:
            if self.gpu:
                inputs = inputs.cuda()
                target = target.cuda()

            preds = self.forward(inputs)

            # Multi-headed model: a batch comes from a single task, so task[0]
            # selects the head. Single-headed model: use preds['All'].
            task_name = task[0] if self.multihead else 'All'

            # An integer valid_out_dim (incremental class learning) restricts
            # the loss to the first valid_out_dim outputs.
            if isinstance(self.valid_out_dim, int):
                pred = preds[task_name][:, :self.valid_out_dim]
            else:
                pred = preds[task_name]
            ind = pred.max(1)[1].flatten()  # label = the most likely class

            # - Alternative ind by multinomial sampling, similar performance -
            # prob = torch.nn.functional.softmax(preds['All'],dim=1)
            # ind = torch.multinomial(prob,1).flatten()

            if self.empFI:  # use the ground-truth label (off by default)
                ind = target

            loss = self.criterion(preds, ind, task, regularization=False)
            loss = loss + self.model.get_ploss()

            self.model.zero_grad()
            loss.backward()
            for n, p in importance.items():
                grad = self.params[n].grad
                if grad is not None:  # heads without loss have no gradient
                    p += (grad ** 2) * len(inputs) / len(dataloader)

        self.train(mode=mode)

        return importance


def EWC_online(agent_config):
    """Build an EWC agent that keeps one running importance matrix."""
    agent = EWC(agent_config)
    agent.online_reg = True
    return agent


class SI(L2):
    """
    Synaptic Intelligence: importance is accumulated along the training path.

    @inproceedings{zenke2017continual,
        title={Continual Learning Through Synaptic Intelligence},
        author={Zenke, Friedemann and Poole, Ben and Ganguli, Surya},
        booktitle={International Conference on Machine Learning},
        year={2017},
        url={https://arxiv.org/abs/1703.04200}
    }
    """

    def __init__(self, agent_config):
        super(SI, self).__init__(agent_config)
        self.online_reg = True  # original SI works in an online updating fashion
        self.damping_factor = 0.1
        # Running per-parameter contribution, reset after every task.
        self.w = {n: p.clone().detach().zero_() for n, p in self.params.items()}

        # Only used for the first task (while regularization_terms is empty).
        self.initial_params = {n: p.clone().detach() for n, p in self.params.items()}

    def update_model(self, inputs, targets, tasks):
        """Run one optimisation step and accumulate the path integral ``w``.

        Returns:
            Tuple of the detached regularised loss and the model output.
        """
        # 1. Save the current parameters.
        old_params = {n: p.clone().detach() for n, p in self.params.items()}

        # 2. Collect the gradients without the regularization term.
        out = self.forward(inputs)
        loss = self.criterion(out, targets, tasks, regularization=False)
        loss = loss + self.model.get_ploss()

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)  # the graph is reused in step 3
        unreg_gradients = {
            n: p.grad.clone().detach()
            for n, p in self.params.items()
            if p.grad is not None
        }

        # 3. Normal update with regularization.
        loss = self.criterion(out, targets, tasks, regularization=True)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # 4. Accumulate w. In a multi-head network some heads have no
        #    gradient because no loss went through them.
        for n, p in self.params.items():
            if n in unreg_gradients:
                delta = p.detach() - old_params[n]
                self.w[n] -= unreg_gradients[n] * delta  # w[n] is >=0

        return loss.detach(), out

    # - Alternative simplified implementation with similar performance -
    # def update_model(self, inputs, targets, tasks):
    #     # A wrapper of original update step to include the estimation of w
    #
    #     # Backup prev param if not done yet
    #     # The backup only happened at the beginning of a new task
    #     if len(self.prev_params) == 0:
    #         for n, p in self.params.items():
    #             self.prev_params[n] = p.clone().detach()
    #
    #     # 1.Save current parameters
    #     old_params = {}
    #     for n, p in self.params.items():
    #         old_params[n] = p.clone().detach()
    #
    #     # 2.Calculate the loss as usual
    #     loss, out = super(SI, self).update_model(inputs, targets, tasks)
    #
    #     # 3.Accumulate the w
    #     for n, p in self.params.items():
    #         delta = p.detach() - old_params[n]
    #         if p.grad is not None:
    #             self.w[n] -= p.grad * delta  # w[n] is >=0
    #
    #     return loss.detach(), out

    def calculate_importance(self, dataloader):
        """Fold the accumulated ``w`` into the importance and reset it."""
        self.log('Computing SI')
        assert self.online_reg, 'SI needs online_reg=True'

        # Initialize the importance matrix.
        if len(self.regularization_terms) > 0:  # after the first task
            importance = self.regularization_terms[1]['importance']
            prev_params = self.regularization_terms[1]['task_param']
        else:  # in the first task
            importance = {n: p.clone().detach().fill_(0) for n, p in self.params.items()}
            prev_params = self.initial_params

        # Calculate or accumulate the Omega (the importance matrix).
        for n, p in importance.items():
            delta_theta = self.params[n].detach() - prev_params[n]
            p += self.w[n] / (delta_theta ** 2 + self.damping_factor)
            self.w[n].zero_()

        return importance


class MAS(L2):
    """
    Memory Aware Synapses: importance is the gradient magnitude of the
    squared output.

    @article{aljundi2017memory,
        title={Memory Aware Synapses: Learning what (not) to forget},
        author={Aljundi, Rahaf and Babiloni, Francesca and Elhoseiny, Mohamed and Rohrbach, Marcus and Tuytelaars, Tinne},
        booktitle={ECCV},
        year={2018},
        url={https://eccv2018.org/openaccess/content_ECCV_2018/papers/Rahaf_Aljundi_Memory_Aware_Synapses_ECCV_2018_paper.pdf}
    }
    """

    def __init__(self, agent_config):
        super(MAS, self).__init__(agent_config)
        self.online_reg = True

    def calculate_importance(self, dataloader):
        """Accumulate the absolute gradients of the mean squared output."""
        self.log('Computing MAS')

        # Initialize the importance matrix (online mode keeps accumulating).
        if self.online_reg and len(self.regularization_terms) > 0:
            importance = self.regularization_terms[1]['importance']
        else:
            importance = {n: p.clone().detach().fill_(0) for n, p in self.params.items()}

        mode = self.training
        self.eval()

        # Accumulate the gradients of the L2 loss on the outputs.
        for inputs, target, task in dataloader:
            if self.gpu:
                inputs = inputs.cuda()
                target = target.cuda()

            preds = self.forward(inputs)

            # Multi-headed model: a batch comes from a single task, so task[0]
            # selects the head. Single-headed model: use preds['All'].
            task_name = task[0] if self.multihead else 'All'

            # An integer valid_out_dim (incremental class learning) restricts
            # the loss to the first valid_out_dim outputs.
            if isinstance(self.valid_out_dim, int):
                pred = preds[task_name][:, :self.valid_out_dim]
            else:
                pred = preds[task_name]

            # Out-of-place square: the original ``pow_`` overwrote the model
            # output (or a view of it) that autograd may still need.
            loss = pred.pow(2).mean()
            loss = loss + self.model.get_ploss()

            self.model.zero_grad()
            loss.backward()
            for n, p in importance.items():
                grad = self.params[n].grad
                if grad is not None:  # heads without loss have no gradient
                    p += grad.abs() / len(dataloader)

        self.train(mode=mode)

        return importance
# /cl/models/nsm.py
# NOTE(review): reconstructed from a listing that had lost its indentation;
# block nesting was inferred from the data flow -- confirm against the
# repository copy before merging.
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

δ = 1e-6


def θ(a, b, dimA=2, dimB=2, normBy=2):
    """Batchwise cosine-style similarity.

    NOTE: the denominator is a ``bmm`` of the *expanded* norms, so the result
    is the cosine similarity divided by the feature width ``w``. This is
    kept as in the original code.

    Arguments:
        a {Tensor} -- A 3D Tensor (b * m * w)
        b {Tensor} -- A 3D Tensor (b * r * w)

    Keyword Arguments:
        dimA {number} -- dimension along which `a` is normed (default: {2})
        dimB {number} -- dimension along which `b` is normed (default: {2})
        normBy {number} -- exponent of the norm (default: {2})

    Returns:
        Tensor -- Batchwise similarity (b * r * m)
    """
    a_norm = torch.norm(a, normBy, dimA, keepdim=True).expand_as(a) + δ
    b_norm = torch.norm(b, normBy, dimB, keepdim=True).expand_as(b) + δ

    dots = torch.bmm(a, b.transpose(1, 2)).transpose(1, 2)
    scale = torch.bmm(a_norm, b_norm.transpose(1, 2)).transpose(1, 2) + δ
    return dots / scale


class ProgammedController(nn.Module):
    """Linear layer whose weight is read from a bank of stored programs.

    Every row of ``mprogram_weights`` holds one program: ``pkey_dim`` key
    entries followed by the flattened weight matrix (one extra input row
    holds the bias when ``bias=True``). The input addresses the bank
    (``attend_program``) and the blended program is applied to it
    (``read_program``).
    """

    def __init__(self, program_shape, program_interface_size, pkey_dim=10, num_program=2,
                 bias=False, svd_num_features=8, program_read_mode="linear", att_mode="kv", kc_mode="lk"):
        """
        Args:
            program_shape: ``(in_features, out_features)`` of a program.
            program_interface_size: feature size of the addressing input.
            pkey_dim: length of a program key.
            num_program: number of stored programs.
            bias: store a bias row with every program.
            svd_num_features: number of individually addressed components in
                the ``*svd`` attention modes.
            program_read_mode: "linear", "svd", "svd1", "svd2", "svd3" or "svdda".
            att_mode: "kv", "da", "dasvd" or "kvsvd".
            kc_mode: stored only; ``forward`` does not pass it to ``updateMK``.
        """
        super(ProgammedController, self).__init__()
        self.pkey_dim = pkey_dim
        self.program_size = num_program
        self.program_shape = program_shape
        self.has_bias = bias
        self.svd_num_features = svd_num_features
        self.program_read_mode = program_read_mode
        self.att_mode = att_mode
        self.kc_mode = kc_mode

        a = 1 if bias else 0  # extra input row that stores the bias

        self.mprogram_weights = nn.Parameter(torch.zeros(self.program_size,
                                                         self.pkey_dim +
                                                         (program_shape[0]+a) * program_shape[1],
                                                         requires_grad=True))

        # Layers for every attention mode are created regardless of att_mode.
        # "kv"
        self.program_key = nn.Linear(program_interface_size, self.pkey_dim)
        self.program_strength = nn.Linear(program_interface_size, 1)
        # "da"
        self.address_net = nn.Linear(program_interface_size, self.program_size)
        # "dasvd"
        self.address_svd_net = nn.Linear(program_interface_size, self.program_size*svd_num_features)
        self.address_svd_net2 = nn.Linear(program_interface_size, self.program_size)

        self.program_descriptor = nn.Linear(min(program_shape[0]+a, program_shape[1])*self.svd_num_features//2*3, self.pkey_dim)
        self.program_descriptor2 = nn.Linear(program_shape[1]*2+1,
                                             self.pkey_dim)

        self.program_key2 = nn.Linear(program_interface_size, self.pkey_dim*(self.svd_num_features+1))
        self.code_len_net = nn.Linear(program_interface_size, program_shape[1])

        # Rows not addressed individually in the "*svd" attention modes.
        self.pad_size = program_shape[0]+a-self.svd_num_features
        self.relu = nn.ReLU()
        self.MK = None
        self.Us = None
        self.Ss = None
        self.Vs = None
        for name, param in self.named_parameters():
            if "mprogram_weights" not in name:
                param.requires_grad = True

    def initialize(self):
        """Xavier-initialise the program bank and the addressing layers.

        ``program_descriptor2`` keeps its default initialisation, as in the
        original code. The order of the random draws is unchanged.
        """
        nn.init.xavier_uniform_(self.mprogram_weights, gain=1.4)
        layers = (
            self.program_key,
            self.program_strength,
            self.address_net,
            self.address_svd_net,
            self.address_svd_net2,
            self.program_descriptor,
            self.program_key2,
            self.code_len_net,
        )
        for layer in layers:
            nn.init.xavier_uniform_(layer.weight, gain=1.4)
            nn.init.normal_(layer.bias, std=0.01)

    def get_mprogram_weight(self, p):
        """Return the flattened weight matrix (no key, no bias) of program ``p``."""
        return self.mprogram_weights[p, self.pkey_dim:self.pkey_dim+self.program_shape[0]*self.program_shape[1]]

    def attend_program(self, input):
        """Compute attention weights over the stored programs.

        Returns:
            Tuple ``(weights, keys, strength)``; ``keys`` and ``strength``
            are ``None`` for the modes that do not produce them.

        Raises:
            ValueError: if ``self.att_mode`` is not a known mode (the
                original silently returned ``None``).
        """
        batch = input.shape[0]

        if self.att_mode == "kv":
            keys = torch.tanh(self.program_key(input))
            strength = F.softplus(self.program_strength(input))
            K = keys.unsqueeze(1)[:, :, :self.pkey_dim]

            d = θ(self.MK.repeat(batch, 1, 1), K)
            content_weights = F.softmax(d * strength.unsqueeze(2), dim=-1)
            return content_weights, keys, strength

        if self.att_mode == "da":
            a = F.softmax(self.address_net(input), dim=-1)
            return a, None, None

        if self.att_mode == "dasvd":
            a = self.address_svd_net(input).view(batch, self.svd_num_features, self.program_size)
            a2 = self.address_svd_net2(input).view(batch, 1, self.program_size)

            a = F.softmax(a, dim=-1)
            a2 = F.softmax(a2, dim=-1)

            # Created on the input's device instead of "CUDA if available",
            # which broke CPU models on machines with a GPU.
            pad = torch.ones(batch, self.pad_size, self.program_size, device=input.device)
            a = torch.cat([a, pad*a2], dim=1)
            return a, None, None

        if self.att_mode == "kvsvd":
            K = torch.tanh(self.program_key2(input)).view(batch, self.svd_num_features+1, self.pkey_dim)

            MK = self.MK.permute(1, 0, 2)
            MK = MK.repeat(batch, 1, 1, 1)
            d = θ(MK.view(-1, self.program_size, self.pkey_dim), K.view(-1, 1, self.pkey_dim))
            content_weights = F.softmax(d, dim=-1).view(batch, self.svd_num_features+1, self.program_size)
            pad = torch.ones(batch, self.pad_size, self.program_size, device=input.device)
            # The last key addresses all the padded rows at once.
            pad = pad*content_weights[:, -1, :].unsqueeze(1)
            a = torch.cat([content_weights[:, :self.svd_num_features, :], pad], dim=1)
            return a, K[:, 0, :], None

        raise ValueError(f"unknown att_mode: {self.att_mode!r}")

    def _store_svd_factors(self, Us, Ss, Vs):
        """Stack the per-program SVD factors into ``self.Us/Ss/Vs``."""
        self.Us = torch.stack(Us, dim=0)
        self.Ss = torch.stack(Ss, dim=0)
        self.Vs = torch.stack(Vs, dim=0)

    def updateMK(self, kc_mode="lk"):
        """Refresh the program keys ``self.MK`` (and SVD factors if needed).

        Args:
            kc_mode: "lk" takes the keys from the first ``pkey_dim`` columns
                of the bank; modes containing "svd" derive them from an SVD
                of every program ("svds", "svda", or the default descriptor).

        NOTE(review): ``forward`` calls this without arguments, so
        ``self.kc_mode`` is never used -- confirm whether that is intended.
        """
        keep_factors = self.program_read_mode != "linear"

        if "svd" in kc_mode:
            # Best effort, as in the original: an SVD failure is reported and
            # the previous keys stay in place.
            try:
                MP = self.mprogram_weights[:, self.pkey_dim:]
                A = MP.view(MP.shape[0], -1, self.program_shape[1])
                MK, Us, Ss, Vs = [], [], [], []
                top = self.svd_num_features+1
                half = self.svd_num_features//2
                for i in range(self.program_size):
                    U, S, V = torch.svd(A[i])
                    if kc_mode == "svds":
                        MK.append(S[:self.pkey_dim])
                    elif kc_mode == "svda":
                        pfeature = torch.cat([U[:top, :],
                                              V[:top, :],
                                              S[:top].unsqueeze(1)], dim=1)
                        MK.append(self.program_descriptor2(pfeature))
                    else:
                        pfeature = torch.cat([U[:half, :].contiguous().view(-1),
                                              V[:half, :].contiguous().view(-1),
                                              S[:half]])
                        MK.append(self.program_descriptor(pfeature))
                    if keep_factors:
                        Us.append(U.contiguous().view(-1))
                        Ss.append(S)
                        Vs.append(V.contiguous().view(-1))
                self.MK = torch.tanh(torch.stack(MK, dim=0))
                if keep_factors:
                    self._store_svd_factors(Us, Ss, Vs)
            except Exception as e:
                print(f"svd err {e}")
        elif kc_mode == "lk":
            self.MK = torch.tanh(self.mprogram_weights[:, :self.pkey_dim])
            if keep_factors:
                MP = self.mprogram_weights[:, self.pkey_dim:]
                A = MP.view(MP.shape[0], -1, self.program_shape[1])
                Us, Ss, Vs = [], [], []
                for i in range(self.program_size):
                    U, S, V = torch.svd(A[i])
                    Us.append(U.contiguous().view(-1))
                    Ss.append(S)
                    Vs.append(V.contiguous().view(-1))
                self._store_svd_factors(Us, Ss, Vs)

    def _mixed_singular_values(self, weights):
        """Return the attention-weighted singular values as diagonal matrices."""
        S = torch.matmul(weights, self.Ss.repeat(weights.shape[0], 1, 1))
        return torch.diag_embed(S, offset=0, dim1=-2, dim2=-1).squeeze(1)

    def _tile_factor(self, flat, batch):
        """Repeat a flattened factor per sample and restore its matrix shape."""
        return flat.repeat(batch, 1, 1).view(batch, -1, self.program_shape[1])

    def linear_read(self, MP, weights):
        """Blend the flattened programs ``MP`` with the attention ``weights``."""
        return torch.matmul(weights, MP)

    def linear_svd_read(self, weights):
        """Blend U, S and V separately, then recompose ``U S V^T``."""
        batch = weights.shape[0]
        U = torch.matmul(weights, self.Us.repeat(batch, 1, 1)).view(batch, -1, self.program_shape[1])
        V = torch.matmul(weights, self.Vs.repeat(batch, 1, 1)).view(batch, -1, self.program_shape[1])
        S = self._mixed_singular_values(weights)

        US = torch.matmul(U, S)
        return torch.matmul(US, V.permute(0, 2, 1))

    def linear_svd_read1(self, weights):
        """Sum U and V over all programs; blend only the singular values."""
        batch = weights.shape[0]
        S = self._mixed_singular_values(weights)

        U = self._tile_factor(torch.sum(self.Us, dim=0), batch)
        V = self._tile_factor(torch.sum(self.Vs, dim=0), batch)
        US = torch.matmul(U, S)
        return torch.matmul(US, V.permute(0, 2, 1))

    def linear_svd_read2(self, weights):
        """Sum every program's ``U_i S V_i^T`` scaled by its attention weight."""
        batch = weights.shape[0]
        S = self._mixed_singular_values(weights)

        USV = None
        for i in range(self.program_size):
            U = self._tile_factor(self.Us[i], batch)
            V = self._tile_factor(self.Vs[i], batch)
            US = torch.matmul(U, S)
            term = torch.matmul(US, V.permute(0, 2, 1))*weights[:, :, i].unsqueeze(2)
            USV = term if USV is None else USV + term
        return USV

    def linear_svd_read3(self, weights):
        """Variant: program 0 uses the blended V, later programs use their own V."""
        batch = weights.shape[0]
        S = self._mixed_singular_values(weights)

        U = self._tile_factor(self.Us[0], batch)
        V = torch.matmul(weights, self.Vs.repeat(batch, 1, 1)).view(batch, -1, self.program_shape[1])
        US = torch.matmul(U, S)
        USV = torch.matmul(US, V.permute(0, 2, 1))

        for i in range(1, self.program_size):
            U = self._tile_factor(self.Us[i], batch)
            V = self._tile_factor(self.Vs[i], batch)
            US = torch.matmul(U, S)
            USV = USV + torch.matmul(US, V.permute(0, 2, 1))
        return USV

    def linear_svd_read_da(self, weights, cg):
        """Sum ``(w_i * U_i) S_i V_i^T``; ``weights[:, :, i]`` scales rows of U_i.

        ``cg`` is accepted for interface compatibility but is not used.
        """
        batch = weights.shape[0]
        S = self.Ss.repeat(batch, 1, 1)
        S = torch.diag_embed(S, offset=0, dim1=-2, dim2=-1)

        USV = None
        for i in range(self.program_size):
            U = self._tile_factor(self.Us[i], batch)*weights[:, :, i].unsqueeze(2)
            V = self._tile_factor(self.Vs[i], batch)
            US = torch.matmul(U, S[:, i, :, :])
            term = torch.matmul(US, V.permute(0, 2, 1))
            USV = term if USV is None else USV + term
        return USV

    def read_program(self, input):
        """Blend the stored programs for ``input`` and apply the result.

        Returns:
            ``input @ W`` per sample, plus the blended bias when
            ``self.has_bias``.

        Raises:
            ValueError: if ``self.program_read_mode`` is not a known mode
                (the original failed later with ``UnboundLocalError``).
        """
        key_size = self.pkey_dim
        batch = input.shape[0]
        n_in, n_out = self.program_shape[0], self.program_shape[1]

        content_weights, keys, strengths = self.attend_program(input)

        mode = self.program_read_mode
        if mode == "linear":
            MP = self.mprogram_weights.repeat(batch, 1, 1)[:, :, key_size:key_size+n_in*n_out]
            working_weight = self.linear_read(MP, content_weights)
        elif mode == "svd":
            working_weight = self.linear_svd_read(content_weights)
        elif mode == "svd1":
            working_weight = self.linear_svd_read1(content_weights)
        elif mode == "svd2":
            working_weight = self.linear_svd_read2(content_weights)
        elif mode == "svd3":
            working_weight = self.linear_svd_read3(content_weights)
        elif mode == "svdda":
            cg = torch.sigmoid(self.code_len_net(input))
            working_weight = self.linear_svd_read_da(content_weights, cg)
        else:
            raise ValueError(f"unknown program_read_mode: {mode!r}")

        o = torch.matmul(input.unsqueeze(1), working_weight.view(batch, n_in, n_out)).squeeze(1)

        if self.has_bias:
            # The bias is blended with the first attention row only.
            # NOTE(review): the listing does not show whether the original
            # ``[:, 0, :]`` slice applied to "svdda" only; this form accepts
            # both 2-D and 3-D weights.
            if content_weights.dim() == 3:
                content_weights = content_weights[:, 0, :]
            content_weights = content_weights.unsqueeze(1)
            biases = self.mprogram_weights.repeat(batch, 1, 1)[:, :, -n_out:]
            o = o + torch.matmul(content_weights, biases).squeeze(1)
        return o

    def forward(self, x):
        """Refresh the program keys, then read and apply the program for ``x``."""
        self.updateMK()
        return self.read_program(x)
/cl/outputs/split_MNIST_incremental_domain /Offline.log : -------------------------------------------------------------------------------- 1 | split_boundaries: [0, 2, 4, 6, 8, 10] 2 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]} 3 | FF mlp 4 | MLP( 5 | (linear): Sequential( 6 | (0): Linear(in_features=1024, out_features=400, bias=False) 7 | (1): ReLU(inplace=True) 8 | (2): Linear(in_features=400, out_features=400, bias=False) 9 | (3): ReLU(inplace=True) 10 | ) 11 | (last): ModuleDict( 12 | (All): Linear(in_features=400, out_features=2, bias=True) 13 | ) 14 | ) 15 | #parameter of model: 570402 16 | no learnable params: 570402 17 | Task order: ['1', '2', '3', '4', '5'] 18 | Epoch:0 19 | LR: 0.001 20 | Itr Time Data Loss Acc 21 | [0/469] 0.5880 (0.5880) 0.0997 (0.0997) 0.690 (0.690) 48.44 (48.44) 22 | [100/469] 0.0037 (0.0177) 0.0015 (0.0101) 0.051 (0.194) 97.66 (92.37) 23 | [200/469] 0.0023 (0.0147) 0.0003 (0.0095) 0.066 (0.145) 97.66 (94.50) 24 | [300/469] 0.0034 (0.0136) 0.0003 (0.0093) 0.049 (0.123) 98.44 (95.37) 25 | [400/469] 0.0024 (0.0131) 0.0003 (0.0091) 0.102 (0.109) 96.09 (95.95) 26 | [468/469] 0.0023 (0.0130) 0.0002 (0.0091) 0.067 (0.103) 95.83 (96.22) 27 | * Train Acc 96.218 28 | * Val Acc 97.960, Total time 1.00 29 | Epoch:1 30 | LR: 0.001 31 | Itr Time Data Loss Acc 32 | [0/469] 0.1212 (0.1212) 0.1178 (0.1178) 0.039 (0.039) 98.44 (98.44) 33 | [100/469] 0.0091 (0.0125) 0.0065 (0.0099) 0.011 (0.048) 100.00 (98.37) 34 | [200/469] 0.0264 (0.0124) 0.0239 (0.0094) 0.086 (0.049) 97.66 (98.31) 35 | [300/469] 0.0057 (0.0120) 0.0034 (0.0091) 0.043 (0.050) 98.44 (98.25) 36 | [400/469] 0.0355 (0.0122) 0.0287 (0.0090) 0.023 (0.049) 99.22 (98.30) 37 | [468/469] 0.0031 (0.0120) 0.0010 (0.0089) 0.007 (0.050) 100.00 (98.26) 38 | * Train Acc 98.260 39 | * Val Acc 98.130, Total time 1.00 40 | Epoch:2 41 | LR: 0.001 42 | Itr Time Data Loss Acc 43 | [0/469] 0.1228 (0.1228) 0.1182 (0.1182) 0.026 (0.026) 99.22 (99.22) 44 | [100/469] 0.0224 (0.0126) 
0.0201 (0.0100) 0.049 (0.035) 97.66 (98.69) 45 | [200/469] 0.0099 (0.0121) 0.0075 (0.0094) 0.027 (0.034) 99.22 (98.80) 46 | [300/469] 0.0025 (0.0119) 0.0003 (0.0092) 0.008 (0.035) 100.00 (98.77) 47 | [400/469] 0.0067 (0.0120) 0.0025 (0.0091) 0.016 (0.036) 99.22 (98.73) 48 | [468/469] 0.0021 (0.0119) 0.0002 (0.0091) 0.004 (0.036) 100.00 (98.73) 49 | * Train Acc 98.732 50 | * Val Acc 98.600, Total time 1.03 51 | Epoch:3 52 | LR: 0.001 53 | Itr Time Data Loss Acc 54 | [0/469] 0.1898 (0.1898) 0.1863 (0.1863) 0.016 (0.016) 100.00 (100.00) 55 | [100/469] 0.0040 (0.0134) 0.0015 (0.0108) 0.014 (0.025) 100.00 (99.13) 56 | [200/469] 0.0044 (0.0123) 0.0018 (0.0094) 0.015 (0.027) 99.22 (99.08) 57 | [300/469] 0.0081 (0.0120) 0.0057 (0.0092) 0.014 (0.026) 99.22 (99.10) 58 | [400/469] 0.0022 (0.0121) 0.0003 (0.0092) 0.018 (0.026) 99.22 (99.06) 59 | [468/469] 0.0022 (0.0120) 0.0002 (0.0092) 0.011 (0.028) 100.00 (99.02) 60 | * Train Acc 99.020 61 | * Val Acc 98.840, Total time 1.09 62 | * Val Acc 98.840, Total time 1.01 63 | OrderedDict([('All', {'All': 98.84})]) 64 | Task All average acc: 98.84 65 | ===Summary of experiment repeats: 1 / 10 === 66 | The regularization coefficient: 0.0 67 | The last avg acc of all repeats: [98.84 0. 0. 0. 0. 0. 0. 0. 0. 0. 
] 68 | mean: 9.884 std: 29.652000000000005 69 | split_boundaries: [0, 2, 4, 6, 8, 10] 70 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]} 71 | FF mlp 72 | MLP( 73 | (linear): Sequential( 74 | (0): Linear(in_features=1024, out_features=400, bias=False) 75 | (1): ReLU(inplace=True) 76 | (2): Linear(in_features=400, out_features=400, bias=False) 77 | (3): ReLU(inplace=True) 78 | ) 79 | (last): ModuleDict( 80 | (All): Linear(in_features=400, out_features=2, bias=True) 81 | ) 82 | ) 83 | #parameter of model: 570402 84 | no learnable params: 570402 85 | Task order: ['1', '2', '3', '4', '5'] 86 | Epoch:0 87 | LR: 0.001 88 | Itr Time Data Loss Acc 89 | [0/469] 0.1353 (0.1353) 0.1312 (0.1312) 0.689 (0.689) 50.78 (50.78) 90 | [100/469] 0.0250 (0.0128) 0.0224 (0.0103) 0.088 (0.202) 96.09 (91.53) 91 | [200/469] 0.0153 (0.0121) 0.0128 (0.0094) 0.061 (0.146) 97.66 (94.15) 92 | [300/469] 0.0040 (0.0119) 0.0015 (0.0092) 0.083 (0.122) 97.66 (95.22) 93 | [400/469] 0.0024 (0.0120) 0.0003 (0.0093) 0.032 (0.111) 99.22 (95.78) 94 | [468/469] 0.0023 (0.0119) 0.0002 (0.0093) 0.028 (0.104) 98.96 (96.06) 95 | * Train Acc 96.060 96 | * Val Acc 97.890, Total time 1.00 97 | Epoch:1 98 | LR: 0.001 99 | Itr Time Data Loss Acc 100 | [0/469] 0.1581 (0.1581) 0.1544 (0.1544) 0.050 (0.050) 98.44 (98.44) 101 | [100/469] 0.0225 (0.0135) 0.0154 (0.0101) 0.019 (0.052) 99.22 (98.15) 102 | [200/469] 0.0069 (0.0127) 0.0044 (0.0094) 0.031 (0.050) 99.22 (98.18) 103 | [300/469] 0.0102 (0.0123) 0.0081 (0.0091) 0.033 (0.051) 99.22 (98.19) 104 | [400/469] 0.0031 (0.0120) 0.0007 (0.0090) 0.055 (0.050) 97.66 (98.24) 105 | [468/469] 0.0021 (0.0119) 0.0002 (0.0089) 0.037 (0.049) 98.96 (98.28) 106 | * Train Acc 98.285 107 | * Val Acc 98.670, Total time 0.99 108 | Epoch:2 109 | LR: 0.001 110 | Itr Time Data Loss Acc 111 | [0/469] 0.2354 (0.2354) 0.2297 (0.2297) 0.027 (0.027) 99.22 (99.22) 112 | [100/469] 0.0037 (0.0143) 0.0014 (0.0114) 0.087 (0.032) 96.88 (98.85) 113 | [200/469] 0.0024 (0.0133) 
0.0003 (0.0105) 0.018 (0.036) 100.00 (98.71) 114 | [300/469] 0.0025 (0.0126) 0.0002 (0.0100) 0.056 (0.036) 98.44 (98.73) 115 | [400/469] 0.0268 (0.0123) 0.0244 (0.0098) 0.020 (0.036) 99.22 (98.73) 116 | [468/469] 0.0023 (0.0122) 0.0002 (0.0096) 0.021 (0.036) 98.96 (98.74) 117 | * Train Acc 98.738 118 | * Val Acc 98.540, Total time 1.08 119 | Epoch:3 120 | LR: 0.001 121 | Itr Time Data Loss Acc 122 | [0/469] 0.1093 (0.1093) 0.1058 (0.1058) 0.030 (0.030) 99.22 (99.22) 123 | [100/469] 0.0157 (0.0127) 0.0132 (0.0101) 0.029 (0.029) 99.22 (98.95) 124 | [200/469] 0.0104 (0.0120) 0.0081 (0.0094) 0.039 (0.028) 99.22 (99.02) 125 | [300/469] 0.0044 (0.0120) 0.0021 (0.0093) 0.083 (0.028) 96.88 (99.03) 126 | [400/469] 0.0085 (0.0118) 0.0058 (0.0091) 0.047 (0.028) 97.66 (99.03) 127 | [468/469] 0.0023 (0.0118) 0.0002 (0.0091) 0.043 (0.029) 96.88 (98.99) 128 | * Train Acc 98.988 129 | * Val Acc 98.660, Total time 1.01 130 | * Val Acc 98.660, Total time 0.99 131 | OrderedDict([('All', {'All': 98.66})]) 132 | Task All average acc: 98.66 133 | ===Summary of experiment repeats: 2 / 10 === 134 | The regularization coefficient: 0.0 135 | The last avg acc of all repeats: [98.84 98.66 0. 0. 0. 0. 0. 0. 0. 0. 
] 136 | mean: 19.75 std: 39.50002050632379 137 | split_boundaries: [0, 2, 4, 6, 8, 10] 138 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]} 139 | FF mlp 140 | MLP( 141 | (linear): Sequential( 142 | (0): Linear(in_features=1024, out_features=400, bias=False) 143 | (1): ReLU(inplace=True) 144 | (2): Linear(in_features=400, out_features=400, bias=False) 145 | (3): ReLU(inplace=True) 146 | ) 147 | (last): ModuleDict( 148 | (All): Linear(in_features=400, out_features=2, bias=True) 149 | ) 150 | ) 151 | #parameter of model: 570402 152 | no learnable params: 570402 153 | Task order: ['1', '2', '3', '4', '5'] 154 | Epoch:0 155 | LR: 0.001 156 | Itr Time Data Loss Acc 157 | [0/469] 0.1701 (0.1701) 0.1658 (0.1658) 0.696 (0.696) 45.31 (45.31) 158 | [100/469] 0.0276 (0.0140) 0.0254 (0.0113) 0.136 (0.195) 95.31 (92.06) 159 | [200/469] 0.0038 (0.0131) 0.0014 (0.0103) 0.094 (0.146) 96.09 (94.20) 160 | [300/469] 0.0308 (0.0125) 0.0284 (0.0099) 0.047 (0.122) 99.22 (95.25) 161 | [400/469] 0.0251 (0.0123) 0.0227 (0.0097) 0.045 (0.108) 98.44 (95.84) 162 | [468/469] 0.0024 (0.0121) 0.0002 (0.0095) 0.123 (0.102) 96.88 (96.11) 163 | * Train Acc 96.110 164 | * Val Acc 97.900, Total time 0.99 165 | Epoch:1 166 | LR: 0.001 167 | Itr Time Data Loss Acc 168 | [0/469] 0.1156 (0.1156) 0.1122 (0.1122) 0.036 (0.036) 99.22 (99.22) 169 | [100/469] 0.0026 (0.0128) 0.0004 (0.0100) 0.053 (0.053) 97.66 (98.23) 170 | [200/469] 0.0050 (0.0124) 0.0021 (0.0094) 0.120 (0.055) 97.66 (98.08) 171 | [300/469] 0.0133 (0.0121) 0.0100 (0.0092) 0.069 (0.053) 96.88 (98.15) 172 | [400/469] 0.0302 (0.0120) 0.0277 (0.0092) 0.072 (0.051) 97.66 (98.24) 173 | [468/469] 0.0162 (0.0120) 0.0140 (0.0091) 0.026 (0.050) 98.96 (98.26) 174 | * Train Acc 98.258 175 | * Val Acc 98.580, Total time 1.01 176 | Epoch:2 177 | LR: 0.001 178 | Itr Time Data Loss Acc 179 | [0/469] 0.1011 (0.1011) 0.0981 (0.0981) 0.048 (0.048) 97.66 (97.66) 180 | [100/469] 0.0123 (0.0125) 0.0100 (0.0100) 0.061 (0.034) 96.88 (98.82) 181 | 
[200/469] 0.0061 (0.0120) 0.0037 (0.0094) 0.015 (0.036) 99.22 (98.75) 182 | [300/469] 0.0394 (0.0123) 0.0368 (0.0096) 0.037 (0.039) 98.44 (98.65) 183 | [400/469] 0.0029 (0.0120) 0.0003 (0.0093) 0.048 (0.038) 97.66 (98.69) 184 | [468/469] 0.0130 (0.0120) 0.0109 (0.0093) 0.051 (0.037) 98.96 (98.70) 185 | * Train Acc 98.700 186 | * Val Acc 98.570, Total time 0.99 187 | Epoch:3 188 | LR: 0.001 189 | Itr Time Data Loss Acc 190 | [0/469] 0.1563 (0.1563) 0.1494 (0.1494) 0.046 (0.046) 98.44 (98.44) 191 | [100/469] 0.0027 (0.0128) 0.0003 (0.0102) 0.075 (0.030) 96.88 (98.92) 192 | [200/469] 0.0027 (0.0127) 0.0003 (0.0097) 0.044 (0.030) 98.44 (98.92) 193 | [300/469] 0.0022 (0.0122) 0.0003 (0.0094) 0.008 (0.029) 100.00 (98.94) 194 | [400/469] 0.0204 (0.0120) 0.0180 (0.0092) 0.032 (0.029) 98.44 (98.95) 195 | [468/469] 0.0186 (0.0120) 0.0159 (0.0091) 0.030 (0.029) 98.96 (98.92) 196 | * Train Acc 98.917 197 | * Val Acc 98.850, Total time 1.09 198 | * Val Acc 98.850, Total time 1.05 199 | OrderedDict([('All', {'All': 98.85})]) 200 | Task All average acc: 98.85 201 | ===Summary of experiment repeats: 3 / 10 === 202 | The regularization coefficient: 0.0 203 | The last avg acc of all repeats: [98.84 98.66 98.85 0. 0. 0. 0. 0. 0. 0. 
] 204 | mean: 29.635 std: 45.2682354968691 205 | split_boundaries: [0, 2, 4, 6, 8, 10] 206 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]} 207 | FF mlp 208 | MLP( 209 | (linear): Sequential( 210 | (0): Linear(in_features=1024, out_features=400, bias=False) 211 | (1): ReLU(inplace=True) 212 | (2): Linear(in_features=400, out_features=400, bias=False) 213 | (3): ReLU(inplace=True) 214 | ) 215 | (last): ModuleDict( 216 | (All): Linear(in_features=400, out_features=2, bias=True) 217 | ) 218 | ) 219 | #parameter of model: 570402 220 | no learnable params: 570402 221 | Task order: ['1', '2', '3', '4', '5'] 222 | Epoch:0 223 | LR: 0.001 224 | Itr Time Data Loss Acc 225 | [0/469] 0.1238 (0.1238) 0.1161 (0.1161) 0.699 (0.699) 50.00 (50.00) 226 | [100/469] 0.0051 (0.0124) 0.0022 (0.0098) 0.121 (0.200) 95.31 (91.48) 227 | [200/469] 0.0286 (0.0125) 0.0264 (0.0098) 0.072 (0.149) 96.88 (93.93) 228 | [300/469] 0.0026 (0.0122) 0.0003 (0.0096) 0.032 (0.127) 98.44 (94.95) 229 | [400/469] 0.0051 (0.0126) 0.0026 (0.0096) 0.039 (0.114) 98.44 (95.60) 230 | [468/469] 0.0019 (0.0125) 0.0001 (0.0096) 0.051 (0.106) 98.96 (95.94) 231 | * Train Acc 95.940 232 | * Val Acc 97.700, Total time 1.01 233 | Epoch:1 234 | LR: 0.001 235 | Itr Time Data Loss Acc 236 | [0/469] 0.1316 (0.1316) 0.1278 (0.1278) 0.068 (0.068) 96.88 (96.88) 237 | [100/469] 0.0245 (0.0126) 0.0202 (0.0100) 0.099 (0.053) 97.66 (98.14) 238 | [200/469] 0.0027 (0.0121) 0.0003 (0.0096) 0.028 (0.052) 100.00 (98.19) 239 | [300/469] 0.0025 (0.0118) 0.0003 (0.0092) 0.029 (0.053) 99.22 (98.16) 240 | [400/469] 0.0054 (0.0117) 0.0028 (0.0092) 0.042 (0.051) 99.22 (98.23) 241 | [468/469] 0.0073 (0.0118) 0.0015 (0.0090) 0.044 (0.050) 98.96 (98.28) 242 | * Train Acc 98.275 243 | * Val Acc 98.350, Total time 1.07 244 | Epoch:2 245 | LR: 0.001 246 | Itr Time Data Loss Acc 247 | [0/469] 0.1341 (0.1341) 0.1304 (0.1304) 0.051 (0.051) 97.66 (97.66) 248 | [100/469] 0.0250 (0.0128) 0.0223 (0.0102) 0.064 (0.033) 97.66 (98.84) 249 | 
[200/469] 0.0022 (0.0123) 0.0003 (0.0096) 0.149 (0.035) 96.09 (98.75) 250 | [300/469] 0.0093 (0.0120) 0.0053 (0.0093) 0.016 (0.035) 100.00 (98.73) 251 | [400/469] 0.0278 (0.0119) 0.0254 (0.0093) 0.025 (0.035) 99.22 (98.75) 252 | [468/469] 0.0033 (0.0118) 0.0011 (0.0092) 0.031 (0.035) 98.96 (98.74) 253 | * Train Acc 98.738 254 | * Val Acc 98.810, Total time 1.07 255 | Epoch:3 256 | LR: 0.001 257 | Itr Time Data Loss Acc 258 | [0/469] 0.1699 (0.1699) 0.1663 (0.1663) 0.013 (0.013) 100.00 (100.00) 259 | [100/469] 0.0025 (0.0128) 0.0003 (0.0102) 0.004 (0.026) 100.00 (99.17) 260 | [200/469] 0.0273 (0.0122) 0.0211 (0.0095) 0.004 (0.025) 100.00 (99.17) 261 | [300/469] 0.0025 (0.0120) 0.0003 (0.0093) 0.034 (0.026) 99.22 (99.10) 262 | [400/469] 0.0075 (0.0118) 0.0003 (0.0092) 0.015 (0.027) 99.22 (99.08) 263 | [468/469] 0.0108 (0.0118) 0.0079 (0.0092) 0.016 (0.028) 100.00 (99.04) 264 | * Train Acc 99.043 265 | * Val Acc 98.670, Total time 1.06 266 | * Val Acc 98.670, Total time 1.00 267 | OrderedDict([('All', {'All': 98.67})]) 268 | Task All average acc: 98.67 269 | ===Summary of experiment repeats: 4 / 10 === 270 | The regularization coefficient: 0.0 271 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 0. 0. 0. 0. 0. 0. 
] 272 | mean: 39.501999999999995 std: 48.37990549804743 273 | split_boundaries: [0, 2, 4, 6, 8, 10] 274 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]} 275 | FF mlp 276 | MLP( 277 | (linear): Sequential( 278 | (0): Linear(in_features=1024, out_features=400, bias=False) 279 | (1): ReLU(inplace=True) 280 | (2): Linear(in_features=400, out_features=400, bias=False) 281 | (3): ReLU(inplace=True) 282 | ) 283 | (last): ModuleDict( 284 | (All): Linear(in_features=400, out_features=2, bias=True) 285 | ) 286 | ) 287 | #parameter of model: 570402 288 | no learnable params: 570402 289 | Task order: ['1', '2', '3', '4', '5'] 290 | Epoch:0 291 | LR: 0.001 292 | Itr Time Data Loss Acc 293 | [0/469] 0.1477 (0.1477) 0.1437 (0.1437) 0.697 (0.697) 54.69 (54.69) 294 | [100/469] 0.0022 (0.0136) 0.0003 (0.0109) 0.110 (0.192) 95.31 (92.01) 295 | [200/469] 0.0191 (0.0129) 0.0167 (0.0101) 0.077 (0.142) 96.88 (94.36) 296 | [300/469] 0.0025 (0.0124) 0.0003 (0.0098) 0.084 (0.119) 96.88 (95.36) 297 | [400/469] 0.0026 (0.0122) 0.0003 (0.0096) 0.079 (0.109) 98.44 (95.85) 298 | [468/469] 0.0021 (0.0121) 0.0002 (0.0095) 0.096 (0.102) 96.88 (96.16) 299 | * Train Acc 96.155 300 | * Val Acc 98.090, Total time 1.03 301 | Epoch:1 302 | LR: 0.001 303 | Itr Time Data Loss Acc 304 | [0/469] 0.2470 (0.2470) 0.2407 (0.2407) 0.075 (0.075) 97.66 (97.66) 305 | [100/469] 0.0025 (0.0139) 0.0003 (0.0110) 0.052 (0.050) 97.66 (98.14) 306 | [200/469] 0.0043 (0.0126) 0.0021 (0.0099) 0.066 (0.050) 97.66 (98.19) 307 | [300/469] 0.0072 (0.0126) 0.0051 (0.0096) 0.175 (0.050) 96.09 (98.24) 308 | [400/469] 0.0202 (0.0123) 0.0178 (0.0094) 0.096 (0.049) 99.22 (98.32) 309 | [468/469] 0.0027 (0.0121) 0.0008 (0.0092) 0.006 (0.048) 100.00 (98.35) 310 | * Train Acc 98.347 311 | * Val Acc 98.490, Total time 1.01 312 | Epoch:2 313 | LR: 0.001 314 | Itr Time Data Loss Acc 315 | [0/469] 0.1382 (0.1382) 0.1342 (0.1342) 0.030 (0.030) 98.44 (98.44) 316 | [100/469] 0.0049 (0.0137) 0.0024 (0.0110) 0.015 (0.031) 99.22 
(98.94) 317 | [200/469] 0.0248 (0.0128) 0.0223 (0.0101) 0.069 (0.034) 95.31 (98.81) 318 | [300/469] 0.0024 (0.0126) 0.0003 (0.0100) 0.026 (0.034) 98.44 (98.76) 319 | [400/469] 0.0043 (0.0123) 0.0019 (0.0097) 0.018 (0.035) 100.00 (98.76) 320 | [468/469] 0.0019 (0.0122) 0.0001 (0.0096) 0.013 (0.036) 100.00 (98.70) 321 | * Train Acc 98.702 322 | * Val Acc 98.430, Total time 1.02 323 | Epoch:3 324 | LR: 0.001 325 | Itr Time Data Loss Acc 326 | [0/469] 0.1211 (0.1211) 0.1176 (0.1176) 0.010 (0.010) 100.00 (100.00) 327 | [100/469] 0.0235 (0.0126) 0.0212 (0.0100) 0.034 (0.025) 98.44 (99.09) 328 | [200/469] 0.0231 (0.0125) 0.0207 (0.0096) 0.057 (0.027) 98.44 (99.03) 329 | [300/469] 0.0120 (0.0122) 0.0096 (0.0093) 0.063 (0.027) 96.88 (99.05) 330 | [400/469] 0.0078 (0.0120) 0.0015 (0.0090) 0.019 (0.028) 99.22 (99.01) 331 | [468/469] 0.0211 (0.0121) 0.0189 (0.0091) 0.008 (0.028) 100.00 (99.01) 332 | * Train Acc 99.013 333 | * Val Acc 98.780, Total time 0.98 334 | * Val Acc 98.780, Total time 1.04 335 | OrderedDict([('All', {'All': 98.78})]) 336 | Task All average acc: 98.78 337 | ===Summary of experiment repeats: 5 / 10 === 338 | The regularization coefficient: 0.0 339 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 0. 0. 0. 0. 0. 
] 340 | mean: 49.379999999999995 std: 49.38003341432648 341 | split_boundaries: [0, 2, 4, 6, 8, 10] 342 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]} 343 | FF mlp 344 | MLP( 345 | (linear): Sequential( 346 | (0): Linear(in_features=1024, out_features=400, bias=False) 347 | (1): ReLU(inplace=True) 348 | (2): Linear(in_features=400, out_features=400, bias=False) 349 | (3): ReLU(inplace=True) 350 | ) 351 | (last): ModuleDict( 352 | (All): Linear(in_features=400, out_features=2, bias=True) 353 | ) 354 | ) 355 | #parameter of model: 570402 356 | no learnable params: 570402 357 | Task order: ['1', '2', '3', '4', '5'] 358 | Epoch:0 359 | LR: 0.001 360 | Itr Time Data Loss Acc 361 | [0/469] 0.1113 (0.1113) 0.1071 (0.1071) 0.681 (0.681) 53.12 (53.12) 362 | [100/469] 0.0063 (0.0127) 0.0039 (0.0102) 0.127 (0.195) 93.75 (92.13) 363 | [200/469] 0.0023 (0.0123) 0.0003 (0.0098) 0.066 (0.147) 97.66 (94.27) 364 | [300/469] 0.0126 (0.0125) 0.0099 (0.0099) 0.026 (0.124) 99.22 (95.29) 365 | [400/469] 0.0343 (0.0123) 0.0319 (0.0097) 0.104 (0.110) 97.66 (95.86) 366 | [468/469] 0.0024 (0.0122) 0.0002 (0.0096) 0.068 (0.105) 97.92 (96.09) 367 | * Train Acc 96.087 368 | * Val Acc 98.420, Total time 1.01 369 | Epoch:1 370 | LR: 0.001 371 | Itr Time Data Loss Acc 372 | [0/469] 0.1149 (0.1149) 0.1114 (0.1114) 0.047 (0.047) 99.22 (99.22) 373 | [100/469] 0.0031 (0.0126) 0.0007 (0.0101) 0.078 (0.051) 96.88 (98.18) 374 | [200/469] 0.0041 (0.0127) 0.0020 (0.0095) 0.071 (0.054) 95.31 (98.09) 375 | [300/469] 0.0113 (0.0123) 0.0089 (0.0093) 0.038 (0.050) 99.22 (98.26) 376 | [400/469] 0.0024 (0.0121) 0.0003 (0.0091) 0.085 (0.051) 97.66 (98.23) 377 | [468/469] 0.0049 (0.0120) 0.0029 (0.0091) 0.005 (0.050) 100.00 (98.26) 378 | * Train Acc 98.260 379 | * Val Acc 98.380, Total time 1.03 380 | Epoch:2 381 | LR: 0.001 382 | Itr Time Data Loss Acc 383 | [0/469] 0.1301 (0.1301) 0.1266 (0.1266) 0.014 (0.014) 100.00 (100.00) 384 | [100/469] 0.0038 (0.0137) 0.0014 (0.0104) 0.023 (0.034) 
99.22 (98.82) 385 | [200/469] 0.0115 (0.0126) 0.0090 (0.0096) 0.065 (0.036) 98.44 (98.76) 386 | [300/469] 0.0046 (0.0122) 0.0021 (0.0093) 0.015 (0.036) 98.44 (98.70) 387 | [400/469] 0.0052 (0.0120) 0.0031 (0.0092) 0.039 (0.038) 98.44 (98.65) 388 | [468/469] 0.0056 (0.0121) 0.0034 (0.0093) 0.032 (0.038) 98.96 (98.66) 389 | * Train Acc 98.663 390 | * Val Acc 98.460, Total time 1.03 391 | Epoch:3 392 | LR: 0.001 393 | Itr Time Data Loss Acc 394 | [0/469] 0.1454 (0.1454) 0.1418 (0.1418) 0.046 (0.046) 99.22 (99.22) 395 | [100/469] 0.0025 (0.0129) 0.0003 (0.0103) 0.041 (0.027) 96.88 (98.96) 396 | [200/469] 0.0029 (0.0123) 0.0003 (0.0096) 0.028 (0.027) 99.22 (99.01) 397 | [300/469] 0.0045 (0.0124) 0.0021 (0.0097) 0.017 (0.027) 100.00 (99.00) 398 | [400/469] 0.0025 (0.0122) 0.0003 (0.0094) 0.056 (0.028) 99.22 (98.99) 399 | [468/469] 0.0210 (0.0121) 0.0189 (0.0094) 0.018 (0.029) 98.96 (98.97) 400 | * Train Acc 98.967 401 | * Val Acc 98.450, Total time 1.05 402 | * Val Acc 98.450, Total time 1.02 403 | OrderedDict([('All', {'All': 98.45})]) 404 | Task All average acc: 98.45 405 | ===Summary of experiment repeats: 6 / 10 === 406 | The regularization coefficient: 0.0 407 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 98.45 0. 0. 0. 0. 
] 408 | mean: 59.225 std: 48.35712693078446 409 | split_boundaries: [0, 2, 4, 6, 8, 10] 410 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]} 411 | FF mlp 412 | MLP( 413 | (linear): Sequential( 414 | (0): Linear(in_features=1024, out_features=400, bias=False) 415 | (1): ReLU(inplace=True) 416 | (2): Linear(in_features=400, out_features=400, bias=False) 417 | (3): ReLU(inplace=True) 418 | ) 419 | (last): ModuleDict( 420 | (All): Linear(in_features=400, out_features=2, bias=True) 421 | ) 422 | ) 423 | #parameter of model: 570402 424 | no learnable params: 570402 425 | Task order: ['1', '2', '3', '4', '5'] 426 | Epoch:0 427 | LR: 0.001 428 | Itr Time Data Loss Acc 429 | [0/469] 0.1043 (0.1043) 0.0988 (0.0988) 0.687 (0.687) 61.72 (61.72) 430 | [100/469] 0.0025 (0.0125) 0.0004 (0.0100) 0.076 (0.197) 96.09 (91.82) 431 | [200/469] 0.0027 (0.0119) 0.0003 (0.0093) 0.146 (0.147) 94.53 (94.12) 432 | [300/469] 0.0315 (0.0119) 0.0270 (0.0093) 0.059 (0.126) 97.66 (95.03) 433 | [400/469] 0.0079 (0.0120) 0.0037 (0.0091) 0.041 (0.113) 99.22 (95.62) 434 | [468/469] 0.0202 (0.0119) 0.0181 (0.0090) 0.087 (0.106) 96.88 (95.90) 435 | * Train Acc 95.900 436 | * Val Acc 97.920, Total time 1.10 437 | Epoch:1 438 | LR: 0.001 439 | Itr Time Data Loss Acc 440 | [0/469] 0.1452 (0.1452) 0.1416 (0.1416) 0.079 (0.079) 98.44 (98.44) 441 | [100/469] 0.0245 (0.0129) 0.0222 (0.0103) 0.050 (0.055) 97.66 (97.98) 442 | [200/469] 0.0102 (0.0121) 0.0078 (0.0096) 0.036 (0.052) 99.22 (98.08) 443 | [300/469] 0.0223 (0.0119) 0.0198 (0.0093) 0.124 (0.052) 96.09 (98.11) 444 | [400/469] 0.0025 (0.0120) 0.0003 (0.0091) 0.035 (0.052) 96.88 (98.14) 445 | [468/469] 0.0025 (0.0119) 0.0002 (0.0091) 0.012 (0.050) 100.00 (98.23) 446 | * Train Acc 98.235 447 | * Val Acc 98.430, Total time 1.00 448 | Epoch:2 449 | LR: 0.001 450 | Itr Time Data Loss Acc 451 | [0/469] 0.1390 (0.1390) 0.1355 (0.1355) 0.014 (0.014) 99.22 (99.22) 452 | [100/469] 0.0027 (0.0129) 0.0003 (0.0104) 0.026 (0.034) 100.00 (98.70) 453 
| [200/469] 0.0039 (0.0121) 0.0016 (0.0096) 0.013 (0.035) 100.00 (98.75) 454 | [300/469] 0.0026 (0.0120) 0.0003 (0.0094) 0.003 (0.035) 100.00 (98.72) 455 | [400/469] 0.0025 (0.0121) 0.0003 (0.0094) 0.075 (0.036) 98.44 (98.71) 456 | [468/469] 0.0136 (0.0120) 0.0110 (0.0094) 0.013 (0.036) 100.00 (98.72) 457 | * Train Acc 98.717 458 | * Val Acc 98.360, Total time 1.01 459 | Epoch:3 460 | LR: 0.001 461 | Itr Time Data Loss Acc 462 | [0/469] 0.1502 (0.1502) 0.1463 (0.1463) 0.026 (0.026) 99.22 (99.22) 463 | [100/469] 0.0199 (0.0129) 0.0172 (0.0103) 0.047 (0.030) 96.88 (98.97) 464 | [200/469] 0.0036 (0.0122) 0.0014 (0.0095) 0.052 (0.028) 97.66 (99.06) 465 | [300/469] 0.0024 (0.0123) 0.0003 (0.0093) 0.026 (0.028) 99.22 (99.03) 466 | [400/469] 0.0166 (0.0121) 0.0143 (0.0092) 0.024 (0.028) 99.22 (99.03) 467 | [468/469] 0.0022 (0.0120) 0.0002 (0.0091) 0.057 (0.028) 96.88 (99.01) 468 | * Train Acc 99.012 469 | * Val Acc 98.470, Total time 0.99 470 | * Val Acc 98.470, Total time 1.07 471 | OrderedDict([('All', {'All': 98.47})]) 472 | Task All average acc: 98.47 473 | ===Summary of experiment repeats: 7 / 10 === 474 | The regularization coefficient: 0.0 475 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 98.45 98.47 0. 0. 0. 
] 476 | mean: 69.072 std: 45.21841722130486 477 | split_boundaries: [0, 2, 4, 6, 8, 10] 478 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]} 479 | FF mlp 480 | MLP( 481 | (linear): Sequential( 482 | (0): Linear(in_features=1024, out_features=400, bias=False) 483 | (1): ReLU(inplace=True) 484 | (2): Linear(in_features=400, out_features=400, bias=False) 485 | (3): ReLU(inplace=True) 486 | ) 487 | (last): ModuleDict( 488 | (All): Linear(in_features=400, out_features=2, bias=True) 489 | ) 490 | ) 491 | #parameter of model: 570402 492 | no learnable params: 570402 493 | Task order: ['1', '2', '3', '4', '5'] 494 | Epoch:0 495 | LR: 0.001 496 | Itr Time Data Loss Acc 497 | [0/469] 0.1434 (0.1434) 0.1396 (0.1396) 0.725 (0.725) 44.53 (44.53) 498 | [100/469] 0.0169 (0.0127) 0.0137 (0.0101) 0.201 (0.216) 92.97 (90.67) 499 | [200/469] 0.0124 (0.0126) 0.0071 (0.0097) 0.116 (0.157) 92.97 (93.62) 500 | [300/469] 0.0025 (0.0122) 0.0003 (0.0094) 0.054 (0.130) 98.44 (94.85) 501 | [400/469] 0.0022 (0.0120) 0.0003 (0.0092) 0.165 (0.116) 95.31 (95.47) 502 | [468/469] 0.0022 (0.0120) 0.0002 (0.0092) 0.043 (0.109) 98.96 (95.79) 503 | * Train Acc 95.787 504 | * Val Acc 97.520, Total time 1.02 505 | Epoch:1 506 | LR: 0.001 507 | Itr Time Data Loss Acc 508 | [0/469] 0.1120 (0.1120) 0.1083 (0.1083) 0.065 (0.065) 97.66 (97.66) 509 | [100/469] 0.0022 (0.0133) 0.0003 (0.0105) 0.048 (0.049) 97.66 (98.36) 510 | [200/469] 0.0045 (0.0125) 0.0003 (0.0099) 0.096 (0.050) 96.09 (98.30) 511 | [300/469] 0.0042 (0.0123) 0.0003 (0.0095) 0.053 (0.050) 98.44 (98.32) 512 | [400/469] 0.0026 (0.0122) 0.0003 (0.0094) 0.107 (0.050) 96.09 (98.31) 513 | [468/469] 0.0019 (0.0121) 0.0001 (0.0094) 0.015 (0.049) 100.00 (98.35) 514 | * Train Acc 98.348 515 | * Val Acc 98.570, Total time 0.98 516 | Epoch:2 517 | LR: 0.001 518 | Itr Time Data Loss Acc 519 | [0/469] 0.1443 (0.1443) 0.1407 (0.1407) 0.021 (0.021) 99.22 (99.22) 520 | [100/469] 0.0094 (0.0128) 0.0071 (0.0103) 0.042 (0.036) 98.44 (98.82) 521 
| [200/469] 0.0060 (0.0127) 0.0003 (0.0100) 0.063 (0.037) 96.09 (98.73) 522 | [300/469] 0.0085 (0.0123) 0.0047 (0.0096) 0.019 (0.037) 99.22 (98.74) 523 | [400/469] 0.0030 (0.0123) 0.0003 (0.0096) 0.012 (0.037) 99.22 (98.71) 524 | [468/469] 0.0020 (0.0122) 0.0002 (0.0095) 0.026 (0.038) 97.92 (98.69) 525 | * Train Acc 98.687 526 | * Val Acc 98.540, Total time 1.00 527 | Epoch:3 528 | LR: 0.001 529 | Itr Time Data Loss Acc 530 | [0/469] 0.1162 (0.1162) 0.1128 (0.1128) 0.016 (0.016) 100.00 (100.00) 531 | [100/469] 0.0252 (0.0127) 0.0226 (0.0101) 0.099 (0.031) 98.44 (98.88) 532 | [200/469] 0.0026 (0.0120) 0.0004 (0.0095) 0.009 (0.031) 100.00 (98.85) 533 | [300/469] 0.0028 (0.0119) 0.0004 (0.0093) 0.008 (0.031) 100.00 (98.88) 534 | [400/469] 0.0205 (0.0119) 0.0175 (0.0093) 0.016 (0.031) 99.22 (98.87) 535 | [468/469] 0.0053 (0.0118) 0.0034 (0.0091) 0.021 (0.031) 97.92 (98.88) 536 | * Train Acc 98.880 537 | * Val Acc 98.690, Total time 1.06 538 | * Val Acc 98.690, Total time 1.07 539 | OrderedDict([('All', {'All': 98.69})]) 540 | Task All average acc: 98.69 541 | ===Summary of experiment repeats: 8 / 10 === 542 | The regularization coefficient: 0.0 543 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 98.45 98.47 98.69 0. 0. 
] 544 | mean: 78.941 std: 39.470705200186124 545 | split_boundaries: [0, 2, 4, 6, 8, 10] 546 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]} 547 | FF mlp 548 | MLP( 549 | (linear): Sequential( 550 | (0): Linear(in_features=1024, out_features=400, bias=False) 551 | (1): ReLU(inplace=True) 552 | (2): Linear(in_features=400, out_features=400, bias=False) 553 | (3): ReLU(inplace=True) 554 | ) 555 | (last): ModuleDict( 556 | (All): Linear(in_features=400, out_features=2, bias=True) 557 | ) 558 | ) 559 | #parameter of model: 570402 560 | no learnable params: 570402 561 | Task order: ['1', '2', '3', '4', '5'] 562 | Epoch:0 563 | LR: 0.001 564 | Itr Time Data Loss Acc 565 | [0/469] 0.1476 (0.1476) 0.1435 (0.1435) 0.684 (0.684) 63.28 (63.28) 566 | [100/469] 0.0024 (0.0127) 0.0003 (0.0102) 0.082 (0.200) 98.44 (91.97) 567 | [200/469] 0.0032 (0.0125) 0.0005 (0.0094) 0.055 (0.148) 97.66 (94.24) 568 | [300/469] 0.0200 (0.0122) 0.0177 (0.0092) 0.066 (0.128) 96.88 (95.13) 569 | [400/469] 0.0274 (0.0121) 0.0248 (0.0091) 0.034 (0.113) 98.44 (95.73) 570 | [468/469] 0.0019 (0.0119) 0.0001 (0.0090) 0.015 (0.105) 100.00 (96.03) 571 | * Train Acc 96.032 572 | * Val Acc 98.140, Total time 1.00 573 | Epoch:1 574 | LR: 0.001 575 | Itr Time Data Loss Acc 576 | [0/469] 0.1270 (0.1270) 0.1192 (0.1192) 0.033 (0.033) 98.44 (98.44) 577 | [100/469] 0.0041 (0.0135) 0.0019 (0.0108) 0.042 (0.048) 98.44 (98.22) 578 | [200/469] 0.0028 (0.0125) 0.0003 (0.0099) 0.071 (0.052) 96.88 (98.14) 579 | [300/469] 0.0024 (0.0122) 0.0003 (0.0096) 0.013 (0.051) 100.00 (98.18) 580 | [400/469] 0.0299 (0.0121) 0.0277 (0.0095) 0.029 (0.051) 98.44 (98.19) 581 | [468/469] 0.0022 (0.0119) 0.0001 (0.0093) 0.045 (0.050) 98.96 (98.23) 582 | * Train Acc 98.228 583 | * Val Acc 98.380, Total time 1.00 584 | Epoch:2 585 | LR: 0.001 586 | Itr Time Data Loss Acc 587 | [0/469] 0.1106 (0.1106) 0.1005 (0.1005) 0.048 (0.048) 98.44 (98.44) 588 | [100/469] 0.0023 (0.0137) 0.0003 (0.0110) 0.054 (0.032) 96.88 (98.79) 
589 | [200/469] 0.0280 (0.0127) 0.0259 (0.0101) 0.014 (0.035) 100.00 (98.76) 590 | [300/469] 0.0024 (0.0123) 0.0003 (0.0097) 0.005 (0.036) 100.00 (98.76) 591 | [400/469] 0.0086 (0.0123) 0.0063 (0.0097) 0.054 (0.037) 96.88 (98.73) 592 | [468/469] 0.0119 (0.0123) 0.0059 (0.0096) 0.080 (0.037) 96.88 (98.70) 593 | * Train Acc 98.702 594 | * Val Acc 98.440, Total time 1.09 595 | Epoch:3 596 | LR: 0.001 597 | Itr Time Data Loss Acc 598 | [0/469] 0.1449 (0.1449) 0.1415 (0.1415) 0.010 (0.010) 99.22 (99.22) 599 | [100/469] 0.0035 (0.0128) 0.0014 (0.0104) 0.005 (0.026) 100.00 (99.04) 600 | [200/469] 0.0028 (0.0121) 0.0003 (0.0096) 0.074 (0.027) 98.44 (99.04) 601 | [300/469] 0.0024 (0.0121) 0.0003 (0.0094) 0.047 (0.027) 97.66 (99.02) 602 | [400/469] 0.0043 (0.0119) 0.0005 (0.0093) 0.019 (0.027) 99.22 (99.05) 603 | [468/469] 0.0020 (0.0120) 0.0001 (0.0093) 0.005 (0.027) 100.00 (99.03) 604 | * Train Acc 99.035 605 | * Val Acc 98.820, Total time 1.03 606 | * Val Acc 98.820, Total time 1.03 607 | OrderedDict([('All', {'All': 98.82})]) 608 | Task All average acc: 98.82 609 | ===Summary of experiment repeats: 9 / 10 === 610 | The regularization coefficient: 0.0 611 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 98.45 98.47 98.69 98.82 0. 
] 612 | mean: 88.82300000000001 std: 29.607971240866874 613 | split_boundaries: [0, 2, 4, 6, 8, 10] 614 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]} 615 | FF mlp 616 | MLP( 617 | (linear): Sequential( 618 | (0): Linear(in_features=1024, out_features=400, bias=False) 619 | (1): ReLU(inplace=True) 620 | (2): Linear(in_features=400, out_features=400, bias=False) 621 | (3): ReLU(inplace=True) 622 | ) 623 | (last): ModuleDict( 624 | (All): Linear(in_features=400, out_features=2, bias=True) 625 | ) 626 | ) 627 | #parameter of model: 570402 628 | no learnable params: 570402 629 | Task order: ['1', '2', '3', '4', '5'] 630 | Epoch:0 631 | LR: 0.001 632 | Itr Time Data Loss Acc 633 | [0/469] 0.1118 (0.1118) 0.1081 (0.1081) 0.691 (0.691) 48.44 (48.44) 634 | [100/469] 0.0275 (0.0136) 0.0251 (0.0107) 0.138 (0.198) 95.31 (91.92) 635 | [200/469] 0.0090 (0.0125) 0.0066 (0.0099) 0.063 (0.149) 97.66 (94.10) 636 | [300/469] 0.0027 (0.0121) 0.0003 (0.0094) 0.039 (0.126) 99.22 (95.09) 637 | [400/469] 0.0138 (0.0122) 0.0065 (0.0092) 0.115 (0.112) 97.66 (95.74) 638 | [468/469] 0.0026 (0.0121) 0.0002 (0.0091) 0.158 (0.105) 95.83 (96.02) 639 | * Train Acc 96.017 640 | * Val Acc 97.890, Total time 1.02 641 | Epoch:1 642 | LR: 0.001 643 | Itr Time Data Loss Acc 644 | [0/469] 0.1237 (0.1237) 0.1168 (0.1168) 0.070 (0.070) 97.66 (97.66) 645 | [100/469] 0.0025 (0.0127) 0.0003 (0.0102) 0.036 (0.054) 98.44 (98.13) 646 | [200/469] 0.0195 (0.0125) 0.0173 (0.0098) 0.047 (0.055) 98.44 (98.08) 647 | [300/469] 0.0217 (0.0122) 0.0194 (0.0095) 0.024 (0.055) 100.00 (98.10) 648 | [400/469] 0.0134 (0.0120) 0.0109 (0.0093) 0.067 (0.053) 98.44 (98.16) 649 | [468/469] 0.0122 (0.0119) 0.0098 (0.0093) 0.025 (0.052) 100.00 (98.20) 650 | * Train Acc 98.205 651 | * Val Acc 98.440, Total time 1.07 652 | Epoch:2 653 | LR: 0.001 654 | Itr Time Data Loss Acc 655 | [0/469] 0.1333 (0.1333) 0.1301 (0.1301) 0.017 (0.017) 99.22 (99.22) 656 | [100/469] 0.0159 (0.0129) 0.0137 (0.0104) 0.016 (0.037) 99.22 
(98.69) 657 | [200/469] 0.0285 (0.0122) 0.0261 (0.0097) 0.028 (0.035) 97.66 (98.73) 658 | [300/469] 0.0025 (0.0122) 0.0003 (0.0095) 0.017 (0.036) 99.22 (98.75) 659 | [400/469] 0.0025 (0.0120) 0.0003 (0.0093) 0.057 (0.035) 99.22 (98.78) 660 | [468/469] 0.0207 (0.0120) 0.0186 (0.0093) 0.013 (0.036) 98.96 (98.75) 661 | * Train Acc 98.747 662 | * Val Acc 98.540, Total time 1.01 663 | Epoch:3 664 | LR: 0.001 665 | Itr Time Data Loss Acc 666 | [0/469] 0.1425 (0.1425) 0.1363 (0.1363) 0.009 (0.009) 100.00 (100.00) 667 | [100/469] 0.0080 (0.0138) 0.0046 (0.0102) 0.040 (0.027) 99.22 (98.99) 668 | [200/469] 0.0033 (0.0127) 0.0003 (0.0096) 0.093 (0.027) 98.44 (99.01) 669 | [300/469] 0.0216 (0.0122) 0.0187 (0.0091) 0.010 (0.028) 100.00 (99.00) 670 | [400/469] 0.0040 (0.0121) 0.0016 (0.0091) 0.003 (0.029) 100.00 (98.94) 671 | [468/469] 0.0018 (0.0119) 0.0001 (0.0090) 0.106 (0.030) 98.96 (98.94) 672 | * Train Acc 98.940 673 | * Val Acc 98.470, Total time 0.93 674 | * Val Acc 98.470, Total time 0.89 675 | OrderedDict([('All', {'All': 98.47})]) 676 | Task All average acc: 98.47 677 | ===Summary of experiment repeats: 10 / 10 === 678 | The regularization coefficient: 0.0 679 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 98.45 98.47 98.69 98.82 98.47] 680 | mean: 98.67 std: 0.14993331851192948 681 | reg_coef: 0.0 mean: 98.67 std: 0.14993331851192948 682 | --------------------------------------------------------------------------------