├── cl
    ├── modules
    │   ├── __init__.py
    │   └── criterions.py
    ├── utils
    │   ├── __init__.py
    │   └── metric.py
    ├── dataloaders
    │   ├── __init__.py
    │   ├── base.py
    │   ├── datasetGen.py
    │   └── wrapper.py
    ├── models
    │   ├── __init__.py
    │   ├── controller.py
    │   ├── senet.py
    │   ├── mlp.py
    │   ├── lenet.py
    │   ├── resnet.py
    │   ├── nsa.py
    │   └── nsm.py
    ├── agents
    │   ├── __init__.py
    │   ├── customization.py
    │   ├── exp_replay.py
    │   ├── default.py
    │   └── regularization.py
    ├── scripts
    │   └── split_MNIST_incremental_domain.sh
    ├── iBatchLearn.py
    └── outputs
    │   └── split_MNIST_incremental_domain
    │       └── Offline.log
├── README.md
├── LICENSE
├── requirements.txt
└── .gitignore


/cl/modules/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/cl/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/cl/dataloaders/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/cl/models/__init__.py:
--------------------------------------------------------------------------------
1 | from . import mlp
2 | from . import lenet
3 | from . import resnet
4 | from . import senet


--------------------------------------------------------------------------------
/cl/agents/__init__.py:
--------------------------------------------------------------------------------
1 | from . import default
2 | from . import regularization
3 | from . import customization
4 | from . import exp_replay


--------------------------------------------------------------------------------
/cl/modules/criterions.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | class BCEauto(torch.nn.BCEWithLogitsLoss):
 4 |     """
 5 |     BCE with logits loss + automatically convert the target from class label to one-hot vector
 6 |     """
 7 |     def forward(self, x, y):
 8 |         assert x.ndimension() == 2, 'Input size must be 2D'
 9 |         assert y.numel() == x.size(0), 'The size of input and target doesnt match. Number of input:' + str(x.size(0)) + ' Number of target:' + str(y.numel())
10 |         y_onehot = x.clone().zero_()
11 |         y_onehot.scatter_(1, y.view(-1, 1), 1)
12 | 
13 |         return super(BCEauto, self).forward(x, y_onehot)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Neurocoder
 2 | Code for the Neurocoder paper  
 3 | ICML version: https://proceedings.mlr.press/v162/le22b.html  
 4 | Code Ref:
 5 | - Continual Learning tasks: https://github.com/GT-RIPL/Continual-Learning-Benchmark
 6 | - Other tasks: TBU
 7 | 
 8 | 
 9 | # Setup  
10 | ```
11 | pip install -r requirements.txt
12 | ```
13 | Install any other missing packages as needed.
14 | 
15 | 
16 | # Continual Learning Tasks
17 | 
18 | ```
19 | cd cl
20 | mkdir data
21 | 
22 | ```
23 | Run baseline MLP
24 | ```
25 | ./scripts/split_MNIST_incremental_domain.sh  mlp
26 | ```
27 | Run baseline Neurocoder
28 | ```
29 | ./scripts/split_MNIST_incremental_domain.sh  nsa
30 | ```
31 | Notes: 
32 | - Results are logged in cl/outputs/ 
33 | - Choose CL backbone by modifying cl/scripts/split_MNIST_incremental_domain.sh  
34 | - Core model code is in cl/models/nsa.py
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Tony 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | brotlipy==0.7.0
 2 | certifi @ file:///croot/certifi_1671487769961/work/certifi
 3 | cffi @ file:///croot/cffi_1670423208954/work
 4 | charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
 5 | cryptography @ file:///croot/cryptography_1673298753778/work
 6 | flit_core @ file:///opt/conda/conda-bld/flit-core_1644941570762/work/source/flit_core
 7 | idna @ file:///croot/idna_1666125576474/work
 8 | mkl-fft==1.3.1
 9 | mkl-random @ file:///tmp/build/80754af9/mkl_random_1626186064646/work
10 | mkl-service==2.4.0
11 | numpy @ file:///croot/numpy_and_numpy_base_1672336185480/work
12 | Pillow==9.3.0
13 | pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
14 | pyOpenSSL @ file:///opt/conda/conda-bld/pyopenssl_1643788558760/work
15 | PySocks @ file:///tmp/build/80754af9/pysocks_1605305779399/work
16 | requests @ file:///opt/conda/conda-bld/requests_1657734628632/work
17 | six @ file:///tmp/build/80754af9/six_1644875935023/work
18 | torch==1.13.1
19 | torchaudio==0.13.1
20 | torchvision==0.14.1
21 | typing_extensions @ file:///croot/typing_extensions_1669924550328/work
22 | urllib3 @ file:///croot/urllib3_1673575502006/work
23 | 


--------------------------------------------------------------------------------
/cl/utils/metric.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import torch
 3 | 
 4 | def accuracy(output, target, topk=(1,)):
 5 |     """Computes the precision@k for the specified values of k"""
 6 |     with torch.no_grad():
 7 |         maxk = max(topk)
 8 |         batch_size = target.size(0)
 9 | 
10 |         _, pred = output.topk(maxk, 1, True, True)
11 |         pred = pred.t()
12 |         correct = pred.eq(target.view(1, -1).expand_as(pred))
13 | 
14 |         res = []
15 |         for k in topk:
16 |             correct_k = correct[:k].view(-1).float().sum().item()
17 |             res.append(correct_k*100.0 / batch_size)
18 | 
19 |         if len(res)==1:
20 |             return res[0]
21 |         else:
22 |             return res
23 | 
24 | 
class AverageMeter(object):
    """Tracks the latest value together with a weighted running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the running sum, the count and the average."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, counted ``n`` times in the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = float(self.sum) / self.count
42 | 
43 | 
class Timer(object):
    """
    Wall-clock stopwatch built on time.time().
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the stored interval and restart the clock."""
        self.interval = 0
        self.tic()

    def value(self):
        """Return the seconds elapsed since the clock was last (re)started."""
        elapsed = time.time() - self.time
        return elapsed

    def tic(self):
        """Restart the clock without touching the stored interval."""
        self.time = time.time()

    def toc(self):
        """Store and return the seconds since the last start, then restart the clock."""
        self.interval = time.time() - self.time
        self.tic()
        return self.interval


--------------------------------------------------------------------------------
/cl/scripts/split_MNIST_incremental_domain.sh:
--------------------------------------------------------------------------------
 1 | GPUID=0
 2 | OUTDIR=outputs/split_MNIST_incremental_domain
 3 | REPEAT=10
 4 | MODE=$1
 5 | mkdir -p outputs/split_MNIST_incremental_domain
 6 | #python -u iBatchLearn.py --gpuid $GPUID --repeat $REPEAT --optimizer Adam    --force_out_dim 2 --first_split_size 2 --other_split_size 2 --schedule 4 --batch_size 128 --model_name MLP400                                              --lr 0.001 --offline_training  | tee ${OUTDIR}/Offline.log
 7 | #python -u iBatchLearn.py --gpuid $GPUID --repeat $REPEAT --optimizer Adam    --force_out_dim 2 --first_split_size 2 --other_split_size 2 --schedule 4 --batch_size 128 --model_name MLP400                                              --lr 0.001   --mode $1                  | tee ${OUTDIR}/Adam_${MODE}.log
 8 | #python -u iBatchLearn.py --gpuid $GPUID --repeat $REPEAT --optimizer Adagrad --force_out_dim 2 --first_split_size 2 --other_split_size 2 --schedule 4 --batch_size 128 --model_name MLP400                                              --lr 0.01        --mode $1                | tee ${OUTDIR}/Adagrad_${MODE}.log
 9 | #python -u iBatchLearn.py --gpuid $GPUID --repeat $REPEAT --optimizer Adam    --force_out_dim 2 --first_split_size 2 --other_split_size 2 --schedule 4 --batch_size 128 --model_name MLP400 --agent_type customization  --agent_name EWC_online_mnist --lr 0.001 --reg_coef 700 --mode $1     | tee ${OUTDIR}/EWC_online_${MODE}.log
10 | #python -u iBatchLearn.py --gpuid $GPUID --repeat $REPEAT --optimizer Adam    --force_out_dim 2 --first_split_size 2 --other_split_size 2 --schedule 4 --batch_size 128 --model_name MLP400 --agent_type customization  --agent_name EWC_mnist        --lr 0.001 --reg_coef 100 --mode $1     | tee ${OUTDIR}/EWC_${MODE}.log
11 | python -u iBatchLearn.py --gpuid $GPUID --repeat $REPEAT --optimizer Adam    --force_out_dim 2 --first_split_size 2 --other_split_size 2 --schedule 4 --batch_size 128 --model_name MLP400 --agent_type regularization --agent_name L2  --lr 0.001 --reg_coef 0.5   --mode $1   | tee ${OUTDIR}/L2_${MODE}.log


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | /cl/data/
131 | /.idea


--------------------------------------------------------------------------------
/cl/models/controller.py:
--------------------------------------------------------------------------------
 1 | """LSTM Controller."""
 2 | import torch
 3 | from torch import nn
 4 | from torch.nn import Parameter
 5 | import numpy as np
 6 | 
 7 | 
 8 | class FFWController(nn.Module):
 9 |     """An NTM controller based on LSTM."""
10 |     def __init__(self, num_inputs, num_outputs, num_layers):
11 |         super(FFWController, self).__init__()
12 | 
13 |         self.num_inputs = num_inputs
14 |         self.num_outputs = num_outputs
15 |         self.num_layers = num_layers
16 | 
17 | 
18 | 
19 |     def create_new_state(self, batch_size):
20 |         h = torch.zeros(batch_size, self.num_outputs)
21 |         if torch.cuda.is_available():
22 |             h = h.cuda()
23 |         return h
24 | 
25 |     def reset_parameters(self):
26 |         pass
27 | 
28 |     def size(self):
29 |         return self.num_inputs, self.num_outputs
30 | 
31 |     def forward(self, x, prev_state):
32 |         return x, prev_state
33 | 
class LSTMController(nn.Module):
    """An LSTM-based controller whose initial hidden and cell states are learned."""

    def __init__(self, num_inputs, num_outputs, num_layers):
        super().__init__()

        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size=num_inputs,
                            hidden_size=num_outputs,
                            num_layers=num_layers)

        def _learned_initial_state():
            # Small random tensor of shape (num_layers, 1, num_outputs), moved
            # to the GPU before scaling when CUDA is available.
            noise = torch.randn(self.num_layers, 1, self.num_outputs)
            if torch.cuda.is_available():
                noise = noise.cuda()
            return Parameter(noise * 0.05)

        # The initial hidden and cell states are learned parameters
        self.lstm_h_bias = _learned_initial_state()
        self.lstm_c_bias = _learned_initial_state()

        self.reset_parameters()

    def create_new_state(self, batch_size):
        """Return (h, c), each a copy of the learned state repeated to (num_layers, batch_size, num_outputs)."""
        h0 = self.lstm_h_bias.clone().repeat(1, batch_size, 1)
        c0 = self.lstm_c_bias.clone().repeat(1, batch_size, 1)
        return h0, c0

    def reset_parameters(self):
        """Zero the 1-D LSTM parameters and draw the others uniformly from [-stdev, stdev]."""
        stdev = 5 / (np.sqrt(self.num_inputs + self.num_outputs))
        for weight in self.lstm.parameters():
            if weight.dim() != 1:
                nn.init.uniform_(weight, -stdev, stdev)
            else:
                nn.init.constant_(weight, 0)

    def size(self):
        """Return the (num_inputs, num_outputs) pair."""
        return (self.num_inputs, self.num_outputs)

    def forward(self, x, prev_state):
        """Run one LSTM step on ``x`` (a leading length-1 axis is added, then removed)."""
        outp, state = self.lstm(x.unsqueeze(0), prev_state)
        return outp.squeeze(0), state


--------------------------------------------------------------------------------
/cl/dataloaders/base.py:
--------------------------------------------------------------------------------
  1 | import torchvision
  2 | from torchvision import transforms
  3 | from .wrapper import CacheClassLabel
  4 | 
  5 | def MNIST(dataroot, train_aug=False):
  6 |     # Add padding to make 32x32
  7 |     #normalize = transforms.Normalize(mean=(0.1307,), std=(0.3081,))  # for 28x28
  8 |     normalize = transforms.Normalize(mean=(0.1000,), std=(0.2752,))  # for 32x32
  9 | 
 10 |     val_transform = transforms.Compose([
 11 |         transforms.Pad(2, fill=0, padding_mode='constant'),
 12 |         transforms.ToTensor(),
 13 |         normalize,
 14 |     ])
 15 |     train_transform = val_transform
 16 |     if train_aug:
 17 |         train_transform = transforms.Compose([
 18 |             transforms.RandomCrop(32, padding=4),
 19 |             transforms.ToTensor(),
 20 |             normalize,
 21 |         ])
 22 | 
 23 |     train_dataset = torchvision.datasets.MNIST(
 24 |         root=dataroot,
 25 |         train=True,
 26 |         download=True,
 27 |         transform=train_transform
 28 |     )
 29 |     train_dataset = CacheClassLabel(train_dataset)
 30 | 
 31 |     val_dataset = torchvision.datasets.MNIST(
 32 |         dataroot,
 33 |         train=False,
 34 |         transform=val_transform
 35 |     )
 36 |     val_dataset = CacheClassLabel(val_dataset)
 37 | 
 38 |     return train_dataset, val_dataset
 39 | 
 40 | def CIFAR10(dataroot, train_aug=False):
 41 |     normalize = transforms.Normalize(mean=[0.491, 0.482, 0.447], std=[0.247, 0.243, 0.262])
 42 | 
 43 |     val_transform = transforms.Compose([
 44 |         transforms.ToTensor(),
 45 |         normalize,
 46 |     ])
 47 |     train_transform = val_transform
 48 |     if train_aug:
 49 |         train_transform = transforms.Compose([
 50 |             transforms.RandomCrop(32, padding=4),
 51 |             transforms.RandomHorizontalFlip(),
 52 |             transforms.ToTensor(),
 53 |             normalize,
 54 |         ])
 55 | 
 56 |     train_dataset = torchvision.datasets.CIFAR10(
 57 |         root=dataroot,
 58 |         train=True,
 59 |         download=True,
 60 |         transform=train_transform
 61 |         )
 62 |     train_dataset = CacheClassLabel(train_dataset)
 63 | 
 64 |     val_dataset = torchvision.datasets.CIFAR10(
 65 |         root=dataroot,
 66 |         train=False,
 67 |         download=True,
 68 |         transform=val_transform
 69 |     )
 70 |     val_dataset = CacheClassLabel(val_dataset)
 71 | 
 72 |     return train_dataset, val_dataset
 73 | 
 74 | 
 75 | def CIFAR100(dataroot, train_aug=False):
 76 |     normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276])
 77 | 
 78 |     val_transform = transforms.Compose([
 79 |         transforms.ToTensor(),
 80 |         normalize,
 81 |     ])
 82 |     train_transform = val_transform
 83 |     if train_aug:
 84 |         train_transform = transforms.Compose([
 85 |             transforms.RandomCrop(32, padding=4),
 86 |             transforms.RandomHorizontalFlip(),
 87 |             transforms.ToTensor(),
 88 |             normalize,
 89 |         ])
 90 | 
 91 |     train_dataset = torchvision.datasets.CIFAR100(
 92 |         root=dataroot,
 93 |         train=True,
 94 |         download=True,
 95 |         transform=train_transform
 96 |     )
 97 |     train_dataset = CacheClassLabel(train_dataset)
 98 | 
 99 |     val_dataset = torchvision.datasets.CIFAR100(
100 |         root=dataroot,
101 |         train=False,
102 |         download=True,
103 |         transform=val_transform
104 |     )
105 |     val_dataset = CacheClassLabel(val_dataset)
106 | 
107 |     return train_dataset, val_dataset
108 | 
109 | 


--------------------------------------------------------------------------------
/cl/dataloaders/datasetGen.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from random import shuffle
 3 | from .wrapper import Subclass, AppendName, Permutation
 4 | 
 5 | 
 6 | def SplitGen(train_dataset, val_dataset, first_split_sz=2, other_split_sz=2, rand_split=False, remap_class=False):
 7 |     '''
 8 |     Generate the dataset splits based on the labels.
 9 |     :param train_dataset: (torch.utils.data.dataset)
10 |     :param val_dataset: (torch.utils.data.dataset)
11 |     :param first_split_sz: (int)
12 |     :param other_split_sz: (int)
13 |     :param rand_split: (bool) Randomize the set of label in each split
14 |     :param remap_class: (bool) Ex: remap classes in a split from [2,4,6 ...] to [0,1,2 ...]
15 |     :return: train_loaders {task_name:loader}, val_loaders {task_name:loader}, out_dim {task_name:num_classes}
16 |     '''
17 |     assert train_dataset.number_classes==val_dataset.number_classes,'Train/Val has different number of classes'
18 |     num_classes =  train_dataset.number_classes
19 | 
20 |     # Calculate the boundary index of classes for splits
21 |     # Ex: [0,2,4,6,8,10] or [0,50,60,70,80,90,100]
22 |     split_boundaries = [0, first_split_sz]
23 |     while split_boundaries[-1]<num_classes:
24 |         split_boundaries.append(split_boundaries[-1]+other_split_sz)
25 |     print('split_boundaries:',split_boundaries)
26 |     assert split_boundaries[-1]==num_classes,'Invalid split size'
27 | 
28 |     # Assign classes to each splits
29 |     # Create the dict: {split_name1:[2,6,7], split_name2:[0,3,9], ...}
30 |     if not rand_split:
31 |         class_lists = {str(i):list(range(split_boundaries[i-1],split_boundaries[i])) for i in range(1,len(split_boundaries))}
32 |     else:
33 |         randseq = torch.randperm(num_classes)
34 |         class_lists = {str(i):randseq[list(range(split_boundaries[i-1],split_boundaries[i]))].tolist() for i in range(1,len(split_boundaries))}
35 |     print(class_lists)
36 | 
37 |     # Generate the dicts of splits
38 |     # Ex: {split_name1:dataset_split1, split_name2:dataset_split2, ...}
39 |     train_dataset_splits = {}
40 |     val_dataset_splits = {}
41 |     task_output_space = {}
42 |     for name,class_list in class_lists.items():
43 |         train_dataset_splits[name] = AppendName(Subclass(train_dataset, class_list, remap_class), name)
44 |         val_dataset_splits[name] = AppendName(Subclass(val_dataset, class_list, remap_class), name)
45 |         task_output_space[name] = len(class_list)
46 | 
47 |     return train_dataset_splits, val_dataset_splits, task_output_space
48 | 
49 | 
def PermutedGen(train_dataset, val_dataset, n_permute, first_split_sz=2, other_split_sz=2, rand_split=False, remap_class=False):
    '''
    Generate label-based dataset splits, each repeated under n_permute feature permutations.
    Every (split, permutation) pair draws its own random permutation, which is
    shared by the train and val datasets of that pair. Tasks are named
    '<split>_<permutation index>'.
    :param train_dataset: dataset exposing number_classes; its first sample fixes the feature count
    :param val_dataset: validation counterpart of train_dataset
    :param n_permute: (int) number of permuted copies generated for each split
    :param first_split_sz: (int)
    :param other_split_sz: (int)
    :param rand_split: (bool) Randomize the set of label in each split
    :param remap_class: (bool) Ex: remap classes in a split from [2,4,6 ...] to [0,1,2 ...]
    :return: train datasets {task_name:dataset}, val datasets {task_name:dataset}, out_dim {task_name:num_classes}
    :raises ValueError: if further splits are needed but other_split_sz is not positive
    :raises AssertionError: if the split sizes do not add up to the number of classes
    '''
    sample, _ = train_dataset[0]
    n = sample.numel()

    num_classes = train_dataset.number_classes

    # Calculate the boundary index of classes for splits
    # Ex: [0,2,4,6,8,10] or [0,50,60,70,80,90,100]
    split_boundaries = [0, first_split_sz]
    if split_boundaries[-1] < num_classes and other_split_sz <= 0:
        # Without this guard the loop below would never reach num_classes.
        raise ValueError('other_split_sz must be positive when first_split_sz does not cover all classes')
    while split_boundaries[-1] < num_classes:
        split_boundaries.append(split_boundaries[-1] + other_split_sz)
    print('split_boundaries:', split_boundaries)
    assert split_boundaries[-1] == num_classes, 'Invalid split size'

    # Assign classes to each splits
    # Create the dict: {split_name1:[2,6,7], split_name2:[0,3,9], ...}
    split_ranges = list(zip(split_boundaries[:-1], split_boundaries[1:]))
    if not rand_split:
        class_lists = {str(i): list(range(lo, hi)) for i, (lo, hi) in enumerate(split_ranges, 1)}
    else:
        randseq = torch.randperm(num_classes)
        class_lists = {str(i): randseq[list(range(lo, hi))].tolist() for i, (lo, hi) in enumerate(split_ranges, 1)}
    print(class_lists)

    train_dataset_splits = {}
    val_dataset_splits = {}
    task_output_space = {}
    for p in range(n_permute):
        for split_name, class_list in class_lists.items():
            name = split_name + f'_{p}'
            # Fresh random ordering of the n feature positions for this task.
            rand_ind = list(range(n))
            shuffle(rand_ind)
            train_dataset_splits[name] = AppendName(Permutation(train_dataset, rand_ind, class_list, remap_class), name)
            val_dataset_splits[name] = AppendName(Permutation(val_dataset, rand_ind, class_list, remap_class), name)
            task_output_space[name] = len(class_list)

    return train_dataset_splits, val_dataset_splits, task_output_space
91 | 


--------------------------------------------------------------------------------
/cl/dataloaders/wrapper.py:
--------------------------------------------------------------------------------
  1 | from os import path
  2 | import torch
  3 | import torch.utils.data as data
  4 | 
  5 | 
  6 | class CacheClassLabel(data.Dataset):
  7 |     """
  8 |     A dataset wrapper that has a quick access to all labels of data.
  9 |     """
 10 |     def __init__(self, dataset):
 11 |         super(CacheClassLabel, self).__init__()
 12 |         self.dataset = dataset
 13 |         self.labels = torch.LongTensor(len(dataset)).fill_(-1)
 14 |         label_cache_filename = path.join(dataset.root, str(type(dataset))+'_'+str(len(dataset))+'.pth')
 15 |         if path.exists(label_cache_filename):
 16 |             self.labels = torch.load(label_cache_filename)
 17 |         else:
 18 |             for i, data in enumerate(dataset):
 19 |                 self.labels[i] = data[1]
 20 |             torch.save(self.labels, label_cache_filename)
 21 |         self.number_classes = len(torch.unique(self.labels))
 22 | 
 23 |     def __len__(self):
 24 |         return len(self.dataset)
 25 | 
 26 |     def __getitem__(self, index):
 27 |         img,target = self.dataset[index]
 28 |         return img, target
 29 | 
 30 | 
 31 | class AppendName(data.Dataset):
 32 |     """
 33 |     A dataset wrapper that also return the name of the dataset/task
 34 |     """
 35 |     def __init__(self, dataset, name, first_class_ind=0):
 36 |         super(AppendName,self).__init__()
 37 |         self.dataset = dataset
 38 |         self.name = name
 39 |         self.first_class_ind = first_class_ind  # For remapping the class index
 40 | 
 41 |     def __len__(self):
 42 |         return len(self.dataset)
 43 | 
 44 |     def __getitem__(self, index):
 45 |         img,target = self.dataset[index]
 46 |         target = target + self.first_class_ind
 47 |         return img, target, self.name
 48 | 
 49 | 
 50 | class Subclass(data.Dataset):
 51 |     """
 52 |     A dataset wrapper that return the task name and remove the offset of labels (Let the labels start from 0)
 53 |     """
 54 |     def __init__(self, dataset, class_list, remap=True):
 55 |         '''
 56 |         :param dataset: (CacheClassLabel)
 57 |         :param class_list: (list) A list of integers
 58 |         :param remap: (bool) Ex: remap class [2,4,6 ...] to [0,1,2 ...]
 59 |         '''
 60 |         super(Subclass,self).__init__()
 61 |         assert isinstance(dataset, CacheClassLabel), 'dataset must be wrapped by CacheClassLabel'
 62 |         self.dataset = dataset
 63 |         self.class_list = class_list
 64 |         self.remap = remap
 65 |         self.indices = []
 66 |         for c in class_list:
 67 |             self.indices.extend((dataset.labels==c).nonzero().flatten().tolist())
 68 |         if remap:
 69 |             self.class_mapping = {c: i for i, c in enumerate(class_list)}
 70 | 
 71 |     def __len__(self):
 72 |         return len(self.indices)
 73 | 
 74 |     def __getitem__(self, index):
 75 |         img,target = self.dataset[self.indices[index]]
 76 |         if self.remap:
 77 |             raw_target = target.item() if isinstance(target,torch.Tensor) else target
 78 |             target = self.class_mapping[raw_target]
 79 |         return img, target
 80 | 
 81 | 
 82 | class Permutation(data.Dataset):
 83 |     """
 84 |     A dataset wrapper that permute the position of features
 85 |     """
 86 |     def __init__(self, dataset, permute_idx, class_list, remap=True):
 87 |         super(Permutation,self).__init__()
 88 |         self.dataset = dataset
 89 |         self.permute_idx = permute_idx
 90 |         self.class_list = class_list
 91 |         self.remap = remap
 92 |         self.indices = []
 93 |         for c in class_list:
 94 |             self.indices.extend((dataset.labels == c).nonzero().flatten().tolist())
 95 |         if remap:
 96 |             self.class_mapping = {c: i for i, c in enumerate(class_list)}
 97 | 
 98 |     def __len__(self):
 99 |         return len(self.indices)
100 | 
101 |     def __getitem__(self, index):
102 |         img, target = self.dataset[self.indices[index]]
103 |         shape = img.size()
104 |         img = img.view(-1)[self.permute_idx].view(shape)
105 |         if self.remap:
106 |             raw_target = target.item() if isinstance(target, torch.Tensor) else target
107 |             target = self.class_mapping[raw_target]
108 |         return img, target
109 | 
110 | 
class Storage(data.Dataset):
    """
    Dataset backed by an in-memory list, used as a memory to store data.
    """
    def __init__(self):
        super().__init__()
        self.storage = list()

    def __len__(self):
        return len(self.storage)

    def __getitem__(self, index):
        item = self.storage[index]
        return item

    def append(self, x):
        """Add the single item ``x`` to the end of the memory."""
        self.storage.append(x)

    def extend(self, x):
        """Add every item of the iterable ``x`` to the end of the memory."""
        self.storage.extend(x)
130 | 


--------------------------------------------------------------------------------
/cl/agents/customization.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from .default import NormalNN
  3 | from .regularization import SI, EWC, EWC_online
  4 | from .exp_replay import Naive_Rehearsal, GEM
  5 | from modules.criterions import BCEauto
  6 | 
  7 | def init_zero_weights(m):
  8 |     with torch.no_grad():
  9 |         if type(m) == torch.nn.Linear:
 10 |             m.weight.zero_()
 11 |             m.bias.zero_()
 12 |         elif type(m) == torch.nn.ModuleDict:
 13 |             for l in m.values():
 14 |                 init_zero_weights(l)
 15 |         else:
 16 |             assert False, 'Only support linear layer'
 17 | 
 18 | 
 19 | def NormalNN_reset_optim(agent_config):
 20 |     agent = NormalNN(agent_config)
 21 |     agent.reset_optimizer = True
 22 |     return agent
 23 | 
 24 | 
 25 | def NormalNN_BCE(agent_config):
 26 |     agent = NormalNN(agent_config)
 27 |     agent.criterion_fn = BCEauto()
 28 |     return agent
 29 | 
 30 | 
 31 | def SI_BCE(agent_config):
 32 |     agent = SI(agent_config)
 33 |     agent.criterion_fn = BCEauto()
 34 |     return agent
 35 | 
 36 | 
 37 | def SI_splitMNIST_zero_init(agent_config):
 38 |     agent = SI(agent_config)
 39 |     agent.damping_factor = 1e-3
 40 |     agent.reset_optimizer = True
 41 |     agent.model.last.apply(init_zero_weights)
 42 |     return agent
 43 | 
 44 | 
 45 | def SI_splitMNIST_rand_init(agent_config):
 46 |     agent = SI(agent_config)
 47 |     agent.damping_factor = 1e-3
 48 |     agent.reset_optimizer = True
 49 |     return agent
 50 | 
 51 | 
 52 | def EWC_BCE(agent_config):
 53 |     agent = EWC(agent_config)
 54 |     agent.criterion_fn = BCEauto()
 55 |     return agent
 56 | 
 57 | 
 58 | def EWC_mnist(agent_config):
 59 |     agent = EWC(agent_config)
 60 |     agent.n_fisher_sample = 60000
 61 |     return agent
 62 | 
 63 | 
 64 | def EWC_online_mnist(agent_config):
 65 |     agent = EWC(agent_config)
 66 |     agent.n_fisher_sample = 60000
 67 |     agent.online_reg = True
 68 |     return agent
 69 | 
 70 | 
 71 | def EWC_online_empFI(agent_config):
 72 |     agent = EWC(agent_config)
 73 |     agent.empFI = True
 74 |     return agent
 75 | 
 76 | 
 77 | def EWC_zero_init(agent_config):
 78 |     agent = EWC(agent_config)
 79 |     agent.reset_optimizer = True
 80 |     agent.model.last.apply(init_zero_weights)
 81 |     return agent
 82 | 
 83 | 
 84 | def EWC_rand_init(agent_config):
 85 |     agent = EWC(agent_config)
 86 |     agent.reset_optimizer = True
 87 |     return agent
 88 | 
 89 | 
 90 | def EWC_reset_optim(agent_config):
 91 |     agent = EWC(agent_config)
 92 |     agent.reset_optimizer = True
 93 |     return agent
 94 | 
 95 | 
 96 | def EWC_online_reset_optim(agent_config):
 97 |     agent = EWC_online(agent_config)
 98 |     agent.reset_optimizer = True
 99 |     return agent
100 | 
101 | 
def _naive_rehearsal_with_memory(agent_config, memory_size):
    """Build a Naive_Rehearsal agent and override its ``memory_size``."""
    agent = Naive_Rehearsal(agent_config)
    agent.memory_size = memory_size
    return agent


def Naive_Rehearsal_100(agent_config):
    """Naive_Rehearsal agent with ``memory_size`` = 100."""
    return _naive_rehearsal_with_memory(agent_config, 100)


def Naive_Rehearsal_200(agent_config):
    """Naive_Rehearsal agent with ``memory_size`` = 200."""
    return _naive_rehearsal_with_memory(agent_config, 200)


def Naive_Rehearsal_400(agent_config):
    """Naive_Rehearsal agent with ``memory_size`` = 400."""
    return _naive_rehearsal_with_memory(agent_config, 400)


def Naive_Rehearsal_1100(agent_config):
    """Naive_Rehearsal agent with ``memory_size`` = 1100."""
    return _naive_rehearsal_with_memory(agent_config, 1100)


def Naive_Rehearsal_1400(agent_config):
    """Naive_Rehearsal agent with ``memory_size`` = 1400."""
    return _naive_rehearsal_with_memory(agent_config, 1400)


def Naive_Rehearsal_4000(agent_config):
    """Naive_Rehearsal agent with ``memory_size`` = 4000."""
    return _naive_rehearsal_with_memory(agent_config, 4000)


def Naive_Rehearsal_4400(agent_config):
    """Naive_Rehearsal agent with ``memory_size`` = 4400."""
    return _naive_rehearsal_with_memory(agent_config, 4400)


def Naive_Rehearsal_5600(agent_config):
    """Naive_Rehearsal agent with ``memory_size`` = 5600."""
    return _naive_rehearsal_with_memory(agent_config, 5600)


def Naive_Rehearsal_16000(agent_config):
    """Naive_Rehearsal agent with ``memory_size`` = 16000."""
    return _naive_rehearsal_with_memory(agent_config, 16000)
154 | 
155 | 
def _gem_with_memory(agent_config, memory_size):
    """Build a GEM agent and override its ``memory_size``."""
    agent = GEM(agent_config)
    agent.memory_size = memory_size
    return agent


def GEM_100(agent_config):
    """GEM agent with ``memory_size`` = 100."""
    return _gem_with_memory(agent_config, 100)


def GEM_200(agent_config):
    """GEM agent with ``memory_size`` = 200."""
    return _gem_with_memory(agent_config, 200)


def GEM_400(agent_config):
    """GEM agent with ``memory_size`` = 400."""
    return _gem_with_memory(agent_config, 400)


def GEM_1100(agent_config):
    """GEM agent with ``memory_size`` = 1100."""
    return _gem_with_memory(agent_config, 1100)


def GEM_4000(agent_config):
    """GEM agent with ``memory_size`` = 4000."""
    return _gem_with_memory(agent_config, 4000)


def GEM_4400(agent_config):
    """GEM agent with ``memory_size`` = 4400."""
    return _gem_with_memory(agent_config, 4400)


def GEM_16000(agent_config):
    """GEM agent with ``memory_size`` = 16000."""
    return _gem_with_memory(agent_config, 16000)


--------------------------------------------------------------------------------
/cl/models/senet.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from .resnet import conv3x3, PreActResNet, PreActResNet_cifar
  5 | 
  6 | 
  7 | class SE_PreActBlock(nn.Module):
  8 |     '''Pre-activation version of the BasicBlock.'''
  9 |     expansion = 1
 10 | 
 11 |     def __init__(self, in_planes, planes, stride=1):
 12 |         super(SE_PreActBlock, self).__init__()
 13 |         self.bn1 = nn.BatchNorm2d(in_planes)
 14 |         self.conv1 = conv3x3(in_planes, planes, stride)
 15 |         self.bn2 = nn.BatchNorm2d(planes)
 16 |         self.conv2 = conv3x3(planes, planes)
 17 | 
 18 |         if stride != 1 or in_planes != self.expansion*planes:
 19 |             self.shortcut = nn.Sequential(
 20 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
 21 |             )
 22 | 
 23 |         # SE layers
 24 |         self.fc1 = nn.Conv2d(planes, planes // 16, kernel_size=1)
 25 |         self.fc2 = nn.Conv2d(planes // 16, planes, kernel_size=1)
 26 | 
 27 |     def forward(self, x):
 28 |         out = F.relu(self.bn1(x))
 29 |         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
 30 |         out = self.conv1(out)
 31 |         out = self.conv2(F.relu(self.bn2(out)))
 32 |         # Squeeze
 33 |         w = F.avg_pool2d(out, out.size(2))
 34 |         w = F.relu(self.fc1(w))
 35 |         w = torch.sigmoid(self.fc2(w))
 36 |         # Excitation
 37 |         out = out * w  # New broadcasting feature from v0.2!
 38 |         out += shortcut
 39 |         return out
 40 | 
 41 | 
 42 | class SE_PreActBottleneck(nn.Module):
 43 |     '''Pre-activation version of the original Bottleneck module.'''
 44 |     expansion = 4
 45 | 
 46 |     def __init__(self, in_planes, planes, stride=1):
 47 |         super(SE_PreActBottleneck, self).__init__()
 48 |         self.bn1 = nn.BatchNorm2d(in_planes)
 49 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
 50 |         self.bn2 = nn.BatchNorm2d(planes)
 51 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 52 |         self.bn3 = nn.BatchNorm2d(planes)
 53 |         self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
 54 | 
 55 |         if stride != 1 or in_planes != self.expansion*planes:
 56 |             self.shortcut = nn.Sequential(
 57 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
 58 |             )
 59 | 
 60 |         # SE layers
 61 |         self.fc1 = nn.Conv2d(self.expansion*planes, self.expansion*planes // 16, kernel_size=1)
 62 |         self.fc2 = nn.Conv2d(self.expansion*planes // 16, self.expansion*planes, kernel_size=1)
 63 | 
 64 |     def forward(self, x):
 65 |         out = F.relu(self.bn1(x))
 66 |         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
 67 |         out = self.conv1(out)
 68 |         out = self.conv2(F.relu(self.bn2(out)))
 69 |         out = self.conv3(F.relu(self.bn3(out)))
 70 |         # Squeeze
 71 |         w = F.avg_pool2d(out, out.size(2))
 72 |         w = F.relu(self.fc1(w))
 73 |         w = torch.sigmoid(self.fc2(w))
 74 |         # Excitation
 75 |         out = out * w
 76 |         out += shortcut
 77 |         return out
 78 | 
 79 | 
 80 | # ResNet for Cifar10/100 or the dataset with image size 32x32
 81 | 
 82 | def SE_ResNet20_cifar(out_dim=10):
 83 |     return PreActResNet_cifar(SE_PreActBlock, [3 , 3 , 3 ], [16, 32, 64], num_classes=out_dim)
 84 | 
 85 | def SE_ResNet56_cifar(out_dim=10):
 86 |     return PreActResNet_cifar(SE_PreActBlock, [9 , 9 , 9 ], [16, 32, 64], num_classes=out_dim)
 87 | 
 88 | def ResNet110_cifar(out_dim=10):
 89 |     return PreActResNet_cifar(SE_PreActBlock, [18, 18, 18], [16, 32, 64], num_classes=out_dim)
 90 | 
 91 | def SE_ResNet29_cifar(out_dim=10):
 92 |     return PreActResNet_cifar(SE_PreActBottleneck, [3 , 3 , 3 ], [16, 32, 64], num_classes=out_dim)
 93 | 
 94 | def SE_ResNet164_cifar(out_dim=10):
 95 |     return PreActResNet_cifar(SE_PreActBottleneck, [18, 18, 18], [16, 32, 64], num_classes=out_dim)
 96 | 
 97 | def SE_WideResNet_28_2_cifar(out_dim=10):
 98 |     return PreActResNet_cifar(SE_PreActBlock, [4, 4, 4], [32, 64, 128], num_classes=out_dim)
 99 | 
def SE_WideResNet_28_10_cifar(out_dim=10):
    """PreActResNet_cifar of SE_PreActBlock: 4 blocks per stage, filters 160/320/640."""
    return PreActResNet_cifar(
        block=SE_PreActBlock,
        num_blocks=[4, 4, 4],
        filters=[160, 320, 640],
        num_classes=out_dim,
    )
102 | 
103 | # ResNet for general purpose. Ex:ImageNet
104 | 
def SE_ResNet10(out_dim=10):
    """PreActResNet of SE_PreActBlock with one block in each of the four stages."""
    return PreActResNet(block=SE_PreActBlock, num_blocks=[1, 1, 1, 1], num_classes=out_dim)
107 | 
def SE_ResNet18S(out_dim=10):
    """PreActResNet of SE_PreActBlock, stages [2, 2, 2, 2], for 1-channel input."""
    return PreActResNet(
        block=SE_PreActBlock,
        num_blocks=[2, 2, 2, 2],
        num_classes=out_dim,
        in_channels=1,
    )
110 | 
def SE_ResNet18(out_dim=10):
    """PreActResNet of SE_PreActBlock with stages [2, 2, 2, 2]."""
    return PreActResNet(block=SE_PreActBlock, num_blocks=[2, 2, 2, 2], num_classes=out_dim)
113 | 
def SE_ResNet34(out_dim=10):
    """PreActResNet of SE_PreActBlock with stages [3, 4, 6, 3]."""
    return PreActResNet(block=SE_PreActBlock, num_blocks=[3, 4, 6, 3], num_classes=out_dim)
116 | 
def SE_ResNet50(out_dim=10):
    """PreActResNet of SE_PreActBottleneck with stages [3, 4, 6, 3]."""
    return PreActResNet(block=SE_PreActBottleneck, num_blocks=[3, 4, 6, 3], num_classes=out_dim)
119 | 
def SE_ResNet101(out_dim=10):
    """PreActResNet of SE_PreActBottleneck with stages [3, 4, 23, 3]."""
    return PreActResNet(block=SE_PreActBottleneck, num_blocks=[3, 4, 23, 3], num_classes=out_dim)
122 | 
def SE_ResNet152(out_dim=10):
    """PreActResNet of SE_PreActBottleneck with stages [3, 8, 36, 3]."""
    return PreActResNet(block=SE_PreActBottleneck, num_blocks=[3, 8, 36, 3], num_classes=out_dim)
125 | 
126 | 


--------------------------------------------------------------------------------
/cl/models/mlp.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from models import nsa
  4 | from models import nsm
  5 | 
  6 | class MLP(nn.Module):
  7 | 
  8 |     def __init__(self, out_dim=10, in_channel=1, img_sz=32, hidden_dim=256, mode='nsa'):
  9 |         super(MLP, self).__init__()
 10 |         self.in_dim = in_channel*img_sz*img_sz
 11 | 
 12 |         self.mode=mode
 13 | 
 14 |         print("FF", self.mode)
 15 | 
 16 |         if self.mode=='nsa':
 17 |             self.nsa1 =  nsa.ProgammedController(program_shape=[self.in_dim, hidden_dim],
 18 |                                         program_interface_size=self.in_dim,
 19 |                                         pkey_dim=5, rnn_step=0, has_res_w='n',
 20 |                                         num_program=50,
 21 |                                         bias=False, svd_num_features=10, top_lu=5,
 22 |                                         kc_mode='cb')
 23 |             self.nsa2 = nsa.ProgammedController(program_shape=[hidden_dim, hidden_dim],
 24 |                                         program_interface_size=hidden_dim,
 25 |                                         pkey_dim=5, rnn_step=0, has_res_w='n',
 26 |                                         num_program=50,
 27 |                                         bias=False, svd_num_features=10, top_lu=5,
 28 |                                          kc_mode='cb')
 29 | 
 30 |             self.nsa1.initialize()
 31 |             self.nsa2.initialize()
 32 |             self.linear = nn.Sequential(
 33 |                 self.nsa1,
 34 |                 nn.ReLU(inplace=True),
 35 |                 self.nsa2,
 36 |                 nn.ReLU(inplace=True),
 37 |             )
 38 | 
 39 |         elif self.mode == "nsm":
 40 |             self.nsm1 = nsm.ProgammedController(program_shape=[self.in_dim, hidden_dim],
 41 |                                         program_interface_size=self.in_dim,
 42 |                                     pkey_dim=5,
 43 |                                     num_program=5,
 44 |                                     bias=False, svd_num_features=15,
 45 |                                     att_mode="kv", program_read_mode="linear")
 46 |             self.nsm2 = nsm.ProgammedController(program_shape=[hidden_dim, hidden_dim],
 47 |                                                 program_interface_size=hidden_dim,
 48 |                                                 pkey_dim=5,
 49 |                                                 num_program=5,
 50 |                                                 bias=False, svd_num_features=15,
 51 |                                                 att_mode="kv", program_read_mode="linear")
 52 |             self.nsm1.initialize()
 53 |             self.nsm2.initialize()
 54 |             self.linear = nn.Sequential(
 55 |                 self.nsm1,
 56 |                 nn.ReLU(inplace=True),
 57 |                 self.nsm2,
 58 |                 nn.ReLU(inplace=True),
 59 |             )
 60 | 
 61 |         else:
 62 |             self.linear = nn.Sequential(
 63 |                 nn.Linear(self.in_dim, hidden_dim, bias=False),
 64 |                 nn.ReLU(inplace=True),
 65 |                 nn.Linear(hidden_dim, hidden_dim, bias=False),
 66 |                 nn.ReLU(inplace=True),
 67 |             )
 68 |         self.last = nn.Linear(hidden_dim, out_dim)  # Subject to be replaced dependent on task
 69 | 
 70 | 
 71 |     def features(self, x):
 72 |         x = self.linear(x.view(-1,self.in_dim))
 73 |         return x
 74 | 
 75 |     def logits(self, x):
 76 |         x = self.last(x)
 77 |         return x
 78 | 
 79 |     def forward(self, x):
 80 |         x = self.features(x)
 81 |         x = self.logits(x)
 82 |         return x
 83 | 
 84 |     def get_last(self, nin, nout):
 85 |         if self.mode=="mlp":
 86 |             return nn.Linear(nin, nout)
 87 |         elif self.mode=="nsa":
 88 |             self.nsa_last = nsa.ProgammedController(program_shape=[nin, nout],
 89 |                                         program_interface_size=nin,
 90 |                                         pkey_dim=5, rnn_step=0, has_res_w='n',
 91 |                                         num_program=50,
 92 |                                         bias=False, svd_num_features=10, top_lu=5,
 93 |                                          kc_mode='cb')
 94 |             self.nsa_last.initialize()
 95 |             return self.nsa_last
 96 |         elif self.mode=="nsm":
 97 |             nsa_last = nsm.ProgammedController(program_shape=[nin, nout],
 98 |                                         program_interface_size=nin,
 99 |                                         pkey_dim=5,  num_program=5,
100 |                                         bias=False, svd_num_features=15,
101 |                                         att_mode="kv", program_read_mode="linear")
102 |             nsa_last.initialize()
103 |             return nsa_last
104 | 
105 | 
106 |     def get_ploss(self):
107 |         if self.mode=="nsa":
108 |             loss = self.nsa1.get_reg_loss() + self.nsa2.get_reg_loss()+self.nsa_last.get_reg_loss()
109 |             return loss
110 |         else:
111 |             return torch.tensor(0.0)
112 |         # print(self.last)
113 | 
def MLP100():
    """Return an MLP with hidden_dim=100; every other argument keeps MLP's default."""
    hidden_width = 100
    return MLP(hidden_dim=hidden_width)
116 | 
117 | 
def MLP400(mode="mlp"):
    """Return an MLP with hidden_dim=400 built in the given ``mode``."""
    hidden_width = 400
    return MLP(hidden_dim=hidden_width, mode=mode)
120 | 
121 | 
def MLP1000():
    """Return an MLP with hidden_dim=1000; every other argument keeps MLP's default."""
    hidden_width = 1000
    return MLP(hidden_dim=hidden_width)
124 | 
125 | 
def MLP2000():
    """Return an MLP with hidden_dim=2000; every other argument keeps MLP's default."""
    hidden_width = 2000
    return MLP(hidden_dim=hidden_width)
128 | 
129 | 
def MLP5000():
    """Return an MLP with hidden_dim=5000; every other argument keeps MLP's default."""
    hidden_width = 5000
    return MLP(hidden_dim=hidden_width)


--------------------------------------------------------------------------------
/cl/models/lenet.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from models import nsa
  4 | from models import  nsm
  5 | import torchvision.models as models
  6 | 
  7 | class LeNet(nn.Module):
  8 | 
  9 |     def __init__(self, out_dim=10, in_channel=3, img_sz=32, mode='mlp'):
 10 |         super(LeNet, self).__init__()
 11 |         feat_map_sz = img_sz//4
 12 |         self.mode = mode
 13 | 
 14 |         # self.n_feat = 50 * feat_map_sz * feat_map_sz
 15 |         # self.conv = nn.Sequential(
 16 |         #     nn.Conv2d(in_channel, 20, 5, padding=2),
 17 |         #     # nn.BatchNorm2d(20),
 18 |         #     nn.ReLU(inplace=True),
 19 |         #     nn.MaxPool2d(2, 2),
 20 |         #     nn.Conv2d(20, 50, 5, padding=2),
 21 |         #     # nn.BatchNorm2d(50),
 22 |         #     nn.ReLU(inplace=True),
 23 |         #     nn.MaxPool2d(2, 2)
 24 |         # )
 25 | 
 26 |         self.n_feat = 1000
 27 |         self.conv = models.resnet18(pretrained=True)
 28 |         for param in self.conv.parameters():
 29 |             param.requires_grad = True
 30 | 
 31 |         if self.mode=='nsa':
 32 |             self.nsa1 =  nsa.ProgammedController(program_shape=[self.n_feat, 500],
 33 |                                         program_interface_size=self.n_feat,
 34 |                                         pkey_dim=5, rnn_step=0, has_res_w='n',
 35 |                                         num_program=50,
 36 |                                         bias=False, svd_num_features=10, top_lu=5,
 37 |                                         att_mode="kv", program_read_mode="linear", kc_mode='cb')
 38 | 
 39 |             self.nsa1.initialize()
 40 |             self.nsa2 = nsa.ProgammedController(program_shape=[500, 500],
 41 |                                                 program_interface_size=500,
 42 |                                                 pkey_dim=5, rnn_step=0, has_res_w='n',
 43 |                                                 num_program=50,
 44 |                                                 bias=False, svd_num_features=10, top_lu=5,
 45 |                                                 att_mode="kv", program_read_mode="linear", kc_mode='cb')
 46 | 
 47 |             self.nsa2.initialize()
 48 |             self.nsa3 = nsa.ProgammedController(program_shape=[500, 500],
 49 |                                                program_interface_size=500,
 50 |                                                pkey_dim=5, rnn_step=0, has_res_w='n',
 51 |                                                num_program=50,
 52 |                                                bias=False, svd_num_features=10, top_lu=5,
 53 |                                                att_mode="kv", program_read_mode="linear", kc_mode='cb')
 54 | 
 55 |             self.nsa3.initialize()
 56 |             self.linear = nn.Sequential(
 57 |                 self.nsa1,
 58 |                 # nn.BatchNorm1d(500),
 59 |                 nn.ReLU(inplace=True),
 60 |                 self.nsa2,
 61 |                 # nn.BatchNorm1d(500),
 62 |                 nn.ReLU(inplace=True),
 63 |                 self.nsa3,
 64 |                 # nn.BatchNorm1d(500),
 65 |                 nn.ReLU(inplace=True),
 66 |             )
 67 | 
 68 |         elif self.mode == "nsm":
 69 |             self.nsm1 = nsm.ProgammedController(program_shape=[self.n_feat, 500],
 70 |                                         program_interface_size=self.n_feat,
 71 |                                     pkey_dim=5,
 72 |                                     num_program=5,
 73 |                                     bias=False, svd_num_features=15,
 74 |                                     att_mode="kv", program_read_mode="linear")
 75 | 
 76 |             self.nsm1.initialize()
 77 |             self.linear = nn.Sequential(
 78 |                 self.nsm1,
 79 |                 # nn.BatchNorm1d(500),
 80 |                 nn.ReLU(inplace=True),
 81 |             )
 82 | 
 83 |         else:
 84 |             self.linear = nn.Sequential(
 85 |                 nn.Linear(self.n_feat, 500, bias=False),
 86 |                 # nn.BatchNorm1d(500),
 87 |                 nn.ReLU(inplace=True),
 88 |                 nn.Linear(500, 500, bias=False),
 89 |                 # nn.BatchNorm1d(500),
 90 |                 nn.ReLU(inplace=True),
 91 |                 nn.Linear(500, 500, bias=False),
 92 |                 # nn.BatchNorm1d(500),
 93 |                 nn.ReLU(inplace=True),
 94 |             )
 95 |         self.last = nn.Linear(500, out_dim)  # Subject to be replaced dependent on task
 96 | 
 97 |     def get_last(self, nin, nout):
 98 |         # return nn.Linear(nin, nout)
 99 | 
100 |         if self.mode=="mlp":
101 |             return nn.Linear(nin, nout)
102 |         elif self.mode=="nsa":
103 |             nsa_last = nsa.ProgammedController(program_shape=[nin, nout],
104 |                                         program_interface_size=nin,
105 |                                         pkey_dim=10, rnn_step=0, has_res_w='n',
106 |                                         num_program=50,
107 |                                         bias=False, svd_num_features=10, top_lu=5,
108 |                                         att_mode="kv", program_read_mode="linear", kc_mode='cb')
109 |             nsa_last.initialize()
110 |             self.nsa_last = nsa_last
111 |             return nsa_last
112 |         elif self.mode=="nsm":
113 |             nsa_last = nsm.ProgammedController(program_shape=[nin, nout],
114 |                                         program_interface_size=nin,
115 |                                         pkey_dim=5,  num_program=5,
116 |                                         bias=False, svd_num_features=15,
117 |                                         att_mode="kv", program_read_mode="linear")
118 |             nsa_last.initialize()
119 |             return nsa_last
120 | 
121 | 
122 |     def get_ploss(self):
123 |         if self.mode=="nsa":
124 |             loss = self.nsa1.get_reg_loss()+self.nsa2.get_reg_loss()+self.nsa3.get_reg_loss()+self.nsa_last.get_reg_loss()
125 |             return 10*loss
126 |         else:
127 |             return torch.tensor(0.0)
128 |         # print(self.last)
129 | 
130 |     def features(self, x):
131 |         x = self.conv(x)
132 |         x = self.linear(x.view(-1, self.n_feat))
133 |         return x
134 | 
135 |     def logits(self, x):
136 |         x = self.last(x)
137 |         return x
138 | 
139 |     def forward(self, x):
140 |         x = self.features(x)
141 |         x = self.logits(x)
142 |         return x
143 | 
144 | 
def LeNetC(out_dim=10):
    """Return a LeNet configured for colour input (in_channel=3, img_sz=32)."""
    color_kwargs = dict(in_channel=3, img_sz=32)
    return LeNet(out_dim=out_dim, **color_kwargs)


--------------------------------------------------------------------------------
/cl/models/resnet.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | import torch.nn.functional as F
  3 | import math
  4 | from torch.nn import init
  5 | 
  6 | 
  7 | def conv3x3(in_planes, out_planes, stride=1):
  8 |     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
  9 | 
 10 | 
 11 | class PreActBlock(nn.Module):
 12 |     '''Pre-activation version of the BasicBlock.'''
 13 |     expansion = 1
 14 | 
 15 |     def __init__(self, in_planes, planes, stride=1, droprate=0):
 16 |         super(PreActBlock, self).__init__()
 17 |         self.bn1 = nn.BatchNorm2d(in_planes)
 18 |         self.conv1 = conv3x3(in_planes, planes, stride)
 19 |         self.drop = nn.Dropout(p=droprate) if droprate>0 else None
 20 |         self.bn2 = nn.BatchNorm2d(planes)
 21 |         self.conv2 = conv3x3(planes, planes)
 22 | 
 23 |         if stride != 1 or in_planes != self.expansion*planes:
 24 |             self.shortcut = nn.Sequential(
 25 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
 26 |             )
 27 | 
 28 |     def forward(self, x):
 29 |         out = F.relu(self.bn1(x))
 30 |         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
 31 |         out = self.conv1(out)
 32 |         if self.drop is not None:
 33 |             out = self.drop(out)
 34 |         out = self.conv2(F.relu(self.bn2(out)))
 35 |         out += shortcut
 36 |         return out
 37 | 
 38 | 
 39 | class PreActBottleneck(nn.Module):
 40 |     '''Pre-activation version of the original Bottleneck module.'''
 41 |     expansion = 4
 42 | 
 43 |     def __init__(self, in_planes, planes, stride=1, droprate=None):
 44 |         super(PreActBottleneck, self).__init__()
 45 |         self.bn1 = nn.BatchNorm2d(in_planes)
 46 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
 47 |         self.bn2 = nn.BatchNorm2d(planes)
 48 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 49 |         self.bn3 = nn.BatchNorm2d(planes)
 50 |         self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
 51 | 
 52 |         if stride != 1 or in_planes != self.expansion*planes:
 53 |             self.shortcut = nn.Sequential(
 54 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
 55 |             )
 56 | 
 57 |     def forward(self, x):
 58 |         out = F.relu(self.bn1(x))
 59 |         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
 60 |         out = self.conv1(out)
 61 |         out = self.conv2(F.relu(self.bn2(out)))
 62 |         out = self.conv3(F.relu(self.bn3(out)))
 63 |         out += shortcut
 64 |         return out
 65 | 
 66 | 
 67 | class PreActResNet(nn.Module):
 68 |     def __init__(self, block, num_blocks, num_classes=10, in_channels=3):
 69 |         super(PreActResNet, self).__init__()
 70 |         self.in_planes = 64
 71 |         last_planes = 512*block.expansion
 72 | 
 73 |         self.conv1 = conv3x3(in_channels, 64)
 74 |         self.stage1 = self._make_layer(block, 64, num_blocks[0], stride=1)
 75 |         self.stage2 = self._make_layer(block, 128, num_blocks[1], stride=2)
 76 |         self.stage3 = self._make_layer(block, 256, num_blocks[2], stride=2)
 77 |         self.stage4 = self._make_layer(block, 512, num_blocks[3], stride=2)
 78 |         self.bn_last = nn.BatchNorm2d(last_planes)
 79 |         self.last = nn.Linear(last_planes, num_classes)
 80 | 
 81 |     def _make_layer(self, block, planes, num_blocks, stride):
 82 |         strides = [stride] + [1]*(num_blocks-1)
 83 |         layers = []
 84 |         for stride in strides:
 85 |             layers.append(block(self.in_planes, planes, stride))
 86 |             self.in_planes = planes * block.expansion
 87 |         return nn.Sequential(*layers)
 88 | 
 89 |     def features(self, x):
 90 |         out = self.conv1(x)
 91 |         out = self.stage1(out)
 92 |         out = self.stage2(out)
 93 |         out = self.stage3(out)
 94 |         out = self.stage4(out)
 95 |         return out
 96 | 
 97 |     def logits(self, x):
 98 |         x = self.last(x)
 99 |         return x
100 | 
101 |     def forward(self, x):
102 |         x = self.features(x)
103 |         x = F.relu(self.bn_last(x))
104 |         x = F.adaptive_avg_pool2d(x, 1)
105 |         x = self.logits(x.view(x.size(0), -1))
106 |         return x
107 | 
108 | 
class PreActResNet_cifar(nn.Module):
    """Three-stage pre-activation ResNet with a linear classifier ``self.last``.

    Args:
        block: residual block class; must expose ``expansion`` and accept
            ``(in_planes, planes, stride, droprate)``.
        num_blocks: number of blocks in each of the three stages.
        filters: ``planes`` value for each of the three stages.
        num_classes: output size of ``self.last``.
        droprate: forwarded to every block.
    """

    def __init__(self, block, num_blocks, filters, num_classes=10, droprate=0):
        super(PreActResNet_cifar, self).__init__()
        self.in_planes = 16
        last_planes = filters[2]*block.expansion

        self.conv1 = conv3x3(3, self.in_planes)
        n1, n2, n3 = num_blocks[0], num_blocks[1], num_blocks[2]
        self.stage1 = self._make_layer(block, filters[0], n1, stride=1, droprate=droprate)
        self.stage2 = self._make_layer(block, filters[1], n2, stride=2, droprate=droprate)
        self.stage3 = self._make_layer(block, filters[2], n3, stride=2, droprate=droprate)
        self.bn_last = nn.BatchNorm2d(last_planes)
        self.last = nn.Linear(last_planes, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride, droprate):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``."""
        stage = []
        for block_stride in [stride] + [1]*(num_blocks-1):
            stage.append(block(self.in_planes, planes, block_stride, droprate))
            # The next block consumes what this one produces.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def features(self, x):
        """Run the stem convolution and the three stages."""
        feats = self.conv1(x)
        for stage in (self.stage1, self.stage2, self.stage3):
            feats = stage(feats)
        return feats

    def logits(self, x):
        """Apply the classifier ``self.last``."""
        return self.last(x)

    def forward(self, x):
        feats = self.features(x)
        feats = F.relu(self.bn_last(feats))
        # Fixed 8x8 pooling window.
        pooled = F.avg_pool2d(feats, 8)
        flat = pooled.view(pooled.size(0), -1)
        return self.logits(flat)
161 | 
162 | 
163 | # ResNet for Cifar10/100 or the dataset with image size 32x32
164 | 
def ResNet20_cifar(out_dim=10):
    """PreActResNet_cifar of PreActBlock: 3 blocks per stage, filters 16/32/64."""
    return PreActResNet_cifar(
        block=PreActBlock,
        num_blocks=[3, 3, 3],
        filters=[16, 32, 64],
        num_classes=out_dim,
    )
167 | 
def ResNet56_cifar(out_dim=10):
    """PreActResNet_cifar of PreActBlock: 9 blocks per stage, filters 16/32/64."""
    return PreActResNet_cifar(
        block=PreActBlock,
        num_blocks=[9, 9, 9],
        filters=[16, 32, 64],
        num_classes=out_dim,
    )
170 | 
def ResNet110_cifar(out_dim=10):
    """PreActResNet_cifar of PreActBlock: 18 blocks per stage, filters 16/32/64."""
    return PreActResNet_cifar(
        block=PreActBlock,
        num_blocks=[18, 18, 18],
        filters=[16, 32, 64],
        num_classes=out_dim,
    )
173 | 
def ResNet29_cifar(out_dim=10):
    """PreActResNet_cifar of PreActBottleneck: 3 blocks per stage, filters 16/32/64."""
    return PreActResNet_cifar(
        block=PreActBottleneck,
        num_blocks=[3, 3, 3],
        filters=[16, 32, 64],
        num_classes=out_dim,
    )
176 | 
def ResNet164_cifar(out_dim=10):
    """PreActResNet_cifar of PreActBottleneck: 18 blocks per stage, filters 16/32/64."""
    return PreActResNet_cifar(
        block=PreActBottleneck,
        num_blocks=[18, 18, 18],
        filters=[16, 32, 64],
        num_classes=out_dim,
    )
179 | 
def WideResNet_28_2_cifar(out_dim=10):
    """PreActResNet_cifar of PreActBlock: 4 blocks per stage, filters 32/64/128."""
    return PreActResNet_cifar(
        block=PreActBlock,
        num_blocks=[4, 4, 4],
        filters=[32, 64, 128],
        num_classes=out_dim,
    )
182 | 
def WideResNet_28_2_drop_cifar(out_dim=10):
    """PreActResNet_cifar of PreActBlock: 4 blocks per stage, filters 32/64/128, droprate 0.3."""
    return PreActResNet_cifar(
        block=PreActBlock,
        num_blocks=[4, 4, 4],
        filters=[32, 64, 128],
        num_classes=out_dim,
        droprate=0.3,
    )
185 | 
def WideResNet_28_10_cifar(out_dim=10):
    """PreActResNet_cifar of PreActBlock: 4 blocks per stage, filters 160/320/640."""
    return PreActResNet_cifar(
        block=PreActBlock,
        num_blocks=[4, 4, 4],
        filters=[160, 320, 640],
        num_classes=out_dim,
    )
188 | 
189 | # ResNet for general purpose. Ex:ImageNet
190 | 
def ResNet10(out_dim=10):
    """PreActResNet of PreActBlock with one block in each of the four stages."""
    return PreActResNet(block=PreActBlock, num_blocks=[1, 1, 1, 1], num_classes=out_dim)
193 | 
def ResNet18S(out_dim=10):
    """PreActResNet of PreActBlock, stages [2, 2, 2, 2], for 1-channel input."""
    return PreActResNet(
        block=PreActBlock,
        num_blocks=[2, 2, 2, 2],
        num_classes=out_dim,
        in_channels=1,
    )
196 | 
def ResNet18(out_dim=10):
    """PreActResNet of PreActBlock with stages [2, 2, 2, 2]."""
    return PreActResNet(block=PreActBlock, num_blocks=[2, 2, 2, 2], num_classes=out_dim)
199 | 
def ResNet34(out_dim=10):
    """PreActResNet of PreActBlock with stages [3, 4, 6, 3]."""
    return PreActResNet(block=PreActBlock, num_blocks=[3, 4, 6, 3], num_classes=out_dim)
202 | 
def ResNet50(out_dim=10):
    """PreActResNet of PreActBottleneck with stages [3, 4, 6, 3]."""
    return PreActResNet(block=PreActBottleneck, num_blocks=[3, 4, 6, 3], num_classes=out_dim)
205 | 
def ResNet101(out_dim=10):
    """PreActResNet of PreActBottleneck with stages [3, 4, 23, 3]."""
    return PreActResNet(block=PreActBottleneck, num_blocks=[3, 4, 23, 3], num_classes=out_dim)
208 | 
def ResNet152(out_dim=10):
    """PreActResNet of PreActBottleneck with stages [3, 8, 36, 3]."""
    return PreActResNet(block=PreActBottleneck, num_blocks=[3, 8, 36, 3], num_classes=out_dim)


--------------------------------------------------------------------------------
/cl/agents/exp_replay.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import numpy as np
  3 | from importlib import import_module
  4 | from .default import NormalNN
  5 | from .regularization import SI, L2, EWC, MAS
  6 | from dataloaders.wrapper import Storage
  7 | 
  8 | 
  9 | class Memory(Storage):
 10 |     def reduce(self, m):
 11 |         self.storage = self.storage[:m]
 12 | 
 13 | 
 14 | class Naive_Rehearsal(NormalNN):
 15 | 
 16 |     def __init__(self, agent_config):
 17 |         super(Naive_Rehearsal, self).__init__(agent_config)
 18 |         self.task_count = 0
 19 |         self.memory_size = 1000
 20 |         self.task_memory = {}
 21 | 
 22 |     def learn_batch(self, train_loader, val_loader=None):
 23 |         # 1.Combine training set
 24 |         dataset_list = []
 25 |         for storage in self.task_memory.values():
 26 |             dataset_list.append(storage)
 27 |         dataset_list *= max(len(train_loader.dataset)//self.memory_size,1)  # Let old data: new data = 1:1
 28 |         dataset_list.append(train_loader.dataset)
 29 |         dataset = torch.utils.data.ConcatDataset(dataset_list)
 30 |         new_train_loader = torch.utils.data.DataLoader(dataset,
 31 |                                                        batch_size=train_loader.batch_size,
 32 |                                                        shuffle=True,
 33 |                                                        num_workers=train_loader.num_workers)
 34 | 
 35 |         # 2.Update model as normal
 36 |         super(Naive_Rehearsal, self).learn_batch(new_train_loader, val_loader)
 37 | 
 38 |         # 3.Randomly decide the images to stay in the memory
 39 |         self.task_count += 1
 40 |         # (a) Decide the number of samples for being saved
 41 |         num_sample_per_task = self.memory_size // self.task_count
 42 |         num_sample_per_task = min(len(train_loader.dataset),num_sample_per_task)
 43 |         # (b) Reduce current exemplar set to reserve the space for the new dataset
 44 |         for storage in self.task_memory.values():
 45 |             storage.reduce(num_sample_per_task)
 46 |         # (c) Randomly choose some samples from new task and save them to the memory
 47 |         self.task_memory[self.task_count] = Memory()  # Initialize the memory slot
 48 |         randind = torch.randperm(len(train_loader.dataset))[:num_sample_per_task]  # randomly sample some data
 49 |         for ind in randind:  # save it to the memory
 50 |             self.task_memory[self.task_count].append(train_loader.dataset[ind])
 51 | 
 52 | 
 53 | class Naive_Rehearsal_SI(Naive_Rehearsal, SI):
 54 | 
 55 |     def __init__(self, agent_config):
 56 |         super(Naive_Rehearsal_SI, self).__init__(agent_config)
 57 | 
 58 | 
 59 | class Naive_Rehearsal_L2(Naive_Rehearsal, L2):
 60 | 
 61 |     def __init__(self, agent_config):
 62 |         super(Naive_Rehearsal_L2, self).__init__(agent_config)
 63 | 
 64 | 
 65 | class Naive_Rehearsal_EWC(Naive_Rehearsal, EWC):
 66 | 
 67 |     def __init__(self, agent_config):
 68 |         super(Naive_Rehearsal_EWC, self).__init__(agent_config)
 69 |         self.online_reg = True  # Online EWC
 70 | 
 71 | 
 72 | class Naive_Rehearsal_MAS(Naive_Rehearsal, MAS):
 73 | 
 74 |     def __init__(self, agent_config):
 75 |         super(Naive_Rehearsal_MAS, self).__init__(agent_config)
 76 | 
 77 | 
 78 | class GEM(Naive_Rehearsal):
 79 |     """
 80 |     @inproceedings{GradientEpisodicMemory,
 81 |         title={Gradient Episodic Memory for Continual Learning},
 82 |         author={Lopez-Paz, David and Ranzato, Marc'Aurelio},
 83 |         booktitle={NIPS},
 84 |         year={2017},
 85 |         url={https://arxiv.org/abs/1706.08840}
 86 |     }
 87 |     """
 88 | 
 89 |     def __init__(self, agent_config):
 90 |         super(GEM, self).__init__(agent_config)
 91 |         self.params = {n: p for n, p in self.model.named_parameters() if p.requires_grad}  # For convenience
 92 |         self.task_grads = {}
 93 |         self.quadprog = import_module('quadprog')
 94 |         self.task_mem_cache = {}
 95 | 
 96 |     def grad_to_vector(self):
 97 |         vec = []
 98 |         for n,p in self.params.items():
 99 |             if p.grad is not None:
100 |                 vec.append(p.grad.view(-1))
101 |             else:
102 |                 # Part of the network might has no grad, fill zero for those terms
103 |                 vec.append(p.data.clone().fill_(0).view(-1))
104 |         return torch.cat(vec)
105 | 
106 |     def vector_to_grad(self, vec):
107 |         # Overwrite current param.grad by slicing the values in vec (flatten grad)
108 |         pointer = 0
109 |         for n, p in self.params.items():
110 |             # The length of the parameter
111 |             num_param = p.numel()
112 |             if p.grad is not None:
113 |                 # Slice the vector, reshape it, and replace the old data of the grad
114 |                 p.grad.copy_(vec[pointer:pointer + num_param].view_as(p))
115 |                 # Part of the network might has no grad, ignore those terms
116 |             # Increment the pointer
117 |             pointer += num_param
118 | 
119 |     def project2cone2(self, gradient, memories):
120 |         """
121 |             Solves the GEM dual QP described in the paper given a proposed
122 |             gradient "gradient", and a memory of task gradients "memories".
123 |             Overwrites "gradient" with the final projected update.
124 | 
125 |             input:  gradient, p-vector
126 |             input:  memories, (t * p)-vector
127 |             output: x, p-vector
128 | 
129 |             Modified from: https://github.com/facebookresearch/GradientEpisodicMemory/blob/master/model/gem.py#L70
130 |         """
131 |         margin = self.config['reg_coef']
132 |         memories_np = memories.cpu().contiguous().double().numpy()
133 |         gradient_np = gradient.cpu().contiguous().view(-1).double().numpy()
134 |         t = memories_np.shape[0]
135 |         #print(memories_np.shape, gradient_np.shape)
136 |         P = np.dot(memories_np, memories_np.transpose())
137 |         P = 0.5 * (P + P.transpose())
138 |         q = np.dot(memories_np, gradient_np) * -1
139 |         G = np.eye(t)
140 |         P = P + G * 0.001
141 |         h = np.zeros(t) + margin
142 |         v = self.quadprog.solve_qp(P, q, G, h)[0]
143 |         x = np.dot(v, memories_np) + gradient_np
144 |         new_grad = torch.Tensor(x).view(-1)
145 |         if self.gpu:
146 |             new_grad = new_grad.cuda()
147 |         return new_grad
148 | 
149 |     def learn_batch(self, train_loader, val_loader=None):
150 | 
151 |         # 1.Update model as normal
152 |         super(GEM, self).learn_batch(train_loader, val_loader)
153 | 
154 |         # 2.Randomly decide the images to stay in the memory
155 |         self.task_count += 1
156 |         # (a) Decide the number of samples for being saved
157 |         num_sample_per_task = self.memory_size // self.task_count
158 |         num_sample_per_task = min(len(train_loader.dataset),num_sample_per_task)
159 |         # (b) Reduce current exemplar set to reserve the space for the new dataset
160 |         for storage in self.task_memory.values():
161 |             storage.reduce(num_sample_per_task)
162 |         # (c) Randomly choose some samples from new task and save them to the memory
163 |         self.task_memory[self.task_count] = Memory()  # Initialize the memory slot
164 |         randind = torch.randperm(len(train_loader.dataset))[:num_sample_per_task]  # randomly sample some data
165 |         for ind in randind:  # save it to the memory
166 |             self.task_memory[self.task_count].append(train_loader.dataset[ind])
167 |         # (d) Cache the data for faster processing
168 |         for t, mem in self.task_memory.items():
169 |             # Concatenate all data in each task
170 |             mem_loader = torch.utils.data.DataLoader(mem,
171 |                                                      batch_size=len(mem),
172 |                                                      shuffle=False,
173 |                                                      num_workers=2)
174 |             assert len(mem_loader)==1,'The length of mem_loader should be 1'
175 |             for i, (mem_input, mem_target, mem_task) in enumerate(mem_loader):
176 |                 if self.gpu:
177 |                     mem_input = mem_input.cuda()
178 |                     mem_target = mem_target.cuda()
179 |             self.task_mem_cache[t] = {'data':mem_input,'target':mem_target,'task':mem_task}
180 | 
181 |     def update_model(self, inputs, targets, tasks):
182 | 
183 |         # compute gradient on previous tasks
184 |         if self.task_count > 0:
185 |             for t,mem in self.task_memory.items():
186 |                 self.zero_grad()
187 |                 # feed the data from memory and collect the gradients
188 |                 mem_out = self.forward(self.task_mem_cache[t]['data'])
189 |                 mem_loss = self.criterion(mem_out, self.task_mem_cache[t]['target'], self.task_mem_cache[t]['task'])
190 |                 mem_loss.backward()
191 |                 # Store the grads
192 |                 self.task_grads[t] = self.grad_to_vector()
193 | 
194 |         # now compute the grad on the current minibatch
195 |         out = self.forward(inputs)
196 |         loss = self.criterion(out, targets, tasks)
197 |         self.optimizer.zero_grad()
198 |         loss.backward()
199 | 
200 |         # check if gradient violates constraints
201 |         if self.task_count > 0:
202 |             current_grad_vec = self.grad_to_vector()
203 |             mem_grad_vec = torch.stack(list(self.task_grads.values()))
204 |             dotp = current_grad_vec * mem_grad_vec
205 |             dotp = dotp.sum(dim=1)
206 |             if (dotp < 0).sum() != 0:
207 |                 new_grad = self.project2cone2(current_grad_vec, mem_grad_vec)
208 |                 # copy gradients back
209 |                 self.vector_to_grad(new_grad)
210 | 
211 |         self.optimizer.step()
212 |         return loss.detach(), out
213 | 


--------------------------------------------------------------------------------
/cl/iBatchLearn.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import argparse
  4 | import torch
  5 | import numpy as np
  6 | from random import shuffle
  7 | from collections import OrderedDict
  8 | import dataloaders.base
  9 | from dataloaders.datasetGen import SplitGen, PermutedGen
 10 | import agents
 11 | 
 12 | 
 13 | def run(args):
 14 |     if not os.path.exists('outputs'):
 15 |         os.mkdir('outputs')
 16 | 
 17 |     # Prepare dataloaders
 18 |     train_dataset, val_dataset = dataloaders.base.__dict__[args.dataset](args.dataroot, args.train_aug)
 19 |     if args.n_permutation>0:
 20 |         train_dataset_splits, val_dataset_splits, task_output_space = PermutedGen(train_dataset, val_dataset,
 21 |                                                                              args.n_permutation,first_split_sz=args.first_split_size,
 22 |                                                                           other_split_sz=args.other_split_size,
 23 |                                                                           rand_split=args.rand_split,
 24 |                                                                              remap_class=not args.no_class_remap)
 25 |     else:
 26 |         train_dataset_splits, val_dataset_splits, task_output_space = SplitGen(train_dataset, val_dataset,
 27 |                                                                           first_split_sz=args.first_split_size,
 28 |                                                                           other_split_sz=args.other_split_size,
 29 |                                                                           rand_split=args.rand_split,
 30 |                                                                           remap_class=not args.no_class_remap)
 31 |     # Prepare the Agent (model)
 32 |     agent_config = {'lr': args.lr, 'momentum': args.momentum, 'weight_decay': args.weight_decay,'schedule': args.schedule,
 33 |                     'model_type':args.model_type, 'model_name': args.model_name, 'model_weights':args.model_weights,
 34 |                     'out_dim':{'All':args.force_out_dim} if args.force_out_dim>0 else task_output_space,
 35 |                     'optimizer':args.optimizer,
 36 |                     'print_freq':args.print_freq, 'gpuid': args.gpuid,
 37 |                     'reg_coef':args.reg_coef, 'mode':args.mode}
 38 |     agent = agents.__dict__[args.agent_type].__dict__[args.agent_name](agent_config)
 39 |     print(agent.model)
 40 |     print('#parameter of model: ',agent.count_parameter())
 41 | 
 42 |     num_params = 0
 43 |     for p in agent.model.parameters():
 44 |         if p.requires_grad:
 45 |             num_params += p.data.view(-1).size(0)
 46 | 
 47 |     print("no learnable params: ", num_params)
 48 | 
 49 |     # Decide split ordering
 50 |     task_names = sorted(list(task_output_space.keys()), key=int)
 51 |     print('Task order:',task_names)
 52 |     if args.rand_split_order:
 53 |         shuffle(task_names)
 54 |         print('Shuffled task order:', task_names)
 55 | 
 56 |     acc_table = OrderedDict()
 57 |     if args.offline_training:  # Non-incremental learning / offline_training / measure the upper-bound performance
 58 |         task_names = ['All']
 59 |         train_dataset_all = torch.utils.data.ConcatDataset(train_dataset_splits.values())
 60 |         val_dataset_all = torch.utils.data.ConcatDataset(val_dataset_splits.values())
 61 |         train_loader = torch.utils.data.DataLoader(train_dataset_all,
 62 |                                                    batch_size=args.batch_size, shuffle=True, num_workers=args.workers)
 63 |         val_loader = torch.utils.data.DataLoader(val_dataset_all,
 64 |                                                  batch_size=args.batch_size, shuffle=False, num_workers=args.workers)
 65 | 
 66 |         agent.learn_batch(train_loader, val_loader)
 67 | 
 68 |         acc_table['All'] = {}
 69 |         acc_table['All']['All'] = agent.validation(val_loader)
 70 | 
 71 |     else:  # Incremental learning
 72 |         # Feed data to agent and evaluate agent's performance
 73 |         for i in range(len(task_names)):
 74 |             train_name = task_names[i]
 75 |             print('======================',train_name,'=======================')
 76 |             train_loader = torch.utils.data.DataLoader(train_dataset_splits[train_name],
 77 |                                                         batch_size=args.batch_size, shuffle=True, num_workers=args.workers)
 78 |             val_loader = torch.utils.data.DataLoader(val_dataset_splits[train_name],
 79 |                                                       batch_size=args.batch_size, shuffle=False, num_workers=args.workers)
 80 | 
 81 |             if args.incremental_class:
 82 |                 agent.add_valid_output_dim(task_output_space[train_name])
 83 | 
 84 |             # Learn
 85 |             agent.learn_batch(train_loader, val_loader)
 86 | 
 87 |             # Evaluate
 88 |             acc_table[train_name] = OrderedDict()
 89 |             for j in range(i+1):
 90 |                 val_name = task_names[j]
 91 |                 print('validation split name:', val_name)
 92 |                 val_data = val_dataset_splits[val_name] if not args.eval_on_train_set else train_dataset_splits[val_name]
 93 |                 val_loader = torch.utils.data.DataLoader(val_data,
 94 |                                                          batch_size=args.batch_size, shuffle=False,
 95 |                                                          num_workers=args.workers)
 96 |                 acc_table[val_name][train_name] = agent.validation(val_loader)
 97 | 
 98 |     return acc_table, task_names
 99 | 
def get_args(argv):
    """Parse the experiment's command-line options.

    :param argv: list of argument strings (typically ``sys.argv[1:]``).
    :return: the populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    def add_switch(flag, help_text):
        # Boolean switch: store_true already defaults to False and derives
        # the destination name from the flag.
        add(flag, action='store_true', help=help_text)

    # Model / agent selection
    add('--gpuid', nargs="+", type=int, default=[0],
        help="The list of gpuid, ex:--gpuid 3 1. Negative value means cpu-only")
    add('--model_type', type=str, default='mlp', help="The type (mlp|lenet|vgg|resnet) of backbone network")
    add('--model_name', type=str, default='MLP', help="The name of actual model for the backbone")
    add('--force_out_dim', type=int, default=2, help="Set 0 to let the task decide the required output dimension")
    add('--agent_type', type=str, default='default', help="The type (filename) of agent")
    add('--agent_name', type=str, default='NormalNN', help="The class name of agent")
    add('--optimizer', type=str, default='SGD', help="SGD|Adam|RMSprop|amsgrad|Adadelta|Adagrad|Adamax ...")

    # Data and task splitting
    add('--dataroot', type=str, default='data', help="The root folder of dataset or downloaded data")
    add('--dataset', type=str, default='MNIST', help="MNIST(default)|CIFAR10|CIFAR100")
    add('--n_permutation', type=int, default=0, help="Enable permuted tests when >0")
    add('--first_split_size', type=int, default=2)
    add('--other_split_size', type=int, default=2)
    add_switch('--no_class_remap',
               "Avoid the dataset with a subset of classes doing the remapping. Ex: [2,5,6 ...] -> [0,1,2 ...]")
    add_switch('--train_aug', "Allow data augmentation during training")
    add_switch('--rand_split', "Randomize the classes in splits")
    add_switch('--rand_split_order', "Randomize the order of splits")

    # Optimization
    add('--workers', type=int, default=3, help="#Thread for dataloader")
    add('--mode', type=str, default="mlp")
    add('--batch_size', type=int, default=100)
    add('--lr', type=float, default=0.01, help="Learning rate")
    add('--momentum', type=float, default=0)
    add('--weight_decay', type=float, default=0)
    add('--schedule', nargs="+", type=int, default=[2],
        help="The list of epoch numbers to reduce learning rate by factor of 0.1. Last number is the end epoch")
    add('--print_freq', type=float, default=100, help="Print the log at every x iteration")
    add('--model_weights', type=str, default=None,
        help="The path to the file for the model weights (*.pth).")
    add('--reg_coef', nargs="+", type=float, default=[0.], help="The coefficient for regularization. Larger means less plasilicity. Give a list for hyperparameter search.")

    # Evaluation protocol
    add_switch('--eval_on_train_set', "Force the evaluation on train set")
    add_switch('--offline_training',
               "Non-incremental learning by make all data available in one batch. For measuring the upperbound performance.")
    add('--repeat', type=int, default=1, help="Repeat the experiment N times")
    add_switch('--incremental_class',
               "The number of output node in the single-headed model increases along with new categories.")

    return parser.parse_args(argv)
145 | 
if __name__ == '__main__':
    # Entry point: sweep over every requested regularization coefficient and
    # repeat each setting ``args.repeat`` times, reporting mean/std of the
    # final average accuracy.
    args = get_args(sys.argv[1:])
    reg_coef_list = args.reg_coef
    avg_final_acc = {}

    # The for loops over hyper-parameters or repeats
    for reg_coef in reg_coef_list:
        args.reg_coef = reg_coef  # run() expects a single coefficient
        final_accs = np.zeros(args.repeat)
        avg_final_acc[reg_coef] = final_accs
        for r in range(args.repeat):

            # Run the experiment
            acc_table, task_names = run(args)
            print(acc_table)

            # Calculate average performance across tasks
            # Customize this part for a different performance metric
            avg_acc_history = []
            for n_seen, train_name in enumerate(task_names, start=1):
                cls_acc_sum = 0
                for val_name in task_names[:n_seen]:
                    cls_acc_sum += acc_table[val_name][train_name]
                avg_acc_history.append(cls_acc_sum / n_seen)
                print('Task', train_name, 'average acc:', avg_acc_history[-1])

            # Gather the final avg accuracy
            final_accs[r] = avg_acc_history[-1]

            # Print the summary so far
            print('===Summary of experiment repeats:',r+1,'/',args.repeat,'===')
            print('The regularization coefficient:', args.reg_coef)
            print('The last avg acc of all repeats:', final_accs)
            print('mean:', final_accs.mean(), 'std:', final_accs.std())
    for reg_coef, accs in avg_final_acc.items():
        print('reg_coef:', reg_coef,'mean:', accs.mean(), 'std:', accs.std())
183 | 


--------------------------------------------------------------------------------
/cl/agents/default.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import torch
  3 | import torch.nn as nn
  4 | from types import MethodType
  5 | import models
  6 | from utils.metric import accuracy, AverageMeter, Timer
  7 | 
  8 | class NormalNN(nn.Module):
  9 |     '''
 10 |     Normal Neural Network with SGD for classification
 11 |     '''
 12 |     def __init__(self, agent_config):
 13 |         '''
 14 |         :param agent_config (dict): lr=float,momentum=float,weight_decay=float,
 15 |                                     schedule=[int],  # The last number in the list is the end of epoch
 16 |                                     model_type=str,model_name=str,out_dim={task:dim},model_weights=str
 17 |                                     force_single_head=bool
 18 |                                     print_freq=int
 19 |                                     gpuid=[int]
 20 |         '''
 21 |         super(NormalNN, self).__init__()
 22 |         self.log = print if agent_config['print_freq'] > 0 else lambda \
 23 |             *args: None  # Use a void function to replace the print
 24 |         self.config = agent_config
 25 |         # If out_dim is a dict, there is a list of tasks. The model will have a head for each task.
 26 |         self.multihead = True if len(self.config['out_dim'])>1 else False  # A convenience flag to indicate multi-head/task
 27 |         self.model = self.create_model()
 28 |         self.criterion_fn = nn.CrossEntropyLoss()
 29 |         if agent_config['gpuid'][0] >= 0:
 30 |             self.cuda()
 31 |             self.gpu = True
 32 |         else:
 33 |             self.gpu = False
 34 |         self.init_optimizer()
 35 |         self.reset_optimizer = False
 36 |         self.valid_out_dim = 'ALL'  # Default: 'ALL' means all output nodes are active
 37 |                                     # Set a interger here for the incremental class scenario
 38 | 
 39 |     def init_optimizer(self):
 40 |         optimizer_arg = {'params':self.model.parameters(),
 41 |                          'lr':self.config['lr'],
 42 |                          'weight_decay':self.config['weight_decay']}
 43 |         if self.config['optimizer'] in ['SGD','RMSprop']:
 44 |             optimizer_arg['momentum'] = self.config['momentum']
 45 |         elif self.config['optimizer'] in ['Rprop']:
 46 |             optimizer_arg.pop('weight_decay')
 47 |         elif self.config['optimizer'] == 'amsgrad':
 48 |             optimizer_arg['amsgrad'] = True
 49 |             self.config['optimizer'] = 'Adam'
 50 | 
 51 |         self.optimizer = torch.optim.__dict__[self.config['optimizer']](**optimizer_arg)
 52 |         self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer, milestones=self.config['schedule'],
 53 |                                                               gamma=0.1)
 54 | 
 55 |     def create_model(self):
 56 |         cfg = self.config
 57 | 
 58 |         # Define the backbone (MLP, LeNet, VGG, ResNet ... etc) of model
 59 |         model = models.__dict__[cfg['model_type']].__dict__[cfg['model_name']](mode=cfg['mode'])
 60 | 
 61 |         # Apply network surgery to the backbone
 62 |         # Create the heads for tasks (It can be single task or multi-task)
 63 |         n_feat = model.last.in_features
 64 | 
 65 |         # The output of the model will be a dict: {task_name1:output1, task_name2:output2 ...}
 66 |         # For a single-headed model the output will be {'All':output}
 67 |         model.last = nn.ModuleDict()
 68 |         for task,out_dim in cfg['out_dim'].items():
 69 |             model.last[task] = model.get_last(n_feat,out_dim)
 70 | 
 71 |         # Redefine the task-dependent function
 72 |         def new_logits(self, x):
 73 |             outputs = {}
 74 |             for task, func in self.last.items():
 75 |                 outputs[task] = func(x)
 76 |             return outputs
 77 | 
 78 |         # Replace the task-dependent function
 79 |         model.logits = MethodType(new_logits, model)
 80 |         # Load pre-trained weights
 81 |         if cfg['model_weights'] is not None:
 82 |             print('=> Load model weights:', cfg['model_weights'])
 83 |             model_state = torch.load(cfg['model_weights'],
 84 |                                      map_location=lambda storage, loc: storage)  # Load to CPU.
 85 |             model.load_state_dict(model_state)
 86 |             print('=> Load Done')
 87 |         return model
 88 | 
 89 |     def forward(self, x):
 90 |         return self.model.forward(x)
 91 | 
 92 |     def predict(self, inputs):
 93 |         self.model.eval()
 94 |         out = self.forward(inputs)
 95 |         for t in out.keys():
 96 |             out[t] = out[t].detach()
 97 |         return out
 98 | 
 99 |     def validation(self, dataloader):
100 |         # This function doesn't distinguish tasks.
101 |         batch_timer = Timer()
102 |         acc = AverageMeter()
103 |         batch_timer.tic()
104 | 
105 |         orig_mode = self.training
106 |         self.eval()
107 |         for i, (input, target, task) in enumerate(dataloader):
108 | 
109 |             if self.gpu:
110 |                 with torch.no_grad():
111 |                     input = input.cuda()
112 |                     target = target.cuda()
113 |             output = self.predict(input)
114 | 
115 |             # Summarize the performance of all tasks, or 1 task, depends on dataloader.
116 |             # Calculated by total number of data.
117 |             acc = accumulate_acc(output, target, task, acc)
118 | 
119 |         self.train(orig_mode)
120 | 
121 |         self.log(' * Val Acc {acc.avg:.3f}, Total time {time:.2f}'
122 |               .format(acc=acc,time=batch_timer.toc()))
123 |         return acc.avg
124 | 
125 |     def criterion(self, preds, targets, tasks, **kwargs):
126 |         # The inputs and targets could come from single task or a mix of tasks
127 |         # The network always makes the predictions with all its heads
128 |         # The criterion will match the head and task to calculate the loss.
129 |         if self.multihead:
130 |             loss = 0
131 |             for t,t_preds in preds.items():
132 |                 inds = [i for i in range(len(tasks)) if tasks[i]==t]  # The index of inputs that matched specific task
133 |                 if len(inds)>0:
134 |                     t_preds = t_preds[inds]
135 |                     t_target = targets[inds]
136 |                     loss += self.criterion_fn(t_preds, t_target) * len(inds)  # restore the loss from average
137 |             loss /= len(targets)  # Average the total loss by the mini-batch size
138 |         else:
139 |             pred = preds['All']
140 |             if isinstance(self.valid_out_dim, int):  # (Not 'ALL') Mask out the outputs of unseen classes for incremental class scenario
141 |                 pred = preds['All'][:,:self.valid_out_dim]
142 |             loss = self.criterion_fn(pred, targets)
143 |         return loss
144 | 
145 |     def update_model(self, inputs, targets, tasks):
146 |         out = self.forward(inputs)
147 |         loss = self.criterion(out, targets, tasks)
148 |         loss = loss + self.model.get_ploss()
149 |         self.optimizer.zero_grad()
150 |         loss.backward()
151 |         self.optimizer.step()
152 |         return loss.detach(), out
153 | 
154 |     def learn_batch(self, train_loader, val_loader=None):
155 |         if self.reset_optimizer:  # Reset optimizer before learning each task
156 |             self.log('Optimizer is reset!')
157 |             self.init_optimizer()
158 | 
159 |         for epoch in range(self.config['schedule'][-1]):
160 |             data_timer = Timer()
161 |             batch_timer = Timer()
162 |             batch_time = AverageMeter()
163 |             data_time = AverageMeter()
164 |             losses = AverageMeter()
165 |             acc = AverageMeter()
166 | 
167 |             # Config the model and optimizer
168 |             self.log('Epoch:{0}'.format(epoch))
169 |             self.model.train()
170 |             self.scheduler.step(epoch)
171 |             for param_group in self.optimizer.param_groups:
172 |                 self.log('LR:',param_group['lr'])
173 | 
174 |             # Learning with mini-batch
175 |             data_timer.tic()
176 |             batch_timer.tic()
177 |             self.log('Itr\t\tTime\t\t  Data\t\t  Loss\t\tAcc')
178 |             for i, (input, target, task) in enumerate(train_loader):
179 | 
180 |                 data_time.update(data_timer.toc())  # measure data loading time
181 | 
182 |                 if self.gpu:
183 |                     input = input.cuda()
184 |                     target = target.cuda()
185 | 
186 |                 loss, output = self.update_model(input, target, task)
187 |                 input = input.detach()
188 |                 target = target.detach()
189 | 
190 |                 # measure accuracy and record loss
191 |                 acc = accumulate_acc(output, target, task, acc)
192 |                 losses.update(loss, input.size(0))
193 | 
194 |                 batch_time.update(batch_timer.toc())  # measure elapsed time
195 |                 data_timer.toc()
196 | 
197 |                 if ((self.config['print_freq']>0) and (i % self.config['print_freq'] == 0)) or (i+1)==len(train_loader):
198 |                     self.log('[{0}/{1}]\t'
199 |                           '{batch_time.val:.4f} ({batch_time.avg:.4f})\t'
200 |                           '{data_time.val:.4f} ({data_time.avg:.4f})\t'
201 |                           '{loss.val:.3f} ({loss.avg:.3f})\t'
202 |                           '{acc.val:.2f} ({acc.avg:.2f})'.format(
203 |                         i, len(train_loader), batch_time=batch_time,
204 |                         data_time=data_time, loss=losses, acc=acc))
205 | 
206 |             self.log(' * Train Acc {acc.avg:.3f}'.format(acc=acc))
207 | 
208 |             # Evaluate the performance of current task
209 |             if val_loader != None:
210 |                 self.validation(val_loader)
211 | 
212 |     def learn_stream(self, data, label):
213 |         assert False,'No implementation yet'
214 | 
215 |     def add_valid_output_dim(self, dim=0):
216 |         # This function is kind of ad-hoc, but it is the simplest way to support incremental class learning
217 |         self.log('Incremental class: Old valid output dimension:', self.valid_out_dim)
218 |         if self.valid_out_dim == 'ALL':
219 |             self.valid_out_dim = 0  # Initialize it with zero
220 |         self.valid_out_dim += dim
221 |         self.log('Incremental class: New Valid output dimension:', self.valid_out_dim)
222 |         return self.valid_out_dim
223 | 
224 |     def count_parameter(self):
225 |         return sum(p.numel() for p in self.model.parameters())
226 | 
227 |     def save_model(self, filename):
228 |         model_state = self.model.state_dict()
229 |         if isinstance(self.model,torch.nn.DataParallel):
230 |             # Get rid of 'module' before the name of states
231 |             model_state = self.model.module.state_dict()
232 |         for key in model_state.keys():  # Always save it to cpu
233 |             model_state[key] = model_state[key].cpu()
234 |         print('=> Saving model to:', filename)
235 |         torch.save(model_state, filename + '.pth')
236 |         print('=> Save Done')
237 | 
238 |     def cuda(self):
239 |         torch.cuda.set_device(self.config['gpuid'][0])
240 |         self.model = self.model.cuda()
241 |         self.criterion_fn = self.criterion_fn.cuda()
242 |         # Multi-GPU
243 |         if len(self.config['gpuid']) > 1:
244 |             self.model = torch.nn.DataParallel(self.model, device_ids=self.config['gpuid'], output_device=self.config['gpuid'][0])
245 |         return self
246 | 
def accumulate_acc(output, target, task, meter):
    """Fold the accuracy of one batch into ``meter`` and return the meter.

    Arguments:
        output {dict} -- head name -> predictions; a single-headed model uses the key 'All'
        target -- ground-truth labels for the batch
        task -- per-sample task names, compared against the keys of ``output``
        meter -- running meter; its ``update(value, n)`` method is called

    Returns:
        The same ``meter`` object, updated.
    """
    if 'All' in output:  # Single-headed model
        meter.update(accuracy(output['All'], target), len(target))
        return meter

    # Outputs from a multi-headed (multi-task) model: score each head only on
    # the samples whose task name matches that head.
    for head, head_out in output.items():
        rows = [pos for pos, name in enumerate(task) if name == head]
        if not rows:
            continue
        meter.update(accuracy(head_out[rows], target[rows]), len(rows))

    return meter
259 | 


--------------------------------------------------------------------------------
/cl/models/nsa.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import math
  3 | import torch.nn as nn
  4 | import torch.nn.functional as F
  5 | from models.controller import LSTMController
  6 | 
  7 | 
  8 | δ = 1e-6
  9 | def θ(a, b, dimA=2, dimB=2, normBy=2):
 10 |     """Batchwise Cosine distance
 11 | 
 12 |     Cosine distance
 13 | 
 14 |     Arguments:
 15 |         a {Tensor} -- A 3D Tensor (b * m * w)
 16 |         b {Tensor} -- A 3D Tensor (b * r * w)
 17 | 
 18 |     Keyword Arguments:
 19 |         dimA {number} -- exponent value of the norm for `a` (default: {2})
 20 |         dimB {number} -- exponent value of the norm for `b` (default: {1})
 21 | 
 22 |     Returns:
 23 |         Tensor -- Batchwise cosine distance (b * r * m)
 24 |     """
 25 |     a_norm = torch.norm(a, normBy, dimA, keepdim=True).expand_as(a) + δ
 26 |     b_norm = torch.norm(b, normBy, dimB, keepdim=True).expand_as(b) + δ
 27 | 
 28 |     x = torch.bmm(a, b.transpose(1, 2)).transpose(1, 2) / (
 29 |             torch.bmm(a_norm, b_norm.transpose(1, 2)).transpose(1, 2) + δ)
 30 |     return x
 31 | 
 32 | 
 33 | class ProgammedController(nn.Module):
 34 |     def __init__(self, program_shape, program_interface_size, pkey_dim=5, num_program=20,
 35 |                  bias=False, svd_num_features=8, top_lu=10, has_res_w="n",
 36 |                  kc_mode="lk", rnn_step=10):
 37 |         super(ProgammedController, self).__init__()
 38 |         self.pkey_dim = pkey_dim
 39 |         self.program_size = num_program
 40 |         self.program_shape = program_shape
 41 |         self.has_bias = bias
 42 |         self.rnn_step = rnn_step
 43 |         self.top_lu = top_lu
 44 |         self.has_res_w = has_res_w
 45 |         self.svd_num_features = svd_num_features
 46 |         self.kc_mode = kc_mode
 47 | 
 48 |         self.PM_U = nn.Parameter(torch.zeros(self.program_size,
 49 |                                              self.pkey_dim + program_shape[0],
 50 |                                   requires_grad=True))
 51 | 
 52 |         self.PM_V = nn.Parameter(torch.zeros(self.program_size,
 53 |                                              self.pkey_dim + program_shape[1],
 54 |                                              requires_grad=True))
 55 |         self.PM_S = nn.Parameter(torch.zeros(self.program_size,self.pkey_dim+1,
 56 |                                              requires_grad=True))
 57 | 
 58 |         if self.has_res_w == "y":
 59 |             self.res_weight = nn.Parameter(torch.zeros(program_shape[0],
 60 |                                       program_shape[1],
 61 |                                       requires_grad=True))
 62 | 
 63 |         if self.rnn_step == 0:
 64 | 
 65 |             self.program_key_u = nn.Linear(program_interface_size,self.svd_num_features*self.pkey_dim)
 66 |             self.program_key_v = nn.Linear(program_interface_size, self.svd_num_features*self.pkey_dim)
 67 |             self.program_key_s= nn.Linear(program_interface_size, self.svd_num_features*self.pkey_dim)
 68 |             self.res_s = nn.Linear(program_interface_size, 1)
 69 | 
 70 |         else:
 71 | 
 72 |             self.rnn_program_controller = LSTMController(num_inputs=program_interface_size,
 73 |                                                          num_outputs=self.svd_num_features*self.pkey_dim*3+1,
 74 |                                                          num_layers=1)
 75 |             self.component_map = nn.Linear(self.svd_num_features*self.pkey_dim*3+1, program_interface_size)
 76 | 
 77 |             if self.top_lu>0:
 78 |                 self.read_mode = nn.Linear(program_interface_size, 3*num_program)
 79 | 
 80 |         if self.kc_mode == "cb":
 81 |             self.p2ku = nn.Linear(program_shape[0], self.pkey_dim)
 82 |             self.p2kv = nn.Linear(program_shape[1], self.pkey_dim)
 83 | 
 84 |         for name, param in self.named_parameters():
 85 |             if "PM" not in name:
 86 |                 param.requires_grad = True
 87 | 
 88 |         self.bias = nn.Parameter(torch.zeros(program_shape[1],
 89 |                                              requires_grad=True))
 90 |         self.record_Ua = []
 91 |         self.record_Va = []
 92 |         self.record_Sa = []
 93 | 
 94 | 
 95 | 
 96 |     def initialize(self):
 97 |         nn.init.xavier_uniform_(self.PM_U, gain=1)
 98 |         nn.init.xavier_uniform_(self.PM_V, gain=1)
 99 |         nn.init.xavier_uniform_(self.PM_S, gain=1)
100 |         if self.has_res_w == "y":
101 |             nn.init.xavier_uniform_(self.res_weight, gain=1)
102 |             nn.init.normal_(self.bias, std=0.01)
103 | 
104 |         if self.rnn_step == 0:
105 |             nn.init.xavier_uniform_(self.program_key_u.weight, gain=1)
106 |             nn.init.normal_(self.program_key_u.bias, std=0.01)
107 |             nn.init.xavier_uniform_(self.program_key_v.weight, gain=1)
108 |             nn.init.normal_(self.program_key_v.bias, std=0.01)
109 |             nn.init.xavier_uniform_(self.program_key_s.weight, gain=1)
110 |             nn.init.normal_(self.program_key_s.bias, std=0.01)
111 |             nn.init.xavier_uniform_(self.res_s.weight, gain=1)
112 |             nn.init.normal_(self.res_s.bias, std=0.01)
113 |         else:
114 |             self.rnn_program_controller.reset_parameters()
115 | 
116 |     def init_seq(self):
117 |         self.record_Ua=[]
118 |         self.record_Va=[]
119 |         self.record_Sa=[]
120 | 
121 |     def updateMK(self, kc_mode=None):
122 |         if self.kc_mode == "lk":
123 |             self.PK_U = self.PM_U[:,:self.pkey_dim]
124 |             self.PK_V = self.PM_V[:, :self.pkey_dim]
125 |             self.PK_S = self.PM_S[:, :self.pkey_dim]
126 |         elif self.kc_mode == "cb":
127 |             self.PK_U = self.p2ku(self.PM_U[:, self.pkey_dim:])
128 |             self.PK_V = self.p2kv(self.PM_V[:, self.pkey_dim:])
129 |             self.PK_S = self.PM_S[:, :self.pkey_dim]
130 | 
131 |     def get_reg_loss(self):
132 |         if torch.cuda.is_available():
133 |             I = torch.eye(self.program_size).cuda()
134 |         else:
135 |             I = torch.eye(self.program_size)
136 | 
137 |         ploss1 = torch.norm(torch.matmul(self.PM_U, self.PM_U.t()) - I)
138 |         ploss2 = torch.norm(torch.matmul(self.PM_V, self.PM_V.t()) - I)
139 |         return ploss1 + ploss2
140 | 
141 |     def read_Us(self, x):
142 |         MK = self.PK_U.repeat(x.shape[0], 1, 1)
143 |         MP = self.PM_U[:,self.pkey_dim:].repeat(x.shape[0], 1, 1)
144 |         ku = self.program_key_u(x).view(x.shape[0],self.svd_num_features,-1)
145 |         dU = θ(MK, ku)
146 |         self.record_Ua.append(dU)
147 |         dU = F.softmax(dU, dim=-1)
148 |         Us = torch.matmul(dU, MP)
149 |         return Us
150 | 
151 |     def read_Vs(self, x):
152 |         MK = self.PK_V.repeat(x.shape[0], 1, 1)
153 |         MP = self.PM_V[:,self.pkey_dim:].repeat(x.shape[0], 1, 1)
154 |         kv = self.program_key_v(x).view(x.shape[0],self.svd_num_features,-1)
155 |         dV = θ(MK, kv)
156 |         self.record_Va.append(dV)
157 |         dV = F.softmax(dV, dim=-1)
158 |         Vs = torch.matmul(dV, MP)
159 |         return Vs
160 | 
161 |     def read_Ss(self, x):
162 |         MK = self.PK_S.repeat(x.shape[0], 1, 1)
163 |         MP = self.PM_S[:,self.pkey_dim:].repeat(x.shape[0], 1, 1)
164 |         ks = self.program_key_s(x).view(x.shape[0],self.svd_num_features,-1)
165 |         dS = θ(MK, ks)
166 |         self.record_Sa.append(dS)
167 |         dS = F.softmax(dS, dim=-1)
168 |         Ss = torch.matmul(dS, MP)
169 |         Ss = F.softplus(Ss)
170 |         Ss = torch.cumsum(Ss, dim=1).squeeze(-1)
171 |         Ss = torch.diag_embed(Ss, offset=0, dim1=-2, dim2=-1)
172 |         return Ss
173 | 
174 |     def read_component(self, MP, MK, k, luw=None, rm=0):
175 |         d = θ(MK, k)
176 |         d = F.softmax(d*10, dim=-1)
177 | 
178 | 
179 |         if self.top_lu>0:
180 |             m, _ = torch.max(d, dim=-1)
181 |             d = d*1/m.unsqueeze(2)
182 |             d = d*(1-rm) + luw*rm
183 |         M = torch.matmul(d, MP)
184 |         return M, d
185 | 
186 |     def readPM_recurrent(self, x):
187 |         MKu = self.PK_U.repeat(x.shape[0], 1, 1)
188 |         MPu = self.PM_U[:, self.pkey_dim:].repeat(x.shape[0], 1, 1)
189 |         MKv = self.PK_V.repeat(x.shape[0], 1, 1)
190 |         MPv = self.PM_V[:, self.pkey_dim:].repeat(x.shape[0], 1, 1)
191 |         MKs = self.PK_S.repeat(x.shape[0], 1, 1)
192 |         MPs = self.PM_S[:, self.pkey_dim:].repeat(x.shape[0], 1, 1)
193 | 
194 |         U = []
195 |         V = []
196 |         S = []
197 | 
198 |         dUs = []
199 |         dVs = []
200 |         dSs = []
201 | 
202 |         state = self.rnn_program_controller.create_new_state(x.shape[0])
203 |         if self.top_lu>0:
204 |             luw_u = torch.zeros(x.shape[0], 1, self.program_size-self.top_lu)
205 |             luw_u2 = torch.ones(x.shape[0], 1, self.top_lu)
206 |             luw_u = torch.cat([luw_u2, luw_u], dim=-1)
207 |             luw_v = torch.zeros(x.shape[0], 1, self.program_size-self.top_lu)
208 |             luw_v2 = torch.ones(x.shape[0], 1, self.top_lu)
209 |             luw_v = torch.cat([luw_v2, luw_v], dim=-1)
210 |             luw_s = torch.zeros(x.shape[0], 1, self.program_size-self.top_lu)
211 |             luw_s2 = torch.ones(x.shape[0], 1, self.top_lu)
212 |             luw_s = torch.cat([luw_s2, luw_s], dim=-1)
213 | 
214 |             if torch.cuda.is_available():
215 |                 luw_u = luw_u.cuda()
216 |                 luw_v = luw_v.cuda()
217 |                 luw_s = luw_s.cuda()
218 | 
219 | 
220 |         for step in range(self.rnn_step):
221 |             interface, state = self.rnn_program_controller(x, state)
222 |             # x = self.component_map(interface)
223 |             key_u, key_v, key_s, rs = interface[:,:self.svd_num_features*self.pkey_dim],\
224 |                                       interface[:,self.svd_num_features*self.pkey_dim:self.svd_num_features*self.pkey_dim*2], \
225 |                                       interface[:, self.svd_num_features*self.pkey_dim*2:self.svd_num_features*self.pkey_dim*3],\
226 |                                       interface[:,self.svd_num_features*self.pkey_dim*3:]
227 | 
228 |             if self.top_lu==0:
229 |                 Ut, dU = self.read_component(MPu, MKu, key_u.view(x.shape[0],self.svd_num_features,-1))
230 |                 Vt, dV = self.read_component(MPv, MKv, key_v.view(x.shape[0],self.svd_num_features,-1))
231 |                 St, dS = self.read_component(MPs, MKs, key_s.view(x.shape[0],self.svd_num_features,-1))
232 |             else:
233 |                 rm = F.sigmoid(self.read_mode(x))
234 |                 rm_u = rm[:,:self.program_size].unsqueeze(1)
235 |                 rm_v = rm[:, self.program_size:self.program_size*2].unsqueeze(1)
236 |                 rm_s = rm[:, self.program_size*2:].unsqueeze(1)
237 |                 Ut, dU = self.read_component(MPu, MKu, key_u.view(x.shape[0], self.svd_num_features, -1),
238 |                                              luw_u, rm_u)
239 |                 Vt, dV = self.read_component(MPv, MKv, key_v.view(x.shape[0], self.svd_num_features, -1),
240 |                                              luw_v, rm_v)
241 |                 St, dS = self.read_component(MPs, MKs, key_s.view(x.shape[0], self.svd_num_features, -1),
242 |                                              luw_s, rm_s)
243 | 
244 |             U.append(Ut)
245 |             V.append(Vt)
246 |             S.append(St)
247 | 
248 |             dUs.append(dU)
249 |             dVs.append(dV)
250 |             dSs.append(dS)
251 | 
252 |             dU = torch.cat(dUs, dim=1)
253 |             dV = torch.cat(dVs, dim=1)
254 |             dS = torch.cat(dSs, dim=1)
255 | 
256 |             if self.top_lu>0:
257 |                 max_useu, _ = torch.max(dU, dim=1)
258 |                 upperu, _ = torch.max(max_useu, dim=-1)
259 |                 luw_u = 1 -max_useu
260 |                 luw_u_sort, _ = luw_u.sort(dim=-1, descending=True)
261 |                 th = luw_u_sort[:,self.top_lu].unsqueeze(1)
262 |                 luw_u = (luw_u*(luw_u>th).float()).unsqueeze(1)
263 |                 max_usev, _ = torch.max(dV, dim=1)
264 |                 upperv, _ = torch.max(max_usev, dim=-1)
265 |                 luw_v = 1-max_usev
266 |                 luw_v_sort, _ = luw_v.sort(dim=-1, descending=True)
267 |                 th = luw_v_sort[:, self.top_lu].unsqueeze(1)
268 |                 luw_v = (luw_v*(luw_v > th).float()).unsqueeze(1)
269 |                 max_uses, _ = torch.max(dS, dim=1)
270 |                 uppers, _ = torch.max(max_uses, dim=-1)
271 |                 luw_s = 1 - max_uses
272 |                 luw_s_sort, _ = luw_s.sort(dim=-1, descending=True)
273 |                 th = luw_s_sort[:, self.top_lu].unsqueeze(1)
274 |                 luw_s = (luw_s*(luw_s > th).float()).unsqueeze(1)
275 | 
276 | 
277 | 
278 |         U = torch.cat(U, dim=1)
279 |         V = torch.cat(V, dim=1)
280 |         S = torch.cat(S, dim=1)
281 | 
282 | 
283 | 
284 |         self.record_Ua.append(dU)
285 |         self.record_Va.append(dV)
286 |         self.record_Sa.append(dS)
287 | 
288 | 
289 |         S = F.softplus(S)
290 |         S = torch.cumsum(S, dim=1).squeeze(-1)
291 |         S = torch.flip(S, dims=[1])
292 |         S = torch.diag_embed(S, offset=0, dim1=-2, dim2=-1)
293 |         W = self.composeSVD(U, V, S)
294 |         rs = F.sigmoid(rs)
295 |         return W, rs, S[:, -1, -1]
296 | 
297 | 
298 |     def composeSVD(self, U, V, S):
299 |         US = torch.matmul(U.permute(0, 2, 1), S)
300 |         USV = torch.matmul(US, V)
301 |         return USV
302 | 
303 |     def forward(self, x, res_weight=None):
304 |         self.updateMK()
305 |         self.init_seq()
306 |         if  self.has_res_w == "y":
307 |             res_weight = self.res_weight
308 | 
309 |         if self.rnn_step == 0:
310 |             U = self.read_Us(x)
311 |             V = self.read_Vs(x)
312 |             S = self.read_Ss(x)
313 |             W = self.composeSVD(U, V, S)
314 |             rs = F.sigmoid(self.res_s(x))
315 |             s = S[:, 0, 0]
316 |         else:
317 |             W, rs, s = self.readPM_recurrent(x)
318 | 
319 | 
320 |         if self.has_res_w == "y" or res_weight is not None:
321 |             a = s.unsqueeze(1).unsqueeze(2) * rs.unsqueeze(2)
322 |             W = W + a*res_weight.repeat(x.shape[0], 1, 1)
323 | 
324 |         y = torch.matmul(x.unsqueeze(1), W).squeeze(1)
325 |         if self.has_bias:
326 |             y = y + self.bias
327 |         return y


--------------------------------------------------------------------------------
/cl/agents/regularization.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import random
  3 | from .default import NormalNN
  4 | 
  5 | 
  6 | class L2(NormalNN):
  7 |     """
  8 |     @article{kirkpatrick2017overcoming,
  9 |         title={Overcoming catastrophic forgetting in neural networks},
 10 |         author={Kirkpatrick, James and Pascanu, Razvan and Rabinowitz, Neil and Veness, Joel and Desjardins, Guillaume and Rusu, Andrei A and Milan, Kieran and Quan, John and Ramalho, Tiago and Grabska-Barwinska, Agnieszka and others},
 11 |         journal={Proceedings of the national academy of sciences},
 12 |         year={2017},
 13 |         url={https://arxiv.org/abs/1612.00796}
 14 |     }
 15 |     """
 16 |     def __init__(self, agent_config):
 17 |         super(L2, self).__init__(agent_config)
 18 |         self.params = {n: p for n, p in self.model.named_parameters() if p.requires_grad}  # For convenience
 19 |         self.regularization_terms = {}
 20 |         self.task_count = 0
 21 |         self.online_reg = True  # True: There will be only one importance matrix and previous model parameters
 22 |                                 # False: Each task has its own importance matrix and model parameters
 23 | 
 24 |     def calculate_importance(self, dataloader):
 25 |         # Use an identity importance so it is an L2 regularization.
 26 |         importance = {}
 27 |         for n, p in self.params.items():
 28 |             importance[n] = p.clone().detach().fill_(1)  # Identity
 29 |         return importance
 30 | 
 31 |     def learn_batch(self, train_loader, val_loader=None):
 32 | 
 33 |         self.log('#reg_term:', len(self.regularization_terms))
 34 | 
 35 |         # 1.Learn the parameters for current task
 36 |         super(L2, self).learn_batch(train_loader, val_loader)
 37 | 
 38 |         # 2.Backup the weight of current task
 39 |         task_param = {}
 40 |         for n, p in self.params.items():
 41 |             task_param[n] = p.clone().detach()
 42 | 
 43 |         # 3.Calculate the importance of weights for current task
 44 |         importance = self.calculate_importance(train_loader)
 45 | 
 46 |         # Save the weight and importance of weights of current task
 47 |         self.task_count += 1
 48 |         if self.online_reg and len(self.regularization_terms)>0:
 49 |             # Always use only one slot in self.regularization_terms
 50 |             self.regularization_terms[1] = {'importance':importance, 'task_param':task_param}
 51 |         else:
 52 |             # Use a new slot to store the task-specific information
 53 |             self.regularization_terms[self.task_count] = {'importance':importance, 'task_param':task_param}
 54 | 
 55 |     def criterion(self, inputs, targets, tasks, regularization=True, **kwargs):
 56 |         loss = super(L2, self).criterion(inputs, targets, tasks, **kwargs)
 57 | 
 58 |         if regularization and len(self.regularization_terms)>0:
 59 |             # Calculate the reg_loss only when the regularization_terms exists
 60 |             reg_loss = 0
 61 |             for i,reg_term in self.regularization_terms.items():
 62 |                 task_reg_loss = 0
 63 |                 importance = reg_term['importance']
 64 |                 task_param = reg_term['task_param']
 65 |                 for n, p in self.params.items():
 66 |                     task_reg_loss += (importance[n] * (p - task_param[n]) ** 2).sum()
 67 |                 reg_loss += task_reg_loss
 68 |             loss += self.config['reg_coef'] * reg_loss
 69 |             loss = loss + self.model.get_ploss()
 70 |         return loss
 71 | 
 72 | 
 73 | class EWC(L2):
 74 |     """
 75 |     @article{kirkpatrick2017overcoming,
 76 |         title={Overcoming catastrophic forgetting in neural networks},
 77 |         author={Kirkpatrick, James and Pascanu, Razvan and Rabinowitz, Neil and Veness, Joel and Desjardins, Guillaume and Rusu, Andrei A and Milan, Kieran and Quan, John and Ramalho, Tiago and Grabska-Barwinska, Agnieszka and others},
 78 |         journal={Proceedings of the national academy of sciences},
 79 |         year={2017},
 80 |         url={https://arxiv.org/abs/1612.00796}
 81 |     }
 82 |     """
 83 | 
 84 |     def __init__(self, agent_config):
 85 |         super(EWC, self).__init__(agent_config)
 86 |         self.online_reg = False
 87 |         self.n_fisher_sample = None
 88 |         self.empFI = False
 89 | 
 90 |     def calculate_importance(self, dataloader):
 91 |         # Update the diag fisher information
 92 |         # There are several ways to estimate the F matrix.
 93 |         # We keep the implementation as simple as possible while maintaining a similar performance to the literature.
 94 |         self.log('Computing EWC')
 95 | 
 96 |         # Initialize the importance matrix
 97 |         if self.online_reg and len(self.regularization_terms)>0:
 98 |             importance = self.regularization_terms[1]['importance']
 99 |         else:
100 |             importance = {}
101 |             for n, p in self.params.items():
102 |                 importance[n] = p.clone().detach().fill_(0)  # zero initialized
103 | 
104 |         # Sample a subset (n_fisher_sample) of data to estimate the fisher information (batch_size=1)
105 |         # Otherwise it uses mini-batches for the estimation. This speeds up the process a lot with similar performance.
106 |         if self.n_fisher_sample is not None:
107 |             n_sample = min(self.n_fisher_sample, len(dataloader.dataset))
108 |             self.log('Sample',self.n_fisher_sample,'for estimating the F matrix.')
109 |             rand_ind = random.sample(list(range(len(dataloader.dataset))), n_sample)
110 |             subdata = torch.utils.data.Subset(dataloader.dataset, rand_ind)
111 |             dataloader = torch.utils.data.DataLoader(subdata, shuffle=True, num_workers=2, batch_size=1)
112 | 
113 |         mode = self.training
114 |         self.eval()
115 | 
116 |         # Accumulate the square of gradients
117 |         for i, (input, target, task) in enumerate(dataloader):
118 |             if self.gpu:
119 |                 input = input.cuda()
120 |                 target = target.cuda()
121 | 
122 |             preds = self.forward(input)
123 | 
124 |             # Sample the labels for estimating the gradients
125 |             # For multi-headed model, the batch of data will be from the same task,
126 |             # so we just use task[0] as the task name to fetch corresponding predictions
127 |             # For single-headed model, just use the max of predictions from preds['All']
128 |             task_name = task[0] if self.multihead else 'All'
129 | 
130 |             # The flag self.valid_out_dim is for handling the case of incremental class learning.
131 |             # if self.valid_out_dim is an integer, it means only the first 'self.valid_out_dim' dimensions are used
132 |             # in calculating the loss.
133 |             pred = preds[task_name] if not isinstance(self.valid_out_dim, int) else preds[task_name][:,:self.valid_out_dim]
134 |             ind = pred.max(1)[1].flatten()  # Choose the one with max
135 | 
136 |             # - Alternative ind by multinomial sampling. Its performance is similar. -
137 |             # prob = torch.nn.functional.softmax(preds['All'],dim=1)
138 |             # ind = torch.multinomial(prob,1).flatten()
139 | 
140 |             if self.empFI:  # Use groundtruth label (default is without this)
141 |                 ind = target
142 | 
143 |             loss = self.criterion(preds, ind, task, regularization=False)
144 |             loss = loss + self.model.get_ploss()
145 | 
146 |             self.model.zero_grad()
147 |             loss.backward()
148 |             for n, p in importance.items():
149 |                 if self.params[n].grad is not None:  # Some heads can have no grad if no loss applied on them.
150 |                     p += ((self.params[n].grad ** 2) * len(input) / len(dataloader))
151 | 
152 |         self.train(mode=mode)
153 | 
154 |         return importance
155 | 
156 | 
def EWC_online(agent_config):
    """Build an ``EWC`` agent with ``online_reg`` switched on (single regularization slot)."""
    online_agent = EWC(agent_config)
    online_agent.online_reg = True
    return online_agent
161 | 
162 | 
class SI(L2):
    """
    Importance is accumulated online during training: after every optimizer
    step, ``w`` gains minus the unregularized gradient times the parameter
    change; at the end of a task ``w`` is normalised by the squared total
    parameter change plus a damping factor.

    @inproceedings{zenke2017continual,
        title={Continual Learning Through Synaptic Intelligence},
        author={Zenke, Friedemann and Poole, Ben and Ganguli, Surya},
        booktitle={International Conference on Machine Learning},
        year={2017},
        url={https://arxiv.org/abs/1703.04200}
    }
    """

    def __init__(self, agent_config):
        super().__init__(agent_config)
        self.online_reg = True  # Original SI works in an online updating fashion
        self.damping_factor = 0.1
        # Running per-parameter contribution, reset after every task
        self.w = {name: param.clone().detach().zero_()
                  for name, param in self.params.items()}

        # Only used in the first task (while regularization_terms is empty)
        self.initial_params = {name: param.clone().detach()
                               for name, param in self.params.items()}

    def update_model(self, inputs, targets, tasks):
        """Run one optimizer step and accumulate ``w``; return (detached loss, outputs)."""
        # 1. Snapshot the parameters before the step
        old_params = {name: param.clone().detach()
                      for name, param in self.params.items()}

        # 2. Collect the gradients of the loss without the regularization term
        out = self.forward(inputs)
        loss = self.criterion(out, targets, tasks, regularization=False)
        loss = loss + self.model.get_ploss()

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)  # the graph is reused by the second backward below
        unreg_gradients = {name: param.grad.clone().detach()
                           for name, param in self.params.items()
                           if param.grad is not None}

        # 3. Normal update with regularization
        loss = self.criterion(out, targets, tasks, regularization=True)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # 4. Accumulate the w
        for name, param in self.params.items():
            # In a multi-head network some heads get no gradient because no loss goes through them.
            if name not in unreg_gradients:
                continue
            delta = param.detach() - old_params[name]
            self.w[name] -= unreg_gradients[name] * delta  # w[n] is >=0

        return loss.detach(), out

    """
    # - Alternative simplified implementation with similar performance -
    def update_model(self, inputs, targets, tasks):
        # A wrapper of original update step to include the estimation of w

        # Backup prev param if not done yet
        # The backup only happened at the beginning of a new task
        if len(self.prev_params) == 0:
            for n, p in self.params.items():
                self.prev_params[n] = p.clone().detach()

        # 1.Save current parameters
        old_params = {}
        for n, p in self.params.items():
            old_params[n] = p.clone().detach()

        # 2.Calculate the loss as usual
        loss, out = super(SI, self).update_model(inputs, targets, tasks)

        # 3.Accumulate the w
        for n, p in self.params.items():
            delta = p.detach() - old_params[n]
            if p.grad is not None:  # In multi-head network, some head could have no grad (lazy) since no loss go through it.
                self.w[n] -= p.grad * delta  # w[n] is >=0

        return loss.detach(), out
    """

    def calculate_importance(self, dataloader):
        """Fold the accumulated ``w`` into the importance and reset ``w`` (``dataloader`` is unused)."""
        self.log('Computing SI')
        assert self.online_reg,'SI needs online_reg=True'

        # Pick the accumulator and the reference parameters
        if len(self.regularization_terms) > 0:  # After the first task
            stored = self.regularization_terms[1]
            importance = stored['importance']
            prev_params = stored['task_param']
        else:  # First task
            importance = {name: param.clone().detach().fill_(0)  # zero initialized
                          for name, param in self.params.items()}
            prev_params = self.initial_params

        # Calculate or accumulate the Omega (the importance matrix)
        for name, omega in importance.items():
            delta_theta = self.params[name].detach() - prev_params[name]
            omega += self.w[name] / (delta_theta ** 2 + self.damping_factor)
            self.w[name].zero_()

        return importance
270 | 
271 | 
class MAS(L2):
    """
    Importance is the absolute gradient of the mean squared model output
    (plus ``self.model.get_ploss()``), averaged over the batches of the loader.

    @article{aljundi2017memory,
      title={Memory Aware Synapses: Learning what (not) to forget},
      author={Aljundi, Rahaf and Babiloni, Francesca and Elhoseiny, Mohamed and Rohrbach, Marcus and Tuytelaars, Tinne},
      booktitle={ECCV},
      year={2018},
      url={https://eccv2018.org/openaccess/content_ECCV_2018/papers/Rahaf_Aljundi_Memory_Aware_Synapses_ECCV_2018_paper.pdf}
    }
    """

    def __init__(self, agent_config):
        super(MAS, self).__init__(agent_config)
        self.online_reg = True

    def calculate_importance(self, dataloader):
        """Accumulate output-sensitivity importances over ``dataloader``.

        In online mode the importance already stored in slot 1 is accumulated
        into (in place); otherwise accumulation starts from zeros.

        Returns:
            dict -- parameter name -> importance tensor
        """
        self.log('Computing MAS')

        # Initialize the importance matrix
        if self.online_reg and len(self.regularization_terms)>0:
            importance = self.regularization_terms[1]['importance']
        else:
            importance = {}
            for n, p in self.params.items():
                importance[n] = p.clone().detach().fill_(0)  # zero initialized

        mode = self.training
        self.eval()

        # Accumulate the gradients of L2 loss on the outputs
        for inputs, target, task in dataloader:
            if self.gpu:
                inputs = inputs.cuda()
                target = target.cuda()

            preds = self.forward(inputs)

            # For multi-headed model, the batch of data will be from the same task,
            # so we just use task[0] as the task name to fetch corresponding predictions
            # For single-headed model, just use the predictions from preds['All']
            task_name = task[0] if self.multihead else 'All'

            # The flag self.valid_out_dim is for handling the case of incremental class learning.
            # if self.valid_out_dim is an integer, it means only the first 'self.valid_out_dim' dimensions are used
            # in calculating the loss.
            pred = preds[task_name] if not isinstance(self.valid_out_dim, int) else preds[task_name][:,:self.valid_out_dim]

            # Square out of place. Squaring the model output in place mutates a
            # tensor that the op producing it may have saved for its own
            # backward pass, which autograd rejects at backward time.
            loss = pred.pow(2).mean()
            loss = loss + self.model.get_ploss()

            self.model.zero_grad()
            loss.backward()
            for n, p in importance.items():
                if self.params[n].grad is not None:  # Some heads can have no grad if no loss applied on them.
                    p += (self.params[n].grad.abs() / len(dataloader))

        self.train(mode=mode)

        return importance


--------------------------------------------------------------------------------
/cl/models/nsm.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import math
  3 | import torch.nn as nn
  4 | import torch.nn.functional as F
  5 | 
  6 | δ = 1e-6
  7 | def θ(a, b, dimA=2, dimB=2, normBy=2):
  8 |     """Batchwise Cosine distance
  9 | 
 10 |     Cosine distance
 11 | 
 12 |     Arguments:
 13 |         a {Tensor} -- A 3D Tensor (b * m * w)
 14 |         b {Tensor} -- A 3D Tensor (b * r * w)
 15 | 
 16 |     Keyword Arguments:
 17 |         dimA {number} -- exponent value of the norm for `a` (default: {2})
 18 |         dimB {number} -- exponent value of the norm for `b` (default: {1})
 19 | 
 20 |     Returns:
 21 |         Tensor -- Batchwise cosine distance (b * r * m)
 22 |     """
 23 |     a_norm = torch.norm(a, normBy, dimA, keepdim=True).expand_as(a) + δ
 24 |     b_norm = torch.norm(b, normBy, dimB, keepdim=True).expand_as(b) + δ
 25 | 
 26 |     x = torch.bmm(a, b.transpose(1, 2)).transpose(1, 2) / (
 27 |             torch.bmm(a_norm, b_norm.transpose(1, 2)).transpose(1, 2) + δ)
 28 |     return x
 29 | 
 30 | 
 31 | class ProgammedController(nn.Module):
 32 |     def __init__(self, program_shape, program_interface_size, pkey_dim=10, num_program=2,
 33 |                  bias=False, svd_num_features=8, program_read_mode="linear", att_mode="kv", kc_mode="lk"):
 34 |         super(ProgammedController, self).__init__()
 35 |         self.pkey_dim = pkey_dim
 36 |         self.program_size = num_program
 37 |         self.program_shape = program_shape
 38 |         self.has_bias = bias
 39 |         self.svd_num_features = svd_num_features
 40 |         self.program_read_mode = program_read_mode
 41 |         self.att_mode = att_mode
 42 |         self.kc_mode = kc_mode
 43 | 
 44 |         a=0
 45 |         if bias:
 46 |             a =1
 47 | 
 48 |         self.mprogram_weights = nn.Parameter(torch.zeros(self.program_size,
 49 |                                                          self.pkey_dim +
 50 |                                                          (program_shape[0]+a) * program_shape[1],
 51 |                                                          requires_grad=True))
 52 | 
 53 |         # if att_mode == "kv":
 54 |         self.program_key = nn.Linear(program_interface_size, self.pkey_dim)
 55 |         self.program_strength = nn.Linear(program_interface_size, 1)
 56 |         # elif att_mode == "da":
 57 |         self.address_net = nn.Linear(program_interface_size, self.program_size)
 58 |         # elif  att_mode == "dasvd":
 59 |         self.address_svd_net = nn.Linear(program_interface_size, self.program_size*svd_num_features)
 60 |         self.address_svd_net2 = nn.Linear(program_interface_size, self.program_size)
 61 | 
 62 |         self.program_descriptor = nn.Linear(min(program_shape[0]+a,program_shape[1])*self.svd_num_features//2*3, self.pkey_dim)
 63 |         self.program_descriptor2 = nn.Linear(program_shape[1]*2+1,
 64 |                                              self.pkey_dim)
 65 |         # self.program_descriptor2.weight.require_grad=False
 66 |         # self.program_descriptor2.bias.require_grad=False
 67 | 
 68 |         self.program_key2 = nn.Linear(program_interface_size, self.pkey_dim*(self.svd_num_features+1))
 69 |         # self.program_key2.weight.require_grad = False
 70 |         # self.program_key2.bias.require_grad = False
 71 |         self.code_len_net = nn.Linear(program_interface_size, program_shape[1])
 72 | 
 73 |         self.pad_size = program_shape[0]+a-self.svd_num_features
 74 |         # stdv = 1. / math.sqrt(self.mprogram_weights.size(1))
 75 |         # self.mprogram_weights.data.uniform_(-stdv, stdv)
 76 |         self.relu = nn.ReLU()
 77 |         self.MK = None
 78 |         self.Us = None
 79 |         self.Ss = None
 80 |         self.Vs = None
 81 |         for name, param in self.named_parameters():
 82 |             if "mprogram_weights" not in name:
 83 |                 param.requires_grad = True
 84 | 
 85 |         # self.updateMK()
 86 | 
 87 |     def initialize(self):
 88 |         nn.init.xavier_uniform_(self.mprogram_weights, gain=1.4)
 89 |         nn.init.xavier_uniform_(self.program_key.weight, gain=1.4)
 90 |         nn.init.normal_(self.program_key.bias, std=0.01)
 91 |         nn.init.xavier_uniform_(self.program_strength.weight, gain=1.4)
 92 |         nn.init.normal_(self.program_strength.bias, std=0.01)
 93 |         nn.init.xavier_uniform_(self.address_net.weight, gain=1.4)
 94 |         nn.init.normal_(self.address_net.bias, std=0.01)
 95 |         nn.init.xavier_uniform_(self.address_svd_net.weight, gain=1.4)
 96 |         nn.init.normal_(self.address_svd_net.bias, std=0.01)
 97 |         nn.init.xavier_uniform_(self.address_svd_net2.weight, gain=1.4)
 98 |         nn.init.normal_(self.address_svd_net2.bias, std=0.01)
 99 |         nn.init.xavier_uniform_(self.program_descriptor.weight, gain=1.4)
100 |         nn.init.normal_(self.program_descriptor.bias, std=0.01)
101 |         nn.init.xavier_uniform_(self.program_key2.weight, gain=1.4)
102 |         nn.init.normal_(self.program_key2.bias, std=0.01)
103 |         nn.init.xavier_uniform_(self.code_len_net.weight, gain=1.4)
104 |         nn.init.normal_(self.code_len_net.bias, std=0.01)
105 | 
106 |     def get_mprogram_weight(self, p):
107 |         return self.mprogram_weights[p,self.pkey_dim:self.pkey_dim+self.program_shape[0]*self.program_shape[1]]
108 | 
109 |     def attend_program(self, input):
110 | 
111 |         if self.att_mode == "kv":
112 |             keys = F.tanh(self.program_key(input))
113 |             strength = F.softplus(self.program_strength(input))
114 |             K = keys.unsqueeze(1)[:, :, :self.pkey_dim]
115 | 
116 |             d = θ(self.MK.repeat(keys.shape[0], 1, 1), K)
117 |             content_weights = F.softmax(d * strength.unsqueeze(2), dim=-1)
118 |             return content_weights, keys, strength
119 |         elif self.att_mode == "da":
120 |             a = F.softmax(self.address_net(input), dim=-1)
121 |             return a, None, None
122 |         elif self.att_mode == "dasvd":
123 |             a = self.address_svd_net(input).view(input.shape[0], self.svd_num_features, self.program_size)
124 |             a2 = self.address_svd_net2(input).view(input.shape[0], 1, self.program_size)
125 | 
126 |             a = F.softmax(a, dim=-1)
127 |             a2 = F.softmax(a2, dim=-1)
128 | 
129 |             pad = torch.ones(input.shape[0], self.pad_size, self.program_size)
130 |             if torch.cuda.is_available():
131 |                 pad = pad.cuda()
132 |             a = torch.cat([a, pad*a2], dim=1)
133 |             return a, None, None
134 |         elif self.att_mode == "kvsvd":
135 |             K = F.tanh(self.program_key2(input)).view(input.shape[0], self.svd_num_features+1, self.pkey_dim)
136 | 
137 |             MK = self.MK.permute(1,0,2)
138 | 
139 |             MK = MK.repeat(input.shape[0], 1, 1, 1)
140 |             d = θ(MK.view(-1, self.program_size, self.pkey_dim), K.view(-1,1,self.pkey_dim))
141 |             content_weights = F.softmax(d , dim=-1).view(input.shape[0], self.svd_num_features+1,self.program_size)
142 |             pad = torch.ones(input.shape[0], self.pad_size, self.program_size)
143 |             if torch.cuda.is_available():
144 |                 pad = pad.cuda()
145 |             pad = pad*content_weights[:,-1,:].unsqueeze(1)
146 |             a = torch.cat([content_weights[:,:self.svd_num_features,:], pad], dim=1)
147 |             return a, K[:,0,:], None
148 | 
149 |     def updateMK(self, kc_mode="lk"):
150 |         if "svd" in kc_mode:
151 |             try:
152 |                MP = self.mprogram_weights[:, self.pkey_dim:]
153 |                A = MP.view(MP.shape[0], -1, self.program_shape[1])
154 |                MK = []
155 |                Us = []
156 |                Ss = []
157 |                Vs = []
158 |                for i in range(self.program_size):
159 |                    U,S,V = torch.svd(A[i])
160 |                    if kc_mode=="svds":
161 |                        MK.append(S[:self.pkey_dim])
162 |                    elif kc_mode == "svda":
163 |                        pfeature = torch.cat([U[:self.svd_num_features+1, :],
164 |                                              V[:self.svd_num_features+1, :],
165 |                                              S[:self.svd_num_features+1].unsqueeze(1)], dim=1)
166 |                        MK.append(self.program_descriptor2(pfeature))
167 |                    else:
168 |                        pfeature = torch.cat([U[:self.svd_num_features//2,:].contiguous().view(-1),
169 |                                              V[:self.svd_num_features //2, :].contiguous().view(-1),
170 |                                              S[:self.svd_num_features//2]])
171 |                        MK.append(self.program_descriptor(pfeature))
172 |                    if self.program_read_mode!="linear":
173 |                        Us.append(U.contiguous().view(-1))
174 |                        Ss.append(S)
175 |                        Vs.append(V.contiguous().view(-1))
176 |                self.MK = F.tanh(torch.stack(MK, dim=0))
177 |                if self.program_read_mode != "linear":
178 |                    self.Us = torch.stack(Us, dim=0)
179 |                    self.Ss = torch.stack(Ss, dim=0)
180 |                    self.Vs = torch.stack(Vs, dim=0)
181 | 
182 |             except Exception as e:
183 |                print(f"svd err {e}")
184 |         elif kc_mode == "lk":
185 |             self.MK = F.tanh(self.mprogram_weights[:,:self.pkey_dim])
186 |             if self.program_read_mode!="linear":
187 |                 MP = self.mprogram_weights[:, self.pkey_dim:]
188 |                 A = MP.view(MP.shape[0], -1, self.program_shape[1])
189 |                 MK = []
190 |                 Us = []
191 |                 Ss = []
192 |                 Vs = []
193 |                 for i in range(self.program_size):
194 |                     U, S, V = torch.svd(A[i])
195 |                     Us.append(U.contiguous().view(-1))
196 |                     Ss.append(S)
197 |                     Vs.append(V.contiguous().view(-1))
198 |                 if self.program_read_mode != "linear":
199 |                     self.Us = torch.stack(Us, dim=0)
200 |                     self.Ss = torch.stack(Ss, dim=0)
201 |                     self.Vs = torch.stack(Vs, dim=0)
202 | 
203 | 
204 |     def linear_read(self, MP, weights):
205 |         return  torch.matmul(weights, MP)
206 | 
207 |     def linear_svd_read(self, weights):
208 |         U = torch.matmul(weights, self.Us.repeat(weights.shape[0], 1, 1)).view(weights.shape[0],-1, self.program_shape[1])
209 |         # U = torch.sum(self.Us, dim=0).repeat(weights.shape[0], 1, 1).view(weights.shape[0],-1, self.program_shape[1])
210 |         # V = torch.sum(self.Vs, dim=0).repeat(weights.shape[0], 1, 1).view(weights.shape[0],-1, self.program_shape[1])
211 |         V = torch.matmul(weights, self.Vs.repeat(weights.shape[0], 1, 1)).view(weights.shape[0],-1, self.program_shape[1])
212 | 
213 |         S = torch.matmul(weights, self.Ss.repeat(weights.shape[0], 1, 1))
214 |         S = torch.torch.diag_embed(S, offset=0, dim1=-2, dim2=-1).squeeze(1)
215 |         US = torch.matmul(U, S)
216 | 
217 |         USV = torch.matmul(US, V.permute(0,2,1))
218 | 
219 |         return USV
220 | 
221 |     def linear_svd_read1(self, weights):
222 |         S = torch.matmul(weights, self.Ss.repeat(weights.shape[0], 1, 1))
223 |         S = torch.diag_embed(S, offset=0, dim1=-2, dim2=-1).squeeze(1)
224 | 
225 |         U = torch.sum(self.Us, dim=0).repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1, self.program_shape[1])
226 |         V = torch.sum(self.Vs, dim=0).repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1, self.program_shape[1])
227 |         US = torch.matmul(U, S)
228 |         USV = torch.matmul(US, V.permute(0, 2, 1))
229 | 
230 | 
231 | 
232 |         return USV
233 | 
234 |     def linear_svd_read2(self, weights):
235 |         S = torch.matmul(weights, self.Ss.repeat(weights.shape[0], 1, 1))
236 |         S = torch.torch.diag_embed(S, offset=0, dim1=-2, dim2=-1).squeeze(1)
237 | 
238 |         U = self.Us[0].repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1, self.program_shape[1])
239 |         V = self.Vs[0].repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1, self.program_shape[1])
240 |         US = torch.matmul(U, S)
241 |         USV = torch.matmul(US, V.permute(0, 2, 1))*weights[:,:,0].unsqueeze(2)
242 | 
243 |         for i in range(self.program_size-1):
244 |             U = self.Us[i+1].repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1,self.program_shape[1])
245 |             V = self.Vs[i+1].repeat(weights.shape[0], 1, 1).view(weights.shape[0],-1, self.program_shape[1])
246 |             US = torch.matmul(U, S)
247 |             USV += torch.matmul(US, V.permute(0, 2, 1))*weights[:,:,i+1].unsqueeze(2)
248 | 
249 | 
250 |         return USV
251 | 
252 |     def linear_svd_read3(self, weights):
253 |         S = torch.matmul(weights, self.Ss.repeat(weights.shape[0], 1, 1))
254 |         S = torch.diag_embed(S, offset=0, dim1=-2, dim2=-1).squeeze(1)
255 | 
256 |         U = self.Us[0].repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1, self.program_shape[1])
257 |         # V = self.Vs[0].repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1, self.program_shape[1])
258 |         V = torch.matmul(weights, self.Vs.repeat(weights.shape[0], 1, 1)).view(weights.shape[0],-1, self.program_shape[1])
259 | 
260 |         US = torch.matmul(U, S)
261 |         USV = torch.matmul(US, V.permute(0, 2, 1))
262 | 
263 |         for i in range(self.program_size-1):
264 |             U = self.Us[i+1].repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1,self.program_shape[1])
265 |             V = self.Vs[i+1].repeat(weights.shape[0], 1, 1).view(weights.shape[0],-1, self.program_shape[1])
266 |             US = torch.matmul(U, S)
267 |             USV += torch.matmul(US, V.permute(0, 2, 1))
268 | 
269 | 
270 |         return USV
271 | 
272 | 
273 |     def linear_svd_read_da(self, weights, cg):
274 |         S = self.Ss.repeat(weights.shape[0], 1, 1)#*cg.unsqueeze(1)
275 |         S = torch.diag_embed(S, offset=0, dim1=-2, dim2=-1)
276 | 
277 |         U = (self.Us[0]).repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1, self.program_shape[1])*weights[:,:, 0].unsqueeze(2)
278 |         V = self.Vs[0].repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1, self.program_shape[1])
279 |         US = torch.matmul(U, S[:,0,:,:])
280 |         USV = torch.matmul(US, V.permute(0, 2, 1))
281 | 
282 |         for i in range(self.program_size-1):
283 |             U = (self.Us[i+1]).repeat(weights.shape[0], 1, 1).view(weights.shape[0], -1,self.program_shape[1])*weights[:,:, i+1].unsqueeze(2)
284 |             V = self.Vs[i+1].repeat(weights.shape[0], 1, 1).view(weights.shape[0],-1, self.program_shape[1])
285 |             US = torch.matmul(U, S[:,i+1,:,:])
286 |             USV += torch.matmul(US, V.permute(0, 2, 1))
287 | 
288 |         return USV
289 | 
290 |     def read_program(self, input):
291 |         key_size = self.pkey_dim
292 | 
293 | 
294 |         content_weights, keys, strengths = self.attend_program(input)
295 | 
296 |         # print(input.shape)
297 |         # print(memory.shape)
298 |         # print(keys.shape)
299 |         # MK = F.tanh(memory.repeat(keys.shape[0], 1, 1))[:, :, :key_size]
300 |         # MK = self.getMK().repeat(keys.shape[0], 1, 1)
301 |         if self.has_bias:
302 |             biases = self.mprogram_weights.repeat(input.shape[0], 1, 1)[:, :,-self.program_shape[1]:]
303 | 
304 |         MP = self.mprogram_weights.repeat(input.shape[0], 1, 1)[:, :, key_size:key_size+self.program_shape[0]*self.program_shape[1]]
305 | 
306 | 
307 |         # print(MP.shape)
308 |         # print(content_weights.shape)
309 |         if self.program_read_mode == "linear":
310 |             working_weight = self.linear_read(MP, content_weights)
311 |         if self.program_read_mode == "svd":
312 |             working_weight = self.linear_svd_read(content_weights)
313 |         elif self.program_read_mode == "svd1":
314 |             working_weight = self.linear_svd_read1(content_weights)
315 |         elif self.program_read_mode == "svd2":
316 |             working_weight = self.linear_svd_read2(content_weights)
317 |         elif self.program_read_mode == "svd3":
318 |             working_weight = self.linear_svd_read3(content_weights)
319 |         elif self.program_read_mode == "svdda":
320 |             cg = F.sigmoid(self.code_len_net(input))
321 |             working_weight = self.linear_svd_read_da(content_weights, cg)
322 |             content_weights = content_weights[:,0,:]
323 |         if len(content_weights.shape)==2:
324 |             content_weights = content_weights.unsqueeze(1)
325 |         # instruction = content_weights.view(content_weights.shape[0],self.program_size)[:,0].unsqueeze(1) * MP[:,0,:]
326 |         # for i in range(self.program_size-1):
327 |         #     instruction*= content_weights.view(content_weights.shape[0],self.program_size)[:,i+1].unsqueeze(1)*MP[:,i+1,:]
328 | 
329 |         o = (torch.matmul(input.unsqueeze(1), working_weight.view(input.shape[0], self.program_shape[0], self.program_shape[1]))).squeeze(1)
330 |         if self.has_bias:
331 |             bias = torch.matmul(content_weights, biases).squeeze(1)
332 |             o = o+bias
333 |         program_scales = []
334 |         for p in range(self.program_size):
335 |             s = torch.mean(torch.exp(-torch.abs(working_weight.view(input.shape[0],-1)-MP[:,p]))
336 |                            , dim=0)
337 |             program_scales.append(s)
338 |         return o
339 | 
340 |     def forward(self, x):
341 |         self.updateMK()
342 |         return self.read_program(x)
343 | 


--------------------------------------------------------------------------------
/cl/outputs/split_MNIST_incremental_domain/Offline.log:
--------------------------------------------------------------------------------
  1 | split_boundaries: [0, 2, 4, 6, 8, 10]
  2 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]}
  3 | FF mlp
  4 | MLP(
  5 |   (linear): Sequential(
  6 |     (0): Linear(in_features=1024, out_features=400, bias=False)
  7 |     (1): ReLU(inplace=True)
  8 |     (2): Linear(in_features=400, out_features=400, bias=False)
  9 |     (3): ReLU(inplace=True)
 10 |   )
 11 |   (last): ModuleDict(
 12 |     (All): Linear(in_features=400, out_features=2, bias=True)
 13 |   )
 14 | )
 15 | #parameter of model:  570402
 16 | no learnable params:  570402
 17 | Task order: ['1', '2', '3', '4', '5']
 18 | Epoch:0
 19 | LR: 0.001
 20 | Itr		Time		  Data		  Loss		Acc
 21 | [0/469]	0.5880 (0.5880)	0.0997 (0.0997)	0.690 (0.690)	48.44 (48.44)
 22 | [100/469]	0.0037 (0.0177)	0.0015 (0.0101)	0.051 (0.194)	97.66 (92.37)
 23 | [200/469]	0.0023 (0.0147)	0.0003 (0.0095)	0.066 (0.145)	97.66 (94.50)
 24 | [300/469]	0.0034 (0.0136)	0.0003 (0.0093)	0.049 (0.123)	98.44 (95.37)
 25 | [400/469]	0.0024 (0.0131)	0.0003 (0.0091)	0.102 (0.109)	96.09 (95.95)
 26 | [468/469]	0.0023 (0.0130)	0.0002 (0.0091)	0.067 (0.103)	95.83 (96.22)
 27 |  * Train Acc 96.218
 28 |  * Val Acc 97.960, Total time 1.00
 29 | Epoch:1
 30 | LR: 0.001
 31 | Itr		Time		  Data		  Loss		Acc
 32 | [0/469]	0.1212 (0.1212)	0.1178 (0.1178)	0.039 (0.039)	98.44 (98.44)
 33 | [100/469]	0.0091 (0.0125)	0.0065 (0.0099)	0.011 (0.048)	100.00 (98.37)
 34 | [200/469]	0.0264 (0.0124)	0.0239 (0.0094)	0.086 (0.049)	97.66 (98.31)
 35 | [300/469]	0.0057 (0.0120)	0.0034 (0.0091)	0.043 (0.050)	98.44 (98.25)
 36 | [400/469]	0.0355 (0.0122)	0.0287 (0.0090)	0.023 (0.049)	99.22 (98.30)
 37 | [468/469]	0.0031 (0.0120)	0.0010 (0.0089)	0.007 (0.050)	100.00 (98.26)
 38 |  * Train Acc 98.260
 39 |  * Val Acc 98.130, Total time 1.00
 40 | Epoch:2
 41 | LR: 0.001
 42 | Itr		Time		  Data		  Loss		Acc
 43 | [0/469]	0.1228 (0.1228)	0.1182 (0.1182)	0.026 (0.026)	99.22 (99.22)
 44 | [100/469]	0.0224 (0.0126)	0.0201 (0.0100)	0.049 (0.035)	97.66 (98.69)
 45 | [200/469]	0.0099 (0.0121)	0.0075 (0.0094)	0.027 (0.034)	99.22 (98.80)
 46 | [300/469]	0.0025 (0.0119)	0.0003 (0.0092)	0.008 (0.035)	100.00 (98.77)
 47 | [400/469]	0.0067 (0.0120)	0.0025 (0.0091)	0.016 (0.036)	99.22 (98.73)
 48 | [468/469]	0.0021 (0.0119)	0.0002 (0.0091)	0.004 (0.036)	100.00 (98.73)
 49 |  * Train Acc 98.732
 50 |  * Val Acc 98.600, Total time 1.03
 51 | Epoch:3
 52 | LR: 0.001
 53 | Itr		Time		  Data		  Loss		Acc
 54 | [0/469]	0.1898 (0.1898)	0.1863 (0.1863)	0.016 (0.016)	100.00 (100.00)
 55 | [100/469]	0.0040 (0.0134)	0.0015 (0.0108)	0.014 (0.025)	100.00 (99.13)
 56 | [200/469]	0.0044 (0.0123)	0.0018 (0.0094)	0.015 (0.027)	99.22 (99.08)
 57 | [300/469]	0.0081 (0.0120)	0.0057 (0.0092)	0.014 (0.026)	99.22 (99.10)
 58 | [400/469]	0.0022 (0.0121)	0.0003 (0.0092)	0.018 (0.026)	99.22 (99.06)
 59 | [468/469]	0.0022 (0.0120)	0.0002 (0.0092)	0.011 (0.028)	100.00 (99.02)
 60 |  * Train Acc 99.020
 61 |  * Val Acc 98.840, Total time 1.09
 62 |  * Val Acc 98.840, Total time 1.01
 63 | OrderedDict([('All', {'All': 98.84})])
 64 | Task All average acc: 98.84
 65 | ===Summary of experiment repeats: 1 / 10 ===
 66 | The regularization coefficient: 0.0
 67 | The last avg acc of all repeats: [98.84  0.    0.    0.    0.    0.    0.    0.    0.    0.  ]
 68 | mean: 9.884 std: 29.652000000000005
 69 | split_boundaries: [0, 2, 4, 6, 8, 10]
 70 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]}
 71 | FF mlp
 72 | MLP(
 73 |   (linear): Sequential(
 74 |     (0): Linear(in_features=1024, out_features=400, bias=False)
 75 |     (1): ReLU(inplace=True)
 76 |     (2): Linear(in_features=400, out_features=400, bias=False)
 77 |     (3): ReLU(inplace=True)
 78 |   )
 79 |   (last): ModuleDict(
 80 |     (All): Linear(in_features=400, out_features=2, bias=True)
 81 |   )
 82 | )
 83 | #parameter of model:  570402
 84 | no learnable params:  570402
 85 | Task order: ['1', '2', '3', '4', '5']
 86 | Epoch:0
 87 | LR: 0.001
 88 | Itr		Time		  Data		  Loss		Acc
 89 | [0/469]	0.1353 (0.1353)	0.1312 (0.1312)	0.689 (0.689)	50.78 (50.78)
 90 | [100/469]	0.0250 (0.0128)	0.0224 (0.0103)	0.088 (0.202)	96.09 (91.53)
 91 | [200/469]	0.0153 (0.0121)	0.0128 (0.0094)	0.061 (0.146)	97.66 (94.15)
 92 | [300/469]	0.0040 (0.0119)	0.0015 (0.0092)	0.083 (0.122)	97.66 (95.22)
 93 | [400/469]	0.0024 (0.0120)	0.0003 (0.0093)	0.032 (0.111)	99.22 (95.78)
 94 | [468/469]	0.0023 (0.0119)	0.0002 (0.0093)	0.028 (0.104)	98.96 (96.06)
 95 |  * Train Acc 96.060
 96 |  * Val Acc 97.890, Total time 1.00
 97 | Epoch:1
 98 | LR: 0.001
 99 | Itr		Time		  Data		  Loss		Acc
100 | [0/469]	0.1581 (0.1581)	0.1544 (0.1544)	0.050 (0.050)	98.44 (98.44)
101 | [100/469]	0.0225 (0.0135)	0.0154 (0.0101)	0.019 (0.052)	99.22 (98.15)
102 | [200/469]	0.0069 (0.0127)	0.0044 (0.0094)	0.031 (0.050)	99.22 (98.18)
103 | [300/469]	0.0102 (0.0123)	0.0081 (0.0091)	0.033 (0.051)	99.22 (98.19)
104 | [400/469]	0.0031 (0.0120)	0.0007 (0.0090)	0.055 (0.050)	97.66 (98.24)
105 | [468/469]	0.0021 (0.0119)	0.0002 (0.0089)	0.037 (0.049)	98.96 (98.28)
106 |  * Train Acc 98.285
107 |  * Val Acc 98.670, Total time 0.99
108 | Epoch:2
109 | LR: 0.001
110 | Itr		Time		  Data		  Loss		Acc
111 | [0/469]	0.2354 (0.2354)	0.2297 (0.2297)	0.027 (0.027)	99.22 (99.22)
112 | [100/469]	0.0037 (0.0143)	0.0014 (0.0114)	0.087 (0.032)	96.88 (98.85)
113 | [200/469]	0.0024 (0.0133)	0.0003 (0.0105)	0.018 (0.036)	100.00 (98.71)
114 | [300/469]	0.0025 (0.0126)	0.0002 (0.0100)	0.056 (0.036)	98.44 (98.73)
115 | [400/469]	0.0268 (0.0123)	0.0244 (0.0098)	0.020 (0.036)	99.22 (98.73)
116 | [468/469]	0.0023 (0.0122)	0.0002 (0.0096)	0.021 (0.036)	98.96 (98.74)
117 |  * Train Acc 98.738
118 |  * Val Acc 98.540, Total time 1.08
119 | Epoch:3
120 | LR: 0.001
121 | Itr		Time		  Data		  Loss		Acc
122 | [0/469]	0.1093 (0.1093)	0.1058 (0.1058)	0.030 (0.030)	99.22 (99.22)
123 | [100/469]	0.0157 (0.0127)	0.0132 (0.0101)	0.029 (0.029)	99.22 (98.95)
124 | [200/469]	0.0104 (0.0120)	0.0081 (0.0094)	0.039 (0.028)	99.22 (99.02)
125 | [300/469]	0.0044 (0.0120)	0.0021 (0.0093)	0.083 (0.028)	96.88 (99.03)
126 | [400/469]	0.0085 (0.0118)	0.0058 (0.0091)	0.047 (0.028)	97.66 (99.03)
127 | [468/469]	0.0023 (0.0118)	0.0002 (0.0091)	0.043 (0.029)	96.88 (98.99)
128 |  * Train Acc 98.988
129 |  * Val Acc 98.660, Total time 1.01
130 |  * Val Acc 98.660, Total time 0.99
131 | OrderedDict([('All', {'All': 98.66})])
132 | Task All average acc: 98.66
133 | ===Summary of experiment repeats: 2 / 10 ===
134 | The regularization coefficient: 0.0
135 | The last avg acc of all repeats: [98.84 98.66  0.    0.    0.    0.    0.    0.    0.    0.  ]
136 | mean: 19.75 std: 39.50002050632379
137 | split_boundaries: [0, 2, 4, 6, 8, 10]
138 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]}
139 | FF mlp
140 | MLP(
141 |   (linear): Sequential(
142 |     (0): Linear(in_features=1024, out_features=400, bias=False)
143 |     (1): ReLU(inplace=True)
144 |     (2): Linear(in_features=400, out_features=400, bias=False)
145 |     (3): ReLU(inplace=True)
146 |   )
147 |   (last): ModuleDict(
148 |     (All): Linear(in_features=400, out_features=2, bias=True)
149 |   )
150 | )
151 | #parameter of model:  570402
152 | no learnable params:  570402
153 | Task order: ['1', '2', '3', '4', '5']
154 | Epoch:0
155 | LR: 0.001
156 | Itr		Time		  Data		  Loss		Acc
157 | [0/469]	0.1701 (0.1701)	0.1658 (0.1658)	0.696 (0.696)	45.31 (45.31)
158 | [100/469]	0.0276 (0.0140)	0.0254 (0.0113)	0.136 (0.195)	95.31 (92.06)
159 | [200/469]	0.0038 (0.0131)	0.0014 (0.0103)	0.094 (0.146)	96.09 (94.20)
160 | [300/469]	0.0308 (0.0125)	0.0284 (0.0099)	0.047 (0.122)	99.22 (95.25)
161 | [400/469]	0.0251 (0.0123)	0.0227 (0.0097)	0.045 (0.108)	98.44 (95.84)
162 | [468/469]	0.0024 (0.0121)	0.0002 (0.0095)	0.123 (0.102)	96.88 (96.11)
163 |  * Train Acc 96.110
164 |  * Val Acc 97.900, Total time 0.99
165 | Epoch:1
166 | LR: 0.001
167 | Itr		Time		  Data		  Loss		Acc
168 | [0/469]	0.1156 (0.1156)	0.1122 (0.1122)	0.036 (0.036)	99.22 (99.22)
169 | [100/469]	0.0026 (0.0128)	0.0004 (0.0100)	0.053 (0.053)	97.66 (98.23)
170 | [200/469]	0.0050 (0.0124)	0.0021 (0.0094)	0.120 (0.055)	97.66 (98.08)
171 | [300/469]	0.0133 (0.0121)	0.0100 (0.0092)	0.069 (0.053)	96.88 (98.15)
172 | [400/469]	0.0302 (0.0120)	0.0277 (0.0092)	0.072 (0.051)	97.66 (98.24)
173 | [468/469]	0.0162 (0.0120)	0.0140 (0.0091)	0.026 (0.050)	98.96 (98.26)
174 |  * Train Acc 98.258
175 |  * Val Acc 98.580, Total time 1.01
176 | Epoch:2
177 | LR: 0.001
178 | Itr		Time		  Data		  Loss		Acc
179 | [0/469]	0.1011 (0.1011)	0.0981 (0.0981)	0.048 (0.048)	97.66 (97.66)
180 | [100/469]	0.0123 (0.0125)	0.0100 (0.0100)	0.061 (0.034)	96.88 (98.82)
181 | [200/469]	0.0061 (0.0120)	0.0037 (0.0094)	0.015 (0.036)	99.22 (98.75)
182 | [300/469]	0.0394 (0.0123)	0.0368 (0.0096)	0.037 (0.039)	98.44 (98.65)
183 | [400/469]	0.0029 (0.0120)	0.0003 (0.0093)	0.048 (0.038)	97.66 (98.69)
184 | [468/469]	0.0130 (0.0120)	0.0109 (0.0093)	0.051 (0.037)	98.96 (98.70)
185 |  * Train Acc 98.700
186 |  * Val Acc 98.570, Total time 0.99
187 | Epoch:3
188 | LR: 0.001
189 | Itr		Time		  Data		  Loss		Acc
190 | [0/469]	0.1563 (0.1563)	0.1494 (0.1494)	0.046 (0.046)	98.44 (98.44)
191 | [100/469]	0.0027 (0.0128)	0.0003 (0.0102)	0.075 (0.030)	96.88 (98.92)
192 | [200/469]	0.0027 (0.0127)	0.0003 (0.0097)	0.044 (0.030)	98.44 (98.92)
193 | [300/469]	0.0022 (0.0122)	0.0003 (0.0094)	0.008 (0.029)	100.00 (98.94)
194 | [400/469]	0.0204 (0.0120)	0.0180 (0.0092)	0.032 (0.029)	98.44 (98.95)
195 | [468/469]	0.0186 (0.0120)	0.0159 (0.0091)	0.030 (0.029)	98.96 (98.92)
196 |  * Train Acc 98.917
197 |  * Val Acc 98.850, Total time 1.09
198 |  * Val Acc 98.850, Total time 1.05
199 | OrderedDict([('All', {'All': 98.85})])
200 | Task All average acc: 98.85
201 | ===Summary of experiment repeats: 3 / 10 ===
202 | The regularization coefficient: 0.0
203 | The last avg acc of all repeats: [98.84 98.66 98.85  0.    0.    0.    0.    0.    0.    0.  ]
204 | mean: 29.635 std: 45.2682354968691
205 | split_boundaries: [0, 2, 4, 6, 8, 10]
206 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]}
207 | FF mlp
208 | MLP(
209 |   (linear): Sequential(
210 |     (0): Linear(in_features=1024, out_features=400, bias=False)
211 |     (1): ReLU(inplace=True)
212 |     (2): Linear(in_features=400, out_features=400, bias=False)
213 |     (3): ReLU(inplace=True)
214 |   )
215 |   (last): ModuleDict(
216 |     (All): Linear(in_features=400, out_features=2, bias=True)
217 |   )
218 | )
219 | #parameter of model:  570402
220 | no learnable params:  570402
221 | Task order: ['1', '2', '3', '4', '5']
222 | Epoch:0
223 | LR: 0.001
224 | Itr		Time		  Data		  Loss		Acc
225 | [0/469]	0.1238 (0.1238)	0.1161 (0.1161)	0.699 (0.699)	50.00 (50.00)
226 | [100/469]	0.0051 (0.0124)	0.0022 (0.0098)	0.121 (0.200)	95.31 (91.48)
227 | [200/469]	0.0286 (0.0125)	0.0264 (0.0098)	0.072 (0.149)	96.88 (93.93)
228 | [300/469]	0.0026 (0.0122)	0.0003 (0.0096)	0.032 (0.127)	98.44 (94.95)
229 | [400/469]	0.0051 (0.0126)	0.0026 (0.0096)	0.039 (0.114)	98.44 (95.60)
230 | [468/469]	0.0019 (0.0125)	0.0001 (0.0096)	0.051 (0.106)	98.96 (95.94)
231 |  * Train Acc 95.940
232 |  * Val Acc 97.700, Total time 1.01
233 | Epoch:1
234 | LR: 0.001
235 | Itr		Time		  Data		  Loss		Acc
236 | [0/469]	0.1316 (0.1316)	0.1278 (0.1278)	0.068 (0.068)	96.88 (96.88)
237 | [100/469]	0.0245 (0.0126)	0.0202 (0.0100)	0.099 (0.053)	97.66 (98.14)
238 | [200/469]	0.0027 (0.0121)	0.0003 (0.0096)	0.028 (0.052)	100.00 (98.19)
239 | [300/469]	0.0025 (0.0118)	0.0003 (0.0092)	0.029 (0.053)	99.22 (98.16)
240 | [400/469]	0.0054 (0.0117)	0.0028 (0.0092)	0.042 (0.051)	99.22 (98.23)
241 | [468/469]	0.0073 (0.0118)	0.0015 (0.0090)	0.044 (0.050)	98.96 (98.28)
242 |  * Train Acc 98.275
243 |  * Val Acc 98.350, Total time 1.07
244 | Epoch:2
245 | LR: 0.001
246 | Itr		Time		  Data		  Loss		Acc
247 | [0/469]	0.1341 (0.1341)	0.1304 (0.1304)	0.051 (0.051)	97.66 (97.66)
248 | [100/469]	0.0250 (0.0128)	0.0223 (0.0102)	0.064 (0.033)	97.66 (98.84)
249 | [200/469]	0.0022 (0.0123)	0.0003 (0.0096)	0.149 (0.035)	96.09 (98.75)
250 | [300/469]	0.0093 (0.0120)	0.0053 (0.0093)	0.016 (0.035)	100.00 (98.73)
251 | [400/469]	0.0278 (0.0119)	0.0254 (0.0093)	0.025 (0.035)	99.22 (98.75)
252 | [468/469]	0.0033 (0.0118)	0.0011 (0.0092)	0.031 (0.035)	98.96 (98.74)
253 |  * Train Acc 98.738
254 |  * Val Acc 98.810, Total time 1.07
255 | Epoch:3
256 | LR: 0.001
257 | Itr		Time		  Data		  Loss		Acc
258 | [0/469]	0.1699 (0.1699)	0.1663 (0.1663)	0.013 (0.013)	100.00 (100.00)
259 | [100/469]	0.0025 (0.0128)	0.0003 (0.0102)	0.004 (0.026)	100.00 (99.17)
260 | [200/469]	0.0273 (0.0122)	0.0211 (0.0095)	0.004 (0.025)	100.00 (99.17)
261 | [300/469]	0.0025 (0.0120)	0.0003 (0.0093)	0.034 (0.026)	99.22 (99.10)
262 | [400/469]	0.0075 (0.0118)	0.0003 (0.0092)	0.015 (0.027)	99.22 (99.08)
263 | [468/469]	0.0108 (0.0118)	0.0079 (0.0092)	0.016 (0.028)	100.00 (99.04)
264 |  * Train Acc 99.043
265 |  * Val Acc 98.670, Total time 1.06
266 |  * Val Acc 98.670, Total time 1.00
267 | OrderedDict([('All', {'All': 98.67})])
268 | Task All average acc: 98.67
269 | ===Summary of experiment repeats: 4 / 10 ===
270 | The regularization coefficient: 0.0
271 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67  0.    0.    0.    0.    0.    0.  ]
272 | mean: 39.501999999999995 std: 48.37990549804743
273 | split_boundaries: [0, 2, 4, 6, 8, 10]
274 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]}
275 | FF mlp
276 | MLP(
277 |   (linear): Sequential(
278 |     (0): Linear(in_features=1024, out_features=400, bias=False)
279 |     (1): ReLU(inplace=True)
280 |     (2): Linear(in_features=400, out_features=400, bias=False)
281 |     (3): ReLU(inplace=True)
282 |   )
283 |   (last): ModuleDict(
284 |     (All): Linear(in_features=400, out_features=2, bias=True)
285 |   )
286 | )
287 | #parameter of model:  570402
288 | no learnable params:  570402
289 | Task order: ['1', '2', '3', '4', '5']
290 | Epoch:0
291 | LR: 0.001
292 | Itr		Time		  Data		  Loss		Acc
293 | [0/469]	0.1477 (0.1477)	0.1437 (0.1437)	0.697 (0.697)	54.69 (54.69)
294 | [100/469]	0.0022 (0.0136)	0.0003 (0.0109)	0.110 (0.192)	95.31 (92.01)
295 | [200/469]	0.0191 (0.0129)	0.0167 (0.0101)	0.077 (0.142)	96.88 (94.36)
296 | [300/469]	0.0025 (0.0124)	0.0003 (0.0098)	0.084 (0.119)	96.88 (95.36)
297 | [400/469]	0.0026 (0.0122)	0.0003 (0.0096)	0.079 (0.109)	98.44 (95.85)
298 | [468/469]	0.0021 (0.0121)	0.0002 (0.0095)	0.096 (0.102)	96.88 (96.16)
299 |  * Train Acc 96.155
300 |  * Val Acc 98.090, Total time 1.03
301 | Epoch:1
302 | LR: 0.001
303 | Itr		Time		  Data		  Loss		Acc
304 | [0/469]	0.2470 (0.2470)	0.2407 (0.2407)	0.075 (0.075)	97.66 (97.66)
305 | [100/469]	0.0025 (0.0139)	0.0003 (0.0110)	0.052 (0.050)	97.66 (98.14)
306 | [200/469]	0.0043 (0.0126)	0.0021 (0.0099)	0.066 (0.050)	97.66 (98.19)
307 | [300/469]	0.0072 (0.0126)	0.0051 (0.0096)	0.175 (0.050)	96.09 (98.24)
308 | [400/469]	0.0202 (0.0123)	0.0178 (0.0094)	0.096 (0.049)	99.22 (98.32)
309 | [468/469]	0.0027 (0.0121)	0.0008 (0.0092)	0.006 (0.048)	100.00 (98.35)
310 |  * Train Acc 98.347
311 |  * Val Acc 98.490, Total time 1.01
312 | Epoch:2
313 | LR: 0.001
314 | Itr		Time		  Data		  Loss		Acc
315 | [0/469]	0.1382 (0.1382)	0.1342 (0.1342)	0.030 (0.030)	98.44 (98.44)
316 | [100/469]	0.0049 (0.0137)	0.0024 (0.0110)	0.015 (0.031)	99.22 (98.94)
317 | [200/469]	0.0248 (0.0128)	0.0223 (0.0101)	0.069 (0.034)	95.31 (98.81)
318 | [300/469]	0.0024 (0.0126)	0.0003 (0.0100)	0.026 (0.034)	98.44 (98.76)
319 | [400/469]	0.0043 (0.0123)	0.0019 (0.0097)	0.018 (0.035)	100.00 (98.76)
320 | [468/469]	0.0019 (0.0122)	0.0001 (0.0096)	0.013 (0.036)	100.00 (98.70)
321 |  * Train Acc 98.702
322 |  * Val Acc 98.430, Total time 1.02
323 | Epoch:3
324 | LR: 0.001
325 | Itr		Time		  Data		  Loss		Acc
326 | [0/469]	0.1211 (0.1211)	0.1176 (0.1176)	0.010 (0.010)	100.00 (100.00)
327 | [100/469]	0.0235 (0.0126)	0.0212 (0.0100)	0.034 (0.025)	98.44 (99.09)
328 | [200/469]	0.0231 (0.0125)	0.0207 (0.0096)	0.057 (0.027)	98.44 (99.03)
329 | [300/469]	0.0120 (0.0122)	0.0096 (0.0093)	0.063 (0.027)	96.88 (99.05)
330 | [400/469]	0.0078 (0.0120)	0.0015 (0.0090)	0.019 (0.028)	99.22 (99.01)
331 | [468/469]	0.0211 (0.0121)	0.0189 (0.0091)	0.008 (0.028)	100.00 (99.01)
332 |  * Train Acc 99.013
333 |  * Val Acc 98.780, Total time 0.98
334 |  * Val Acc 98.780, Total time 1.04
335 | OrderedDict([('All', {'All': 98.78})])
336 | Task All average acc: 98.78
337 | ===Summary of experiment repeats: 5 / 10 ===
338 | The regularization coefficient: 0.0
339 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78  0.    0.    0.    0.    0.  ]
340 | mean: 49.379999999999995 std: 49.38003341432648
341 | split_boundaries: [0, 2, 4, 6, 8, 10]
342 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]}
343 | FF mlp
344 | MLP(
345 |   (linear): Sequential(
346 |     (0): Linear(in_features=1024, out_features=400, bias=False)
347 |     (1): ReLU(inplace=True)
348 |     (2): Linear(in_features=400, out_features=400, bias=False)
349 |     (3): ReLU(inplace=True)
350 |   )
351 |   (last): ModuleDict(
352 |     (All): Linear(in_features=400, out_features=2, bias=True)
353 |   )
354 | )
355 | #parameter of model:  570402
356 | no learnable params:  570402
357 | Task order: ['1', '2', '3', '4', '5']
358 | Epoch:0
359 | LR: 0.001
360 | Itr		Time		  Data		  Loss		Acc
361 | [0/469]	0.1113 (0.1113)	0.1071 (0.1071)	0.681 (0.681)	53.12 (53.12)
362 | [100/469]	0.0063 (0.0127)	0.0039 (0.0102)	0.127 (0.195)	93.75 (92.13)
363 | [200/469]	0.0023 (0.0123)	0.0003 (0.0098)	0.066 (0.147)	97.66 (94.27)
364 | [300/469]	0.0126 (0.0125)	0.0099 (0.0099)	0.026 (0.124)	99.22 (95.29)
365 | [400/469]	0.0343 (0.0123)	0.0319 (0.0097)	0.104 (0.110)	97.66 (95.86)
366 | [468/469]	0.0024 (0.0122)	0.0002 (0.0096)	0.068 (0.105)	97.92 (96.09)
367 |  * Train Acc 96.087
368 |  * Val Acc 98.420, Total time 1.01
369 | Epoch:1
370 | LR: 0.001
371 | Itr		Time		  Data		  Loss		Acc
372 | [0/469]	0.1149 (0.1149)	0.1114 (0.1114)	0.047 (0.047)	99.22 (99.22)
373 | [100/469]	0.0031 (0.0126)	0.0007 (0.0101)	0.078 (0.051)	96.88 (98.18)
374 | [200/469]	0.0041 (0.0127)	0.0020 (0.0095)	0.071 (0.054)	95.31 (98.09)
375 | [300/469]	0.0113 (0.0123)	0.0089 (0.0093)	0.038 (0.050)	99.22 (98.26)
376 | [400/469]	0.0024 (0.0121)	0.0003 (0.0091)	0.085 (0.051)	97.66 (98.23)
377 | [468/469]	0.0049 (0.0120)	0.0029 (0.0091)	0.005 (0.050)	100.00 (98.26)
378 |  * Train Acc 98.260
379 |  * Val Acc 98.380, Total time 1.03
380 | Epoch:2
381 | LR: 0.001
382 | Itr		Time		  Data		  Loss		Acc
383 | [0/469]	0.1301 (0.1301)	0.1266 (0.1266)	0.014 (0.014)	100.00 (100.00)
384 | [100/469]	0.0038 (0.0137)	0.0014 (0.0104)	0.023 (0.034)	99.22 (98.82)
385 | [200/469]	0.0115 (0.0126)	0.0090 (0.0096)	0.065 (0.036)	98.44 (98.76)
386 | [300/469]	0.0046 (0.0122)	0.0021 (0.0093)	0.015 (0.036)	98.44 (98.70)
387 | [400/469]	0.0052 (0.0120)	0.0031 (0.0092)	0.039 (0.038)	98.44 (98.65)
388 | [468/469]	0.0056 (0.0121)	0.0034 (0.0093)	0.032 (0.038)	98.96 (98.66)
389 |  * Train Acc 98.663
390 |  * Val Acc 98.460, Total time 1.03
391 | Epoch:3
392 | LR: 0.001
393 | Itr		Time		  Data		  Loss		Acc
394 | [0/469]	0.1454 (0.1454)	0.1418 (0.1418)	0.046 (0.046)	99.22 (99.22)
395 | [100/469]	0.0025 (0.0129)	0.0003 (0.0103)	0.041 (0.027)	96.88 (98.96)
396 | [200/469]	0.0029 (0.0123)	0.0003 (0.0096)	0.028 (0.027)	99.22 (99.01)
397 | [300/469]	0.0045 (0.0124)	0.0021 (0.0097)	0.017 (0.027)	100.00 (99.00)
398 | [400/469]	0.0025 (0.0122)	0.0003 (0.0094)	0.056 (0.028)	99.22 (98.99)
399 | [468/469]	0.0210 (0.0121)	0.0189 (0.0094)	0.018 (0.029)	98.96 (98.97)
400 |  * Train Acc 98.967
401 |  * Val Acc 98.450, Total time 1.05
402 |  * Val Acc 98.450, Total time 1.02
403 | OrderedDict([('All', {'All': 98.45})])
404 | Task All average acc: 98.45
405 | ===Summary of experiment repeats: 6 / 10 ===
406 | The regularization coefficient: 0.0
407 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 98.45  0.    0.    0.    0.  ]
408 | mean: 59.225 std: 48.35712693078446
409 | split_boundaries: [0, 2, 4, 6, 8, 10]
410 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]}
411 | FF mlp
412 | MLP(
413 |   (linear): Sequential(
414 |     (0): Linear(in_features=1024, out_features=400, bias=False)
415 |     (1): ReLU(inplace=True)
416 |     (2): Linear(in_features=400, out_features=400, bias=False)
417 |     (3): ReLU(inplace=True)
418 |   )
419 |   (last): ModuleDict(
420 |     (All): Linear(in_features=400, out_features=2, bias=True)
421 |   )
422 | )
423 | #parameter of model:  570402
424 | no learnable params:  570402
425 | Task order: ['1', '2', '3', '4', '5']
426 | Epoch:0
427 | LR: 0.001
428 | Itr		Time		  Data		  Loss		Acc
429 | [0/469]	0.1043 (0.1043)	0.0988 (0.0988)	0.687 (0.687)	61.72 (61.72)
430 | [100/469]	0.0025 (0.0125)	0.0004 (0.0100)	0.076 (0.197)	96.09 (91.82)
431 | [200/469]	0.0027 (0.0119)	0.0003 (0.0093)	0.146 (0.147)	94.53 (94.12)
432 | [300/469]	0.0315 (0.0119)	0.0270 (0.0093)	0.059 (0.126)	97.66 (95.03)
433 | [400/469]	0.0079 (0.0120)	0.0037 (0.0091)	0.041 (0.113)	99.22 (95.62)
434 | [468/469]	0.0202 (0.0119)	0.0181 (0.0090)	0.087 (0.106)	96.88 (95.90)
435 |  * Train Acc 95.900
436 |  * Val Acc 97.920, Total time 1.10
437 | Epoch:1
438 | LR: 0.001
439 | Itr		Time		  Data		  Loss		Acc
440 | [0/469]	0.1452 (0.1452)	0.1416 (0.1416)	0.079 (0.079)	98.44 (98.44)
441 | [100/469]	0.0245 (0.0129)	0.0222 (0.0103)	0.050 (0.055)	97.66 (97.98)
442 | [200/469]	0.0102 (0.0121)	0.0078 (0.0096)	0.036 (0.052)	99.22 (98.08)
443 | [300/469]	0.0223 (0.0119)	0.0198 (0.0093)	0.124 (0.052)	96.09 (98.11)
444 | [400/469]	0.0025 (0.0120)	0.0003 (0.0091)	0.035 (0.052)	96.88 (98.14)
445 | [468/469]	0.0025 (0.0119)	0.0002 (0.0091)	0.012 (0.050)	100.00 (98.23)
446 |  * Train Acc 98.235
447 |  * Val Acc 98.430, Total time 1.00
448 | Epoch:2
449 | LR: 0.001
450 | Itr		Time		  Data		  Loss		Acc
451 | [0/469]	0.1390 (0.1390)	0.1355 (0.1355)	0.014 (0.014)	99.22 (99.22)
452 | [100/469]	0.0027 (0.0129)	0.0003 (0.0104)	0.026 (0.034)	100.00 (98.70)
453 | [200/469]	0.0039 (0.0121)	0.0016 (0.0096)	0.013 (0.035)	100.00 (98.75)
454 | [300/469]	0.0026 (0.0120)	0.0003 (0.0094)	0.003 (0.035)	100.00 (98.72)
455 | [400/469]	0.0025 (0.0121)	0.0003 (0.0094)	0.075 (0.036)	98.44 (98.71)
456 | [468/469]	0.0136 (0.0120)	0.0110 (0.0094)	0.013 (0.036)	100.00 (98.72)
457 |  * Train Acc 98.717
458 |  * Val Acc 98.360, Total time 1.01
459 | Epoch:3
460 | LR: 0.001
461 | Itr		Time		  Data		  Loss		Acc
462 | [0/469]	0.1502 (0.1502)	0.1463 (0.1463)	0.026 (0.026)	99.22 (99.22)
463 | [100/469]	0.0199 (0.0129)	0.0172 (0.0103)	0.047 (0.030)	96.88 (98.97)
464 | [200/469]	0.0036 (0.0122)	0.0014 (0.0095)	0.052 (0.028)	97.66 (99.06)
465 | [300/469]	0.0024 (0.0123)	0.0003 (0.0093)	0.026 (0.028)	99.22 (99.03)
466 | [400/469]	0.0166 (0.0121)	0.0143 (0.0092)	0.024 (0.028)	99.22 (99.03)
467 | [468/469]	0.0022 (0.0120)	0.0002 (0.0091)	0.057 (0.028)	96.88 (99.01)
468 |  * Train Acc 99.012
469 |  * Val Acc 98.470, Total time 0.99
470 |  * Val Acc 98.470, Total time 1.07
471 | OrderedDict([('All', {'All': 98.47})])
472 | Task All average acc: 98.47
473 | ===Summary of experiment repeats: 7 / 10 ===
474 | The regularization coefficient: 0.0
475 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 98.45 98.47  0.    0.    0.  ]
476 | mean: 69.072 std: 45.21841722130486
477 | split_boundaries: [0, 2, 4, 6, 8, 10]
478 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]}
479 | FF mlp
480 | MLP(
481 |   (linear): Sequential(
482 |     (0): Linear(in_features=1024, out_features=400, bias=False)
483 |     (1): ReLU(inplace=True)
484 |     (2): Linear(in_features=400, out_features=400, bias=False)
485 |     (3): ReLU(inplace=True)
486 |   )
487 |   (last): ModuleDict(
488 |     (All): Linear(in_features=400, out_features=2, bias=True)
489 |   )
490 | )
491 | #parameter of model:  570402
492 | no learnable params:  570402
493 | Task order: ['1', '2', '3', '4', '5']
494 | Epoch:0
495 | LR: 0.001
496 | Itr		Time		  Data		  Loss		Acc
497 | [0/469]	0.1434 (0.1434)	0.1396 (0.1396)	0.725 (0.725)	44.53 (44.53)
498 | [100/469]	0.0169 (0.0127)	0.0137 (0.0101)	0.201 (0.216)	92.97 (90.67)
499 | [200/469]	0.0124 (0.0126)	0.0071 (0.0097)	0.116 (0.157)	92.97 (93.62)
500 | [300/469]	0.0025 (0.0122)	0.0003 (0.0094)	0.054 (0.130)	98.44 (94.85)
501 | [400/469]	0.0022 (0.0120)	0.0003 (0.0092)	0.165 (0.116)	95.31 (95.47)
502 | [468/469]	0.0022 (0.0120)	0.0002 (0.0092)	0.043 (0.109)	98.96 (95.79)
503 |  * Train Acc 95.787
504 |  * Val Acc 97.520, Total time 1.02
505 | Epoch:1
506 | LR: 0.001
507 | Itr		Time		  Data		  Loss		Acc
508 | [0/469]	0.1120 (0.1120)	0.1083 (0.1083)	0.065 (0.065)	97.66 (97.66)
509 | [100/469]	0.0022 (0.0133)	0.0003 (0.0105)	0.048 (0.049)	97.66 (98.36)
510 | [200/469]	0.0045 (0.0125)	0.0003 (0.0099)	0.096 (0.050)	96.09 (98.30)
511 | [300/469]	0.0042 (0.0123)	0.0003 (0.0095)	0.053 (0.050)	98.44 (98.32)
512 | [400/469]	0.0026 (0.0122)	0.0003 (0.0094)	0.107 (0.050)	96.09 (98.31)
513 | [468/469]	0.0019 (0.0121)	0.0001 (0.0094)	0.015 (0.049)	100.00 (98.35)
514 |  * Train Acc 98.348
515 |  * Val Acc 98.570, Total time 0.98
516 | Epoch:2
517 | LR: 0.001
518 | Itr		Time		  Data		  Loss		Acc
519 | [0/469]	0.1443 (0.1443)	0.1407 (0.1407)	0.021 (0.021)	99.22 (99.22)
520 | [100/469]	0.0094 (0.0128)	0.0071 (0.0103)	0.042 (0.036)	98.44 (98.82)
521 | [200/469]	0.0060 (0.0127)	0.0003 (0.0100)	0.063 (0.037)	96.09 (98.73)
522 | [300/469]	0.0085 (0.0123)	0.0047 (0.0096)	0.019 (0.037)	99.22 (98.74)
523 | [400/469]	0.0030 (0.0123)	0.0003 (0.0096)	0.012 (0.037)	99.22 (98.71)
524 | [468/469]	0.0020 (0.0122)	0.0002 (0.0095)	0.026 (0.038)	97.92 (98.69)
525 |  * Train Acc 98.687
526 |  * Val Acc 98.540, Total time 1.00
527 | Epoch:3
528 | LR: 0.001
529 | Itr		Time		  Data		  Loss		Acc
530 | [0/469]	0.1162 (0.1162)	0.1128 (0.1128)	0.016 (0.016)	100.00 (100.00)
531 | [100/469]	0.0252 (0.0127)	0.0226 (0.0101)	0.099 (0.031)	98.44 (98.88)
532 | [200/469]	0.0026 (0.0120)	0.0004 (0.0095)	0.009 (0.031)	100.00 (98.85)
533 | [300/469]	0.0028 (0.0119)	0.0004 (0.0093)	0.008 (0.031)	100.00 (98.88)
534 | [400/469]	0.0205 (0.0119)	0.0175 (0.0093)	0.016 (0.031)	99.22 (98.87)
535 | [468/469]	0.0053 (0.0118)	0.0034 (0.0091)	0.021 (0.031)	97.92 (98.88)
536 |  * Train Acc 98.880
537 |  * Val Acc 98.690, Total time 1.06
538 |  * Val Acc 98.690, Total time 1.07
539 | OrderedDict([('All', {'All': 98.69})])
540 | Task All average acc: 98.69
541 | ===Summary of experiment repeats: 8 / 10 ===
542 | The regularization coefficient: 0.0
543 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 98.45 98.47 98.69  0.    0.  ]
544 | mean: 78.941 std: 39.470705200186124
545 | split_boundaries: [0, 2, 4, 6, 8, 10]
546 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]}
547 | FF mlp
548 | MLP(
549 |   (linear): Sequential(
550 |     (0): Linear(in_features=1024, out_features=400, bias=False)
551 |     (1): ReLU(inplace=True)
552 |     (2): Linear(in_features=400, out_features=400, bias=False)
553 |     (3): ReLU(inplace=True)
554 |   )
555 |   (last): ModuleDict(
556 |     (All): Linear(in_features=400, out_features=2, bias=True)
557 |   )
558 | )
559 | #parameter of model:  570402
560 | no learnable params:  570402
561 | Task order: ['1', '2', '3', '4', '5']
562 | Epoch:0
563 | LR: 0.001
564 | Itr		Time		  Data		  Loss		Acc
565 | [0/469]	0.1476 (0.1476)	0.1435 (0.1435)	0.684 (0.684)	63.28 (63.28)
566 | [100/469]	0.0024 (0.0127)	0.0003 (0.0102)	0.082 (0.200)	98.44 (91.97)
567 | [200/469]	0.0032 (0.0125)	0.0005 (0.0094)	0.055 (0.148)	97.66 (94.24)
568 | [300/469]	0.0200 (0.0122)	0.0177 (0.0092)	0.066 (0.128)	96.88 (95.13)
569 | [400/469]	0.0274 (0.0121)	0.0248 (0.0091)	0.034 (0.113)	98.44 (95.73)
570 | [468/469]	0.0019 (0.0119)	0.0001 (0.0090)	0.015 (0.105)	100.00 (96.03)
571 |  * Train Acc 96.032
572 |  * Val Acc 98.140, Total time 1.00
573 | Epoch:1
574 | LR: 0.001
575 | Itr		Time		  Data		  Loss		Acc
576 | [0/469]	0.1270 (0.1270)	0.1192 (0.1192)	0.033 (0.033)	98.44 (98.44)
577 | [100/469]	0.0041 (0.0135)	0.0019 (0.0108)	0.042 (0.048)	98.44 (98.22)
578 | [200/469]	0.0028 (0.0125)	0.0003 (0.0099)	0.071 (0.052)	96.88 (98.14)
579 | [300/469]	0.0024 (0.0122)	0.0003 (0.0096)	0.013 (0.051)	100.00 (98.18)
580 | [400/469]	0.0299 (0.0121)	0.0277 (0.0095)	0.029 (0.051)	98.44 (98.19)
581 | [468/469]	0.0022 (0.0119)	0.0001 (0.0093)	0.045 (0.050)	98.96 (98.23)
582 |  * Train Acc 98.228
583 |  * Val Acc 98.380, Total time 1.00
584 | Epoch:2
585 | LR: 0.001
586 | Itr		Time		  Data		  Loss		Acc
587 | [0/469]	0.1106 (0.1106)	0.1005 (0.1005)	0.048 (0.048)	98.44 (98.44)
588 | [100/469]	0.0023 (0.0137)	0.0003 (0.0110)	0.054 (0.032)	96.88 (98.79)
589 | [200/469]	0.0280 (0.0127)	0.0259 (0.0101)	0.014 (0.035)	100.00 (98.76)
590 | [300/469]	0.0024 (0.0123)	0.0003 (0.0097)	0.005 (0.036)	100.00 (98.76)
591 | [400/469]	0.0086 (0.0123)	0.0063 (0.0097)	0.054 (0.037)	96.88 (98.73)
592 | [468/469]	0.0119 (0.0123)	0.0059 (0.0096)	0.080 (0.037)	96.88 (98.70)
593 |  * Train Acc 98.702
594 |  * Val Acc 98.440, Total time 1.09
595 | Epoch:3
596 | LR: 0.001
597 | Itr		Time		  Data		  Loss		Acc
598 | [0/469]	0.1449 (0.1449)	0.1415 (0.1415)	0.010 (0.010)	99.22 (99.22)
599 | [100/469]	0.0035 (0.0128)	0.0014 (0.0104)	0.005 (0.026)	100.00 (99.04)
600 | [200/469]	0.0028 (0.0121)	0.0003 (0.0096)	0.074 (0.027)	98.44 (99.04)
601 | [300/469]	0.0024 (0.0121)	0.0003 (0.0094)	0.047 (0.027)	97.66 (99.02)
602 | [400/469]	0.0043 (0.0119)	0.0005 (0.0093)	0.019 (0.027)	99.22 (99.05)
603 | [468/469]	0.0020 (0.0120)	0.0001 (0.0093)	0.005 (0.027)	100.00 (99.03)
604 |  * Train Acc 99.035
605 |  * Val Acc 98.820, Total time 1.03
606 |  * Val Acc 98.820, Total time 1.03
607 | OrderedDict([('All', {'All': 98.82})])
608 | Task All average acc: 98.82
609 | ===Summary of experiment repeats: 9 / 10 ===
610 | The regularization coefficient: 0.0
611 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 98.45 98.47 98.69 98.82  0.  ]
612 | mean: 88.82300000000001 std: 29.607971240866874
613 | split_boundaries: [0, 2, 4, 6, 8, 10]
614 | {'1': [0, 1], '2': [2, 3], '3': [4, 5], '4': [6, 7], '5': [8, 9]}
615 | FF mlp
616 | MLP(
617 |   (linear): Sequential(
618 |     (0): Linear(in_features=1024, out_features=400, bias=False)
619 |     (1): ReLU(inplace=True)
620 |     (2): Linear(in_features=400, out_features=400, bias=False)
621 |     (3): ReLU(inplace=True)
622 |   )
623 |   (last): ModuleDict(
624 |     (All): Linear(in_features=400, out_features=2, bias=True)
625 |   )
626 | )
627 | #parameter of model:  570402
628 | no learnable params:  570402
629 | Task order: ['1', '2', '3', '4', '5']
630 | Epoch:0
631 | LR: 0.001
632 | Itr		Time		  Data		  Loss		Acc
633 | [0/469]	0.1118 (0.1118)	0.1081 (0.1081)	0.691 (0.691)	48.44 (48.44)
634 | [100/469]	0.0275 (0.0136)	0.0251 (0.0107)	0.138 (0.198)	95.31 (91.92)
635 | [200/469]	0.0090 (0.0125)	0.0066 (0.0099)	0.063 (0.149)	97.66 (94.10)
636 | [300/469]	0.0027 (0.0121)	0.0003 (0.0094)	0.039 (0.126)	99.22 (95.09)
637 | [400/469]	0.0138 (0.0122)	0.0065 (0.0092)	0.115 (0.112)	97.66 (95.74)
638 | [468/469]	0.0026 (0.0121)	0.0002 (0.0091)	0.158 (0.105)	95.83 (96.02)
639 |  * Train Acc 96.017
640 |  * Val Acc 97.890, Total time 1.02
641 | Epoch:1
642 | LR: 0.001
643 | Itr		Time		  Data		  Loss		Acc
644 | [0/469]	0.1237 (0.1237)	0.1168 (0.1168)	0.070 (0.070)	97.66 (97.66)
645 | [100/469]	0.0025 (0.0127)	0.0003 (0.0102)	0.036 (0.054)	98.44 (98.13)
646 | [200/469]	0.0195 (0.0125)	0.0173 (0.0098)	0.047 (0.055)	98.44 (98.08)
647 | [300/469]	0.0217 (0.0122)	0.0194 (0.0095)	0.024 (0.055)	100.00 (98.10)
648 | [400/469]	0.0134 (0.0120)	0.0109 (0.0093)	0.067 (0.053)	98.44 (98.16)
649 | [468/469]	0.0122 (0.0119)	0.0098 (0.0093)	0.025 (0.052)	100.00 (98.20)
650 |  * Train Acc 98.205
651 |  * Val Acc 98.440, Total time 1.07
652 | Epoch:2
653 | LR: 0.001
654 | Itr		Time		  Data		  Loss		Acc
655 | [0/469]	0.1333 (0.1333)	0.1301 (0.1301)	0.017 (0.017)	99.22 (99.22)
656 | [100/469]	0.0159 (0.0129)	0.0137 (0.0104)	0.016 (0.037)	99.22 (98.69)
657 | [200/469]	0.0285 (0.0122)	0.0261 (0.0097)	0.028 (0.035)	97.66 (98.73)
658 | [300/469]	0.0025 (0.0122)	0.0003 (0.0095)	0.017 (0.036)	99.22 (98.75)
659 | [400/469]	0.0025 (0.0120)	0.0003 (0.0093)	0.057 (0.035)	99.22 (98.78)
660 | [468/469]	0.0207 (0.0120)	0.0186 (0.0093)	0.013 (0.036)	98.96 (98.75)
661 |  * Train Acc 98.747
662 |  * Val Acc 98.540, Total time 1.01
663 | Epoch:3
664 | LR: 0.001
665 | Itr		Time		  Data		  Loss		Acc
666 | [0/469]	0.1425 (0.1425)	0.1363 (0.1363)	0.009 (0.009)	100.00 (100.00)
667 | [100/469]	0.0080 (0.0138)	0.0046 (0.0102)	0.040 (0.027)	99.22 (98.99)
668 | [200/469]	0.0033 (0.0127)	0.0003 (0.0096)	0.093 (0.027)	98.44 (99.01)
669 | [300/469]	0.0216 (0.0122)	0.0187 (0.0091)	0.010 (0.028)	100.00 (99.00)
670 | [400/469]	0.0040 (0.0121)	0.0016 (0.0091)	0.003 (0.029)	100.00 (98.94)
671 | [468/469]	0.0018 (0.0119)	0.0001 (0.0090)	0.106 (0.030)	98.96 (98.94)
672 |  * Train Acc 98.940
673 |  * Val Acc 98.470, Total time 0.93
674 |  * Val Acc 98.470, Total time 0.89
675 | OrderedDict([('All', {'All': 98.47})])
676 | Task All average acc: 98.47
677 | ===Summary of experiment repeats: 10 / 10 ===
678 | The regularization coefficient: 0.0
679 | The last avg acc of all repeats: [98.84 98.66 98.85 98.67 98.78 98.45 98.47 98.69 98.82 98.47]
680 | mean: 98.67 std: 0.14993331851192948
681 | reg_coef: 0.0 mean: 98.67 std: 0.14993331851192948
682 | 


--------------------------------------------------------------------------------