├── .gitignore
├── eve_loss.png
├── eve_test_loss.png
├── README.md
├── eve.py
└── eve_test.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
--------------------------------------------------------------------------------
/eve_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moskomule/eve.pytorch/HEAD/eve_loss.png
--------------------------------------------------------------------------------
/eve_test_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moskomule/eve.pytorch/HEAD/eve_test_loss.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Eve.pytorch

An implementation of the **Eve optimizer**, proposed in *Improving Stochastic Gradient Descent with Feedback*, Koushik and Hayashi, 2016 ([arXiv](https://arxiv.org/abs/1611.01505)).

Training and test loss on CIFAR-10, compared with Adam. Both optimizers use their default hyperparameters (see `eve_test.py`).

![training losses](eve_loss.png)
![test losses](eve_test_loss.png)
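## Usage

`Eve.step()` always requires a closure, because the optimizer reads the current loss value to compute its feedback term. A minimal sketch (the model, data, and loss below are placeholders, not part of this repository):

```python
import torch
from eve import Eve

model = torch.nn.Linear(10, 1)
optimizer = Eve(model.parameters(), lr=1e-3)  # defaults: betas=(0.9, 0.999, 0.999), k=0.1, K=10

x, y = torch.randn(32, 10), torch.randn(32, 1)

def closure():
    # re-evaluate the model and return the loss; Eve uses its value every step
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    return loss

loss = optimizer.step(closure)
```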
--------------------------------------------------------------------------------
/eve.py:
--------------------------------------------------------------------------------
import torch
from torch.optim import Optimizer


class Eve(Optimizer):
    """
    Implements the Eve algorithm, proposed in
    `Improving Stochastic Gradient Descent with Feedback` (Koushik and Hayashi, 2016).
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999, 0.999), eps=1e-8,
                 k=0.1, K=10, weight_decay=0):

        defaults = dict(lr=lr, betas=betas, eps=eps,
                        k=k, K=K, weight_decay=weight_decay)
        super(Eve, self).__init__(params, defaults)

    def step(self, closure):
        """
        Performs a single optimization step.

        :param closure: a closure that re-evaluates the model and returns the loss.
                        Unlike most optimizers, Eve always requires the closure because
                        the current loss value drives its feedback term.
                        See https://pytorch.org/docs/stable/optim.html#optimizer-step-closure
        :return: loss
        """
        loss = closure()
        _loss = loss.item()  # current objective value f(\theta_{t-1}) as a float

        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['m_t'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['v_t'] = torch.zeros_like(p.data)
                    # f hats, smoothly tracked objective values
                    # \hat{f}_0 = f_0
                    state['ft_2'], state['ft_1'] = _loss, None
                    state['d'] = 1

                m_t, v_t = state['m_t'], state['v_t']
                beta1, beta2, beta3 = group['betas']
                k, K = group['k'], group['K']
                d = state['d']
                state['step'] += 1
                t = state['step']
                # initialization of \hat{f}_1
                if t == 1:
                    # \hat{f}_1 = f_1
                    state['ft_1'] = _loss
                # \hat{f}_{t-1}, \hat{f}_{t-2}
                ft_1, ft_2 = state['ft_1'], state['ft_2']
                # f(\theta_{t-1})
                f = _loss

                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficients
                m_t.mul_(beta1).add_(grad, alpha=1 - beta1)
                v_t.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # bias-corrected moment estimates
                m_t_hat = m_t / (1 - beta1 ** t)
                v_t_hat = v_t / (1 - beta2 ** t)

                if t > 1:
                    # feedback term: clip the relative change of the smoothed
                    # objective to [delta, Delta] and track it with d
                    if f >= ft_2:
                        delta = k + 1
                        Delta = K + 1
                    else:
                        delta = 1 / (K + 1)
                        Delta = 1 / (k + 1)

                    c = min(max(delta, f / ft_2), Delta)
                    r = abs(c - 1) / min(c, 1)
                    state['ft_1'], state['ft_2'] = c * ft_2, ft_1
                    state['d'] = beta3 * d + (1 - beta3) * r

                # update parameters: an Adam-style step scaled by 1 / d
                p.data.addcdiv_(m_t_hat,
                                v_t_hat.sqrt().add_(group['eps']),
                                value=-group['lr'] / state['d'])

        return loss
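

if __name__ == "__main__":
    # Minimal smoke test -- a sketch added for convenience, not part of the
    # algorithm itself. It fits a small linear regression with Eve and prints
    # the final loss, which should be much smaller than the initial one.
    # The data and model below are arbitrary placeholders.
    import torch

    torch.manual_seed(0)
    w_true = torch.randn(5, 1)
    x = torch.randn(256, 5)
    y = x @ w_true + 0.01 * torch.randn(256, 1)

    model = torch.nn.Linear(5, 1, bias=False)
    optimizer = Eve(model.parameters(), lr=1e-2)

    def closure():
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        return loss

    for _ in range(200):
        loss = optimizer.step(closure)
    print("final mse:", loss.item())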
--------------------------------------------------------------------------------
/eve_test.py:
--------------------------------------------------------------------------------
from eve import Eve

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# variables
batch_size = 128
epochs = 100
cuda = torch.cuda.is_available()

# load data
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
                                ])
train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('data/cifar10', train=True, download=True,
                     transform=transform),
    batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('data/cifar10', train=False, transform=transform),
    batch_size=batch_size, shuffle=False)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=3)

        self.conv3 = nn.Conv2d(32, 64, kernel_size=3)
        self.conv4 = nn.Conv2d(64, 64, kernel_size=3)
        # 32x32 input -> conv1 -> 30 -> conv2 -> 28 -> pool -> 14
        #             -> conv3 -> 12 -> conv4 -> 10 -> pool -> 5
        # flattened feature size: 64 channels * 5 * 5 = 64 * 25
        self.dense1 = nn.Linear(in_features=64 * 25, out_features=512)
        self.dense1_bn = nn.BatchNorm1d(512)
        self.dense2 = nn.Linear(512, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(F.dropout(F.max_pool2d(self.conv2(x), 2), 0.25,
                             training=self.training))
        x = F.relu(self.conv3(x))
        x = F.relu(F.dropout(F.max_pool2d(self.conv4(x), 2), 0.25,
                             training=self.training))
        x = x.view(-1, 64 * 25)  # flatten
        x = F.relu(self.dense1_bn(self.dense1(x)))
        return F.log_softmax(self.dense2(x), dim=1)


def train(epoch, model, optimizer):
    model.train()
    total_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        if cuda:
            data, target = data.cuda(), target.cuda()

        def closure():
            optimizer.zero_grad()  # reset gradients
            output = model(data)
            loss = F.nll_loss(output, target)  # negative log likelihood loss
            loss.backward()  # backprop
            return loss

        loss = optimizer.step(closure)
        total_loss += loss.item() / len(train_loader)
        if batch_idx % 20 == 0:
            print('\rTrain Epoch: {} [{}/{} ({:>4.2%})] Loss: {:>5.3}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                batch_idx / len(train_loader), total_loss),
                end="")
    return total_loss


def test(epoch, model):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            test_loss += F.nll_loss(output, target).item()
            pred = output.max(1)[1]  # index of the max log-probability
            correct += pred.eq(target).cpu().sum().item()

    test_loss /= len(test_loader)  # nll_loss already averages over the batch
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2%})'.format(
        test_loss, correct, len(test_loader.dataset),
        correct / len(test_loader.dataset)))
    return test_loss


def plot(loss_a, loss_b, filename, ylabel):
    import matplotlib
    matplotlib.use("AGG")
    import matplotlib.pyplot as plt
    plt.plot(loss_a)
    plt.plot(loss_b)
    plt.legend(["Eve", "Adam"])
    plt.xlabel("epochs")
    plt.ylabel(ylabel)
    plt.savefig(filename)
    plt.clf()


print("Eve")
eve_loss = []
eve_test_loss = []
model = Net()
if cuda:
    model.cuda()
optimizer = Eve(model.parameters())
for i in range(1, epochs + 1):
    eve_loss.append(train(i, model, optimizer))
    eve_test_loss.append(test(i, model))

print("Adam")
adam_loss = []
adam_test_loss = []
model = Net()
if cuda:
    model.cuda()
optimizer = optim.Adam(model.parameters())
for i in range(1, epochs + 1):
    adam_loss.append(train(i, model, optimizer))
    adam_test_loss.append(test(i, model))

plot(eve_loss, adam_loss, "eve_loss.png", "training loss")
plot(eve_test_loss, adam_test_loss, "eve_test_loss.png", "testing loss")
--------------------------------------------------------------------------------