├── README.md
├── data
│   ├── processed
│   │   ├── test.pt
│   │   └── training.pt
│   └── raw
│       ├── t10k-images-idx3-ubyte
│       ├── t10k-labels-idx1-ubyte
│       ├── train-images-idx3-ubyte
│       └── train-labels-idx1-ubyte
└── mnist
    ├── main.py
    ├── main_cnn.py
    ├── main_copy.py
    └── sparsemax.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# SparsemaxPytorch

Implementation in PyTorch of http://proceedings.mlr.press/v48/martins16.pdf (International Conference on Machine Learning 2016).

Sparsemax is an activation function similar to softmax, but it can assign exactly zero probability to some outputs, producing sparse probability distributions. This makes it interesting for attention models.

I tested it as a replacement for the softmax activation in the last layer and it gives similar results.

Coded by Max Raphael Sobroza Marques
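A minimal usage sketch (illustrative only, written against the legacy `Variable` API this repo uses; `Sparsemax` is the module defined in `mnist/sparsemax.py`):

```python
import torch
from torch.autograd import Variable
from sparsemax import Sparsemax

# One cluster of 5 logits; the module also returns the intermediate
# quantities (zs_sparse, taus, is_gt) consumed by MultiLabelSparseMaxLoss.
sparsemax = Sparsemax(num_clusters=1, num_neurons_per_cluster=5)
logits = Variable(torch.Tensor([[2.0, 1.8, 0.1, -1.0, -2.0]]))
probs, zs_sparse, taus, is_gt = sparsemax(logits)
print(probs)  # -> [[0.6, 0.4, 0.0, 0.0, 0.0]]: sums to 1, exact zeros for small logits
```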
--------------------------------------------------------------------------------
/data/processed/test.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msobroza/SparsemaxPytorch/cedb9389b243ccf988de9ae3267d873a7310070e/data/processed/test.pt
--------------------------------------------------------------------------------
/data/processed/training.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msobroza/SparsemaxPytorch/cedb9389b243ccf988de9ae3267d873a7310070e/data/processed/training.pt
--------------------------------------------------------------------------------
/data/raw/t10k-images-idx3-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msobroza/SparsemaxPytorch/cedb9389b243ccf988de9ae3267d873a7310070e/data/raw/t10k-images-idx3-ubyte
--------------------------------------------------------------------------------
/data/raw/t10k-labels-idx1-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msobroza/SparsemaxPytorch/cedb9389b243ccf988de9ae3267d873a7310070e/data/raw/t10k-labels-idx1-ubyte
--------------------------------------------------------------------------------
/data/raw/train-images-idx3-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msobroza/SparsemaxPytorch/cedb9389b243ccf988de9ae3267d873a7310070e/data/raw/train-images-idx3-ubyte
--------------------------------------------------------------------------------
/data/raw/train-labels-idx1-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msobroza/SparsemaxPytorch/cedb9389b243ccf988de9ae3267d873a7310070e/data/raw/train-labels-idx1-ubyte
--------------------------------------------------------------------------------
/mnist/main.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import argparse
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from torchvision import datasets, transforms

from sparsemax import Sparsemax, MultiLabelSparseMaxLoss

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                    help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)


kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)


def generateRandomCliqueVector(clusters, nodes_per_cluster):
    # Random target code with exactly one active unit per cluster.
    result = np.zeros(clusters * nodes_per_cluster)
    for i in range(clusters):
        j = random.randint(0, nodes_per_cluster - 1)
        result[i * nodes_per_cluster + j] = 1.0
    return result


class Net(nn.Module):
    def __init__(self, H_clusters, H_neurons_per_cluster):
        super(Net, self).__init__()
        self.H_clusters = H_clusters
        self.H_neurons_per_cluster = H_neurons_per_cluster
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, self.H_clusters * self.H_neurons_per_cluster)
        self.sparsemaxActivation = Sparsemax(self.H_clusters, self.H_neurons_per_cluster)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        y_pred, zs_sparse, taus, is_gt = self.sparsemaxActivation(x)
        return x, y_pred, zs_sparse, taus, is_gt


H_clusters, H_neurons_per_cluster, N_class = 1, 10, 10
model = Net(H_clusters, H_neurons_per_cluster)
sparsemaxMulticlassLoss = MultiLabelSparseMaxLoss(H_clusters, H_neurons_per_cluster)
if args.cuda:
    model.cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

# Fixed one-hot codes for the 10 classes; the embedding below is just a
# frozen lookup table for these codes.
code_target_class = np.zeros((N_class, H_clusters * H_neurons_per_cluster), dtype='float32')
for i in range(N_class):
    one_hot_vector = np.zeros(H_clusters * H_neurons_per_cluster)
    # code_target_class[i] = generateRandomCliqueVector(H_clusters, H_neurons_per_cluster)
    one_hot_vector[i] = 1.0
    code_target_class[i] = one_hot_vector

table_embedding = nn.Embedding(N_class, H_clusters * H_neurons_per_cluster, sparse=True)
table_embedding.weight = nn.Parameter(torch.from_numpy(code_target_class))
table_embedding.weight.requires_grad = False
if args.cuda:
    table_embedding.cuda()


def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        code_target = table_embedding(target)
        optimizer.zero_grad()
        input_sparsemax, y_pred, zs_sparse, taus, is_gt = model(data)
        loss = sparsemaxMulticlassLoss(input_sparsemax, zs_sparse, code_target, y_pred, taus, is_gt)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data[0]))


def test():
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        _, output, _, _, _ = model(data)
        # print(output)  # debug
        # test_loss += F.nll_loss(output, target, size_average=False).data[0]  # sum up batch loss
        pred = output.data.max(1)[1]  # index of the largest sparsemax probability
        correct += pred.eq(target.data).cpu().sum()

    # Note: test_loss stays at 0 while the accumulation above is commented out.
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


for epoch in range(1, args.epochs + 1):
    train(epoch)
    test()
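
# A worked trace of the projection that Sparsemax.forward (sparsemax.py)
# implements, on a single cluster z = [1.0, 0.7, 0.1]; a NumPy sketch,
# not part of the training script above:
import numpy as np

z = np.array([1.0, 0.7, 0.1])
z_sorted = np.sort(z)[::-1]                       # [1.0, 0.7, 0.1]
k = np.arange(1, len(z) + 1)                      # [1, 2, 3]
support = 1 + k * z_sorted > np.cumsum(z_sorted)  # [True, True, False]
k_z = k[support].max()                            # support size k(z) = 2
tau = (z_sorted[:k_z].sum() - 1) / k_z            # (1.7 - 1) / 2 = 0.35
p = np.maximum(z - tau, 0)                        # [0.65, 0.35, 0.0], sums to 1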
--------------------------------------------------------------------------------
/mnist/main_cnn.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import argparse
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from torchvision import datasets, transforms

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                    help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)


kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)


def generateRandomCliqueVector(clusters, nodes_per_cluster):
    # Random target code with exactly one active unit per cluster.
    result = np.zeros(clusters * nodes_per_cluster)
    for i in range(clusters):
        j = random.randint(0, nodes_per_cluster - 1)
        result[i * nodes_per_cluster + j] = 1.0
    return result


class Sparsemax(nn.Module):
    """Sparsemax activation (Martins & Astudillo, 2016), applied independently
    to each cluster of neurons."""

    def __init__(self, num_clusters, num_neurons_per_cluster):
        super(Sparsemax, self).__init__()
        self.num_clusters = num_clusters
        self.num_neurons_per_cluster = num_neurons_per_cluster

    def forward(self, input):
        input_reshape = input.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        dim = 2
        # translate for numerical stability (sparsemax is invariant to shifts)
        input_shift = input_reshape  # - torch.max(input_reshape, dim)[0].expand_as(input_reshape)

        # sort each cluster's activations in descending order
        z_sorted = torch.sort(input_shift, dim=dim, descending=True)[0]
        input_size = input_shift.size()[dim]
        range_values = Variable(torch.arange(1, input_size + 1), requires_grad=False).type_as(input)
        range_values = range_values.expand_as(z_sorted)

        # determine the support of the projection:
        # sorted index k is in the support iff 1 + k * z_(k) > sum_{j <= k} z_(j)
        bound = 1 + range_values * z_sorted
        cumsum_zs = torch.cumsum(z_sorted, dim)
        is_gt = torch.gt(bound, cumsum_zs).type_as(input)
        valid = range_values * is_gt
        k_max = torch.max(valid, dim)[0]   # support size k(z)
        zs_sparse = is_gt * z_sorted       # sorted activations inside the support
        sum_zs = torch.sum(zs_sparse, dim) - 1
        taus = sum_zs / k_max              # threshold tau(z)
        taus_expanded = taus.expand_as(input_reshape)
        output = torch.clamp(input_shift - taus_expanded, min=0)
        self.output = output  # saved for the manual backward below
        return output.view(-1, self.num_clusters * self.num_neurons_per_cluster), zs_sparse, taus, is_gt

    def backward(self, grad_output):
        # Closed-form Jacobian-vector product of sparsemax:
        # (J v)_j = v_j - mean_{k in support} v_k if j is in the support, else 0.
        # Autograd already differentiates the operations used in forward(), so
        # loss.backward() never calls this method; it is kept for reference.
        output = self.output.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        grad_output = grad_output.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        dim = 2
        non_zeros = torch.ne(output, 0).type_as(output)
        mask_grad = non_zeros * grad_output
        sum_mask_grad = torch.sum(mask_grad, dim)
        l1_norm_non_zeros = torch.sum(non_zeros, dim)
        sum_v = sum_mask_grad / l1_norm_non_zeros
        self.gradInput = non_zeros * (grad_output - sum_v.expand_as(grad_output))
        self.gradInput = self.gradInput.view(-1, self.num_clusters * self.num_neurons_per_cluster)
        return self.gradInput


class MultiLabelSparseMaxLoss(nn.Module):
    """Sparsemax loss of Martins & Astudillo (2016), one term per cluster:
    L(z; q) = -q^T z + 0.5 * sum_{j in S(z)} (z_j^2 - tau(z)^2) + 0.5 * ||q||^2"""

    def __init__(self, num_clusters, num_neurons_per_cluster):
        super(MultiLabelSparseMaxLoss, self).__init__()
        self.num_clusters = num_clusters
        self.num_neurons_per_cluster = num_neurons_per_cluster

    def forward(self, input, zs_sparse, target, output_sparsemax, taus, is_gt):
        self.output_sparsemax = output_sparsemax
        input = input.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        self.target = target.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        self.batch_size = input.size(0)
        dim = 2
        target_times_input = torch.sum(self.target * input, dim)
        target_inner_product = torch.sum(self.target * self.target, dim)
        zs_squared = zs_sparse * zs_sparse
        taus_squared = (taus * taus).expand_as(zs_squared)
        taus_squared = taus_squared * is_gt
        sum_input_taus = torch.sum(zs_squared - taus_squared, dim)
        sparsemax_loss = -target_times_input + 0.5 * sum_input_taus + 0.5 * target_inner_product
        sparsemax_loss = torch.sum(sparsemax_loss) / (self.batch_size * self.num_clusters)
        return sparsemax_loss

    def backward(self):
        # Gradient of the sparsemax loss w.r.t. the logits: sparsemax(z) - q.
        # As with Sparsemax.backward, autograd does not call this; kept for reference.
        grad_output = (self.output_sparsemax - self.target) / (self.batch_size * self.num_clusters)
        return grad_output


class Net(nn.Module):
    def __init__(self, H_clusters, H_neurons_per_cluster):
        super(Net, self).__init__()
        self.H_clusters = H_clusters
        self.H_neurons_per_cluster = H_neurons_per_cluster
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, self.H_clusters * self.H_neurons_per_cluster)
        self.sparsemaxActivation = Sparsemax(self.H_clusters, self.H_neurons_per_cluster)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        y_pred, zs_sparse, taus, is_gt = self.sparsemaxActivation(x)
        return x, y_pred, zs_sparse, taus, is_gt


H_clusters, H_neurons_per_cluster, N_class = 1, 10, 10
model = Net(H_clusters, H_neurons_per_cluster)
sparsemaxMulticlassLoss = MultiLabelSparseMaxLoss(H_clusters, H_neurons_per_cluster)
if args.cuda:
    model.cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

# Fixed one-hot codes for the 10 classes; the embedding below is just a
# frozen lookup table for these codes.
code_target_class = np.zeros((N_class, H_clusters * H_neurons_per_cluster), dtype='float32')
for i in range(N_class):
    one_hot_vector = np.zeros(H_clusters * H_neurons_per_cluster)
    # code_target_class[i] = generateRandomCliqueVector(H_clusters, H_neurons_per_cluster)
    one_hot_vector[i] = 1.0
    code_target_class[i] = one_hot_vector

table_embedding = nn.Embedding(N_class, H_clusters * H_neurons_per_cluster, sparse=True)
table_embedding.weight = nn.Parameter(torch.from_numpy(code_target_class))
table_embedding.weight.requires_grad = False
if args.cuda:
    table_embedding.cuda()


def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        code_target = table_embedding(target)
        optimizer.zero_grad()
        input_sparsemax, y_pred, zs_sparse, taus, is_gt = model(data)
        loss = sparsemaxMulticlassLoss(input_sparsemax, zs_sparse, code_target, y_pred, taus, is_gt)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data[0]))


def test():
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        _, output, _, _, _ = model(data)
        # print(output)  # debug
        # test_loss += F.nll_loss(output, target, size_average=False).data[0]  # sum up batch loss
        pred = output.data.max(1)[1]  # index of the largest sparsemax probability
        correct += pred.eq(target.data).cpu().sum()

    # Note: test_loss stays at 0 while the accumulation above is commented out.
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


for epoch in range(1, args.epochs + 1):
    train(epoch)
    test()
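
# A quick check (a sketch, assuming the legacy Variable API above) that
# autograd's gradient through Sparsemax.forward matches the closed form in the
# manual backward: (J v)_j = v_j - mean_{k in support} v_k on the support, 0 elsewhere.
import torch
from torch.autograd import Variable
from sparsemax import Sparsemax

sparsemax = Sparsemax(1, 5)
z = Variable(torch.Tensor([[2.0, 1.8, 0.1, -1.0, -2.0]]), requires_grad=True)
probs, _, _, _ = sparsemax(z)          # support is {2.0, 1.8}, probs = [0.6, 0.4, 0, 0, 0]
v = torch.Tensor([[1.0, 0.0, 0.0, 0.0, 0.0]])
probs.backward(v)

support = (probs.data != 0).float()
expected = support * (v - (v * support).sum() / support.sum())
print(z.grad.data)                     # should match `expected`: [0.5, -0.5, 0, 0, 0]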
--------------------------------------------------------------------------------
/mnist/main_copy.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import argparse
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from torchvision import datasets, transforms

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                    help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)


kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)


def generateRandomCliqueVector(clusters, nodes_per_cluster):
    # Random target code with exactly one active unit per cluster.
    result = np.zeros(clusters * nodes_per_cluster)
    for i in range(clusters):
        j = random.randint(0, nodes_per_cluster - 1)
        result[i * nodes_per_cluster + j] = 1.0
    return result


class Sparsemax(nn.Module):
    """Sparsemax activation (Martins & Astudillo, 2016), applied independently
    to each cluster of neurons."""

    def __init__(self, num_clusters, num_neurons_per_cluster):
        super(Sparsemax, self).__init__()
        self.num_clusters = num_clusters
        self.num_neurons_per_cluster = num_neurons_per_cluster

    def forward(self, input):
        input_reshape = input.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        dim = 2
        # translate for numerical stability (sparsemax is invariant to shifts)
        input_shift = input_reshape  # - torch.max(input_reshape, dim)[0].expand_as(input_reshape)

        # sort each cluster's activations in descending order
        z_sorted = torch.sort(input_shift, dim=dim, descending=True)[0]
        input_size = input_shift.size()[dim]
        range_values = Variable(torch.arange(1, input_size + 1), requires_grad=False).type_as(input)
        range_values = range_values.expand_as(z_sorted)

        # determine the support of the projection:
        # sorted index k is in the support iff 1 + k * z_(k) > sum_{j <= k} z_(j)
        bound = 1 + range_values * z_sorted
        cumsum_zs = torch.cumsum(z_sorted, dim)
        is_gt = torch.gt(bound, cumsum_zs).type_as(input)
        valid = range_values * is_gt
        k_max = torch.max(valid, dim)[0]   # support size k(z)
        zs_sparse = is_gt * z_sorted       # sorted activations inside the support
        sum_zs = torch.sum(zs_sparse, dim) - 1
        taus = sum_zs / k_max              # threshold tau(z)
        taus_expanded = taus.expand_as(input_reshape)
        output = torch.clamp(input_shift - taus_expanded, min=0)
        self.output = output  # saved for the manual backward below
        return output.view(-1, self.num_clusters * self.num_neurons_per_cluster), zs_sparse, taus, is_gt

    def backward(self, grad_output):
        # Closed-form Jacobian-vector product of sparsemax:
        # (J v)_j = v_j - mean_{k in support} v_k if j is in the support, else 0.
        # Autograd already differentiates the operations used in forward(), so
        # loss.backward() never calls this method; it is kept for reference.
        output = self.output.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        grad_output = grad_output.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        dim = 2
        non_zeros = torch.ne(output, 0).type_as(output)
        mask_grad = non_zeros * grad_output
        sum_mask_grad = torch.sum(mask_grad, dim)
        l1_norm_non_zeros = torch.sum(non_zeros, dim)
        sum_v = sum_mask_grad / l1_norm_non_zeros
        self.gradInput = non_zeros * (grad_output - sum_v.expand_as(grad_output))
        self.gradInput = self.gradInput.view(-1, self.num_clusters * self.num_neurons_per_cluster)
        return self.gradInput


class MultiLabelSparseMaxLoss(nn.Module):
    """Sparsemax loss of Martins & Astudillo (2016), one term per cluster:
    L(z; q) = -q^T z + 0.5 * sum_{j in S(z)} (z_j^2 - tau(z)^2) + 0.5 * ||q||^2"""

    def __init__(self, num_clusters, num_neurons_per_cluster):
        super(MultiLabelSparseMaxLoss, self).__init__()
        self.num_clusters = num_clusters
        self.num_neurons_per_cluster = num_neurons_per_cluster

    def forward(self, input, zs_sparse, target, output_sparsemax, taus, is_gt):
        self.output_sparsemax = output_sparsemax
        input = input.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        self.target = target.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        self.batch_size = input.size(0)
        dim = 2
        target_times_input = torch.sum(self.target * input, dim)
        target_inner_product = torch.sum(self.target * self.target, dim)
        zs_squared = zs_sparse * zs_sparse
        taus_squared = (taus * taus).expand_as(zs_squared)
        taus_squared = taus_squared * is_gt
        sum_input_taus = torch.sum(zs_squared - taus_squared, dim)
        sparsemax_loss = -target_times_input + 0.5 * sum_input_taus + 0.5 * target_inner_product
        sparsemax_loss = torch.sum(sparsemax_loss) / (self.batch_size * self.num_clusters)
        return sparsemax_loss

    def backward(self):
        # Gradient of the sparsemax loss w.r.t. the logits: sparsemax(z) - q.
        # As with Sparsemax.backward, autograd does not call this; kept for reference.
        grad_output = (self.output_sparsemax - self.target) / (self.batch_size * self.num_clusters)
        return grad_output


class Net(nn.Module):
    def __init__(self, H_clusters, H_neurons_per_cluster):
        super(Net, self).__init__()
        self.H_clusters = H_clusters
        self.H_neurons_per_cluster = H_neurons_per_cluster
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, self.H_clusters * self.H_neurons_per_cluster)
        self.sparsemaxActivation = Sparsemax(self.H_clusters, self.H_neurons_per_cluster)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        y_pred, zs_sparse, taus, is_gt = self.sparsemaxActivation(x)
        return x, y_pred, zs_sparse, taus, is_gt


H_clusters, H_neurons_per_cluster, N_class = 10, 10, 10
model = Net(H_clusters, H_neurons_per_cluster)
sparsemaxMulticlassLoss = MultiLabelSparseMaxLoss(H_clusters, H_neurons_per_cluster)
if args.cuda:
    model.cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

# Random clique codes: each class gets a fixed code with exactly one active
# unit per cluster; the embedding below is just a frozen lookup table.
code_target_class = np.zeros((N_class, H_clusters * H_neurons_per_cluster), dtype='float32')
for i in range(N_class):
    code_target_class[i] = generateRandomCliqueVector(H_clusters, H_neurons_per_cluster)

table_embedding = nn.Embedding(N_class, H_clusters * H_neurons_per_cluster, sparse=True)
table_embedding.weight = nn.Parameter(torch.from_numpy(code_target_class))
table_embedding.weight.requires_grad = False
if args.cuda:
    table_embedding.cuda()


def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        code_target = table_embedding(target)
        optimizer.zero_grad()
        input_sparsemax, y_pred, zs_sparse, taus, is_gt = model(data)
        loss = sparsemaxMulticlassLoss(input_sparsemax, zs_sparse, code_target, y_pred, taus, is_gt)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data[0]))


def test():
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        _, output, _, _, _ = model(data)
        # test_loss += F.nll_loss(output, target, size_average=False).data[0]  # sum up batch loss
        # (disabled: sparsemax returns probabilities, not log-probabilities)
        pred = output.data.max(1)[1]  # index of the largest output unit
        correct += pred.eq(target.data).cpu().sum()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


for epoch in range(1, args.epochs + 1):
    train(epoch)
    # test()  # disabled: with clique codes the argmax over code units is not a class label
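
# What the clique codes in main_copy.py look like: with 10 clusters of 10
# units, each class code activates exactly one unit per cluster (10 ones
# among 100 entries). A self-contained sketch of generateRandomCliqueVector:
import random
import numpy as np

random.seed(0)
clusters, nodes = 10, 10
code = np.zeros(clusters * nodes)
for i in range(clusters):
    code[i * nodes + random.randint(0, nodes - 1)] = 1.0
print(code.reshape(clusters, nodes).sum(axis=1))  # -> [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]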
--------------------------------------------------------------------------------
/mnist/sparsemax.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import torch
import torch.nn as nn
from torch.autograd import Variable


class Sparsemax(nn.Module):
    """Sparsemax activation (Martins & Astudillo, 2016), applied independently
    to each cluster of neurons."""

    def __init__(self, num_clusters, num_neurons_per_cluster):
        super(Sparsemax, self).__init__()
        self.num_clusters = num_clusters
        self.num_neurons_per_cluster = num_neurons_per_cluster

    def forward(self, input):
        input_reshape = input.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        dim = 2
        # translate for numerical stability (sparsemax is invariant to shifts)
        input_shift = input_reshape  # - torch.max(input_reshape, dim)[0].expand_as(input_reshape)

        # sort each cluster's activations in descending order
        z_sorted = torch.sort(input_shift, dim=dim, descending=True)[0]
        input_size = input_shift.size()[dim]
        range_values = Variable(torch.arange(1, input_size + 1), requires_grad=False).type_as(input)
        range_values = range_values.expand_as(z_sorted)

        # determine the support of the projection:
        # sorted index k is in the support iff 1 + k * z_(k) > sum_{j <= k} z_(j)
        bound = 1 + range_values * z_sorted
        cumsum_zs = torch.cumsum(z_sorted, dim)
        is_gt = torch.gt(bound, cumsum_zs).type_as(input)
        valid = range_values * is_gt
        k_max = torch.max(valid, dim)[0]   # support size k(z)
        zs_sparse = is_gt * z_sorted       # sorted activations inside the support
        sum_zs = torch.sum(zs_sparse, dim) - 1
        taus = sum_zs / k_max              # threshold tau(z)
        taus_expanded = taus.expand_as(input_reshape)
        output = torch.clamp(input_shift - taus_expanded, min=0)
        self.output = output  # saved for the manual backward below
        return output.view(-1, self.num_clusters * self.num_neurons_per_cluster), zs_sparse, taus, is_gt

    def backward(self, grad_output):
        # Closed-form Jacobian-vector product of sparsemax:
        # (J v)_j = v_j - mean_{k in support} v_k if j is in the support, else 0.
        # Autograd already differentiates the operations used in forward(), so
        # loss.backward() never calls this method; it is kept for reference.
        output = self.output.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        grad_output = grad_output.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        dim = 2
        non_zeros = torch.ne(output, 0).type_as(output)
        mask_grad = non_zeros * grad_output
        sum_mask_grad = torch.sum(mask_grad, dim)
        l1_norm_non_zeros = torch.sum(non_zeros, dim)
        sum_v = sum_mask_grad / l1_norm_non_zeros
        self.gradInput = non_zeros * (grad_output - sum_v.expand_as(grad_output))
        self.gradInput = self.gradInput.view(-1, self.num_clusters * self.num_neurons_per_cluster)
        return self.gradInput


class MultiLabelSparseMaxLoss(nn.Module):
    """Sparsemax loss of Martins & Astudillo (2016), one term per cluster:
    L(z; q) = -q^T z + 0.5 * sum_{j in S(z)} (z_j^2 - tau(z)^2) + 0.5 * ||q||^2"""

    def __init__(self, num_clusters, num_neurons_per_cluster):
        super(MultiLabelSparseMaxLoss, self).__init__()
        self.num_clusters = num_clusters
        self.num_neurons_per_cluster = num_neurons_per_cluster

    def forward(self, input, zs_sparse, target, output_sparsemax, taus, is_gt):
        self.output_sparsemax = output_sparsemax
        input = input.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        self.target = target.view(-1, self.num_clusters, self.num_neurons_per_cluster)
        self.batch_size = input.size(0)
        dim = 2
        target_times_input = torch.sum(self.target * input, dim)
        target_inner_product = torch.sum(self.target * self.target, dim)
        zs_squared = zs_sparse * zs_sparse
        taus_squared = (taus * taus).expand_as(zs_squared)
        taus_squared = taus_squared * is_gt
        sum_input_taus = torch.sum(zs_squared - taus_squared, dim)
        sparsemax_loss = -target_times_input + 0.5 * sum_input_taus + 0.5 * target_inner_product
        sparsemax_loss = torch.sum(sparsemax_loss) / (self.batch_size * self.num_clusters)
        return sparsemax_loss

    def backward(self):
        # Gradient of the sparsemax loss w.r.t. the logits: sparsemax(z) - q.
        # As with Sparsemax.backward, autograd does not call this; kept for reference.
        grad_output = (self.output_sparsemax - self.target) / (self.batch_size * self.num_clusters)
        return grad_output
--------------------------------------------------------------------------------
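
# Softmax vs. sparsemax on the same logits (a sketch using the module above;
# F.softmax is called in the legacy, dim-free style this repo targets):
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from sparsemax import Sparsemax

z = Variable(torch.Tensor([[3.0, 1.0, 0.5, -1.0]]))
print(F.softmax(z))           # strictly positive everywhere, ~[0.81, 0.11, 0.07, 0.02]
print(Sparsemax(1, 4)(z)[0])  # exact zeros outside the support: [1.0, 0.0, 0.0, 0.0]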