├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── release.sh ├── setup.cfg ├── setup.py ├── subLSTM ├── __init__.py ├── functional │ ├── __init__.py │ └── cell.py └── nn │ ├── __init__.py │ ├── cell.py │ └── rnn.py ├── tasks └── word_language_model │ ├── README.md │ ├── data.py │ ├── data │ └── penn │ │ ├── test.txt │ │ ├── train.txt │ │ └── valid.txt │ ├── generate.py │ ├── main.py │ ├── model.py │ └── requirements.txt └── test ├── test_cell.py ├── test_function.py └── test_rnn.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | __pycache__/ 3 | .pypirc 4 | pred.txt 5 | multi-bleu.perl 6 | *.pt 7 | *.pyc 8 | #.* 9 | .idea 10 | *.sublime-* 11 | .DS_Store 12 | data/ 13 | build/ 14 | venv/ 15 | __pycache__/ 16 | *.lang 17 | *.log 18 | .cache/ 19 | dist/ 20 | dnc.egg-info/ 21 | tasks/checkpoints/ 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | # command to install dependencies 5 | install: 6 | - pip install http://download.pytorch.org/whl/cu75/torch-0.2.0.post3-cp36-cp36m-manylinux1_x86_64.whl 7 | - pip install numpy 8 | - pip install visdom 9 | # command to run tests 10 | script: 11 | - pytest -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Russi Chatterjee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # subtractive LSTM (subLSTM), for Pytorch 2 | 3 | [![Build Status](https://travis-ci.org/ixaxaar/pytorch-sublstm.svg?branch=master)](https://travis-ci.org/ixaxaar/pytorch-sublstm) [![PyPI version](https://badge.fury.io/py/pytorch-sublstm.svg)](https://badge.fury.io/py/pytorch-sublstm) 4 | 5 | This is an implementation of subLSTM described in the paper [Cortical microcircuits as gated-recurrent neural networks, Rui Ponte Costa et al.](https://arxiv.org/abs/1711.02448) 6 | 7 | ## Install 8 | 9 | ```bash 10 | pip install pytorch-sublstm 11 | ``` 12 | 13 | 14 | ## Usage 15 | 16 | **Parameters**: 17 | 18 | Following are the constructor parameters: 19 | 20 | | Argument | Default | Description | 21 | | --- | --- | --- | 22 | | input_size | `None` | Size of the input vectors | 23 | | hidden_size | `None` | Size of hidden units | 24 | | num_layers | `1` | Number of layers in the network | 25 | | bias | `True` | Bias | 26 | | batch_first | `False` | Whether data is fed batch first | 27 | | dropout | `0` | Dropout between layers in the network | 28 | | bidirectional | `False` | If the network is bidirectional | 29 | 30 | 31 | ### Example usage: 32 | 33 | #### nn Interface 34 | ```python 35 | import torch 36 | from torch.autograd import Variable 37 | from subLSTM.nn import SubLSTM 38 | 39 | hidden_size = 20 40 | input_size = 10 41 | seq_len = 5 42 | batch_size = 7 43 | hidden = None 44 | 45 | input = Variable(torch.randn(batch_size, seq_len, input_size)) 46 | 47 | rnn = SubLSTM(input_size, hidden_size, num_layers=2, bias=True, batch_first=True) 48 | 49 | # forward pass 50 | output, hidden = rnn(input, hidden) 51 | ``` 52 | 53 | #### Cell Interface 54 | 55 | ```python 56 | import torch 57 | from torch.autograd import Variable 58 | from subLSTM.nn import SubLSTMCell 59 | 60 | hidden_size = 20 61 | input_size = 10 62 | seq_len = 5 63 | batch_size = 7 64 | hidden = None 65 | 66 | hx = Variable(torch.randn(batch_size, hidden_size)) 67 | cx = Variable(torch.randn(batch_size, hidden_size)) 68 | 69 | input = Variable(torch.randn(batch_size, input_size)) 70 | 71 | cell = SubLSTMCell(input_size, hidden_size, bias=True) 72 | (hx, cx) = cell(input, (hx, cx)) 73 | ``` 74 | 75 | ### Tasks: 76 | 77 | A language modeling task is included [here](./tasks/word_language_model/). 78 | Refer to its [README](./tasks/word_language_model/README.md) for more info. 79 | 80 | 81 | ### Attributions: 82 | 83 | A lot of the code is recycled from [pytorch](https://pytorch.org) 84 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | rm -rf dist/ 4 | python3 setup.py sdist 5 | python3 setup.py bdist_wheel 6 | twine upload dist/* 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """A setuptools based setup module. 
4 | See: 5 | https://packaging.python.org/en/latest/distributing.html 6 | https://github.com/pypa/sampleproject 7 | """ 8 | 9 | # Always prefer setuptools over distutils 10 | from setuptools import setup, find_packages 11 | # To use a consistent encoding 12 | from codecs import open 13 | from os import path 14 | 15 | here = path.abspath(path.dirname(__file__)) 16 | 17 | # Get the long description from the README file 18 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 19 | long_description = f.read() 20 | 21 | setup( 22 | name='pytorch-sublstm', 23 | 24 | version='0.0.2', 25 | 26 | description='Differentiable Neural Computer, for Pytorch', 27 | long_description=long_description, 28 | 29 | # The project's main homepage. 30 | url='https://github.com/ixaxaar/pytorch-sublstm', 31 | 32 | # Author details 33 | author='Russi Chatterjee', 34 | author_email='root@ixaxaar.in', 35 | 36 | # Choose your license 37 | license='MIT', 38 | 39 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 40 | classifiers=[ 41 | 'Development Status :: 3 - Alpha', 42 | 43 | 'Intended Audience :: Science/Research', 44 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 45 | 46 | 'License :: OSI Approved :: MIT License', 47 | 48 | 'Programming Language :: Python :: 3', 49 | 'Programming Language :: Python :: 3.3', 50 | 'Programming Language :: Python :: 3.4', 51 | 'Programming Language :: Python :: 3.5', 52 | 'Programming Language :: Python :: 3.6', 53 | ], 54 | 55 | keywords='cortical microcircuit pytorch sublstm', 56 | 57 | packages=find_packages(exclude=['contrib', 'docs', 'tests', 'tasks']), 58 | 59 | install_requires=['torch', 'numpy'], 60 | 61 | extras_require={ 62 | 'dev': ['check-manifest'], 63 | 'test': ['coverage'], 64 | }, 65 | 66 | python_requires='>=3', 67 | ) 68 | -------------------------------------------------------------------------------- /subLSTM/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | -------------------------------------------------------------------------------- /subLSTM/functional/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from .cell import SubLSTMCell 4 | -------------------------------------------------------------------------------- /subLSTM/functional/cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch.nn as nn 4 | import torch as T 5 | import torch.nn.functional as F 6 | 7 | 8 | def SubLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): 9 | 10 | hx, cx = hidden 11 | gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh) 12 | 13 | ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) 14 | 15 | ingate = F.sigmoid(ingate) 16 | forgetgate = F.sigmoid(forgetgate) 17 | cellgate = F.sigmoid(cellgate) 18 | outgate = F.sigmoid(outgate) 19 | 20 | cy = (forgetgate * cx) + (cellgate - ingate) 21 | hy = F.sigmoid(cy) - outgate 22 | 23 | return hy, cy 24 | -------------------------------------------------------------------------------- /subLSTM/nn/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from .cell import SubLSTMCell 4 | from .rnn import SubLSTM 5 | -------------------------------------------------------------------------------- /subLSTM/nn/cell.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch.nn as nn 4 | import torch as T 5 | import torch.nn.functional as F 6 | 7 | from torch.nn.modules.rnn import RNNCellBase 8 | from subLSTM.functional import SubLSTMCell as SubLSTMCellF 9 | 10 | import math 11 | 12 | 13 | class SubLSTMCell(RNNCellBase): 14 | r"""A long sub-short-term memory (subLSTM) cell, as described in the paper: 15 | https://arxiv.org/abs/1711.02448 16 | 17 | .. math:: 18 | 19 | \begin{array}{ll} 20 | i = \mathrm{sigmoid}(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ 21 | f = \mathrm{sigmoid}(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ 22 | g = \mathrm{sigmoid}(W_{ig} x + b_{ig} + W_{hc} h + b_{hg}) \\ 23 | o = \mathrm{sigmoid}(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ 24 | c' = f * c + g - i \\ 25 | h' = \mathrm{sigmoid}(c') - o \\ 26 | \end{array} 27 | 28 | Args: 29 | input_size: The number of expected features in the input x 30 | hidden_size: The number of features in the hidden state h 31 | bias: If `False`, then the layer does not use bias weights `b_ih` and 32 | `b_hh`. Default: True 33 | 34 | Inputs: input, (h_0, c_0) 35 | - **input** (batch, input_size): tensor containing input features 36 | - **h_0** (batch, hidden_size): tensor containing the initial hidden 37 | state for each element in the batch. 38 | - **c_0** (batch. hidden_size): tensor containing the initial cell state 39 | for each element in the batch. 40 | 41 | Outputs: h_1, c_1 42 | - **h_1** (batch, hidden_size): tensor containing the next hidden state 43 | for each element in the batch 44 | - **c_1** (batch, hidden_size): tensor containing the next cell state 45 | for each element in the batch 46 | 47 | Attributes: 48 | weight_ih: the learnable input-hidden weights, of shape 49 | `(4*hidden_size x input_size)` 50 | weight_hh: the learnable hidden-hidden weights, of shape 51 | `(4*hidden_size x hidden_size)` 52 | bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)` 53 | bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)` 54 | 55 | Examples:: 56 | 57 | >>> rnn = nn.SubLSTMCell(10, 20) 58 | >>> input = Variable(torch.randn(6, 3, 10)) 59 | >>> hx = Variable(torch.randn(3, 20)) 60 | >>> cx = Variable(torch.randn(3, 20)) 61 | >>> output = [] 62 | >>> for i in range(6): 63 | ... hx, cx = rnn(input[i], (hx, cx)) 64 | ... 
output.append(hx) 65 | """ 66 | 67 | def __init__(self, input_size, hidden_size, bias=True): 68 | super(SubLSTMCell, self).__init__() 69 | self.input_size = input_size 70 | self.hidden_size = hidden_size 71 | self.bias = bias 72 | self.weight_ih = nn.Parameter(T.Tensor(4 * hidden_size, input_size)) 73 | self.weight_hh = nn.Parameter(T.Tensor(4 * hidden_size, hidden_size)) 74 | if bias: 75 | self.bias_ih = nn.Parameter(T.Tensor(4 * hidden_size)) 76 | self.bias_hh = nn.Parameter(T.Tensor(4 * hidden_size)) 77 | else: 78 | self.register_parameter('bias_ih', None) 79 | self.register_parameter('bias_hh', None) 80 | self.reset_parameters() 81 | 82 | def reset_parameters(self): 83 | stdv = 1.0 / math.sqrt(self.hidden_size) 84 | for weight in self.parameters(): 85 | weight.data.uniform_(-stdv, stdv) 86 | 87 | def forward(self, input, hx): 88 | return SubLSTMCellF( 89 | input, hx, 90 | self.weight_ih, self.weight_hh, 91 | self.bias_ih, self.bias_hh, 92 | ) 93 | -------------------------------------------------------------------------------- /subLSTM/nn/rnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch.nn as nn 4 | import torch as T 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable as var 7 | from torch.nn import Module 8 | 9 | from torch.nn.modules.rnn import RNNCellBase 10 | from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence as pack, pad_packed_sequence as pad 11 | 12 | from subLSTM.functional import SubLSTMCell as SubLSTMCellF 13 | 14 | import math 15 | 16 | 17 | class SubLSTM(Module): 18 | 19 | def __init__( 20 | self, 21 | input_size, 22 | hidden_size, 23 | num_layers=1, 24 | bias=True, 25 | batch_first=False, 26 | dropout=0, 27 | bidirectional=False 28 | ): 29 | super(SubLSTM, self).__init__() 30 | self.input_size = input_size 31 | self.hidden_size = hidden_size 32 | self.num_layers = num_layers 33 | self.bias = bias 34 | self.batch_first = batch_first 35 | self.dropout = dropout 36 | self.dropout_state = {} 37 | self.bidirectional = bidirectional 38 | num_directions = 2 if bidirectional else 1 39 | 40 | gate_size = 4 * hidden_size 41 | 42 | self._all_weights = [] 43 | for layer in range(num_layers): 44 | for direction in range(num_directions): 45 | layer_input_size = input_size if layer == 0 else hidden_size * num_directions 46 | 47 | w_ih = nn.Parameter(T.Tensor(gate_size, layer_input_size)) 48 | w_hh = nn.Parameter(T.Tensor(gate_size, hidden_size)) 49 | b_ih = nn.Parameter(T.Tensor(gate_size)) 50 | b_hh = nn.Parameter(T.Tensor(gate_size)) 51 | layer_params = (w_ih, w_hh, b_ih, b_hh) 52 | 53 | suffix = '_reverse' if direction == 1 else '' 54 | param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}'] 55 | if bias: 56 | param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}'] 57 | param_names = [x.format(layer, suffix) for x in param_names] 58 | 59 | for name, param in zip(param_names, layer_params): 60 | setattr(self, name, param) 61 | self._all_weights.append(param_names) 62 | 63 | self.flatten_parameters() 64 | self.reset_parameters() 65 | 66 | def flatten_parameters(self): 67 | pass 68 | 69 | def _apply(self, fn): 70 | ret = super(SubLSTM, self)._apply(fn) 71 | self.flatten_parameters() 72 | return ret 73 | 74 | def reset_parameters(self): 75 | stdv = 1.0 / math.sqrt(self.hidden_size) 76 | for weight in self.parameters(): 77 | weight.data.uniform_(-stdv, stdv) 78 | 79 | def forward(self, input, hx=None): 80 | timesteps = input.size(1) if self.batch_first else input.size(0) 
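        # `input` is (seq_len, batch, input_size) unless batch_first=True, in which case it is
        # (batch, seq_len, input_size). When a hidden state is passed in, it uses the layout
        # built below: a list indexed [layer][direction] whose entries are (h, c) tuples of
        # (batch, hidden_size) tensors, exactly as returned by a previous call to forward.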
81 | directions = 2 if self.bidirectional else 1 82 | is_packed = isinstance(input, PackedSequence) 83 | 84 | if is_packed: 85 | input, batch_sizes = pad(input) 86 | max_batch_size = batch_sizes[0] 87 | else: 88 | batch_sizes = None 89 | max_batch_size = input.size(0) if self.batch_first else input.size(1) 90 | 91 | # layer * direction 92 | if hx is None: 93 | num_directions = 2 if self.bidirectional else 1 94 | hx = var(input.data.new(max_batch_size, self.hidden_size).zero_(), requires_grad=False) 95 | hx = (hx, hx) 96 | hx = [[hx for x in range(directions)] for d in range(self.num_layers)] 97 | 98 | # make weights indexable with layer -> direction 99 | ws = self.all_weights 100 | if directions == 1: 101 | ws = [ [w] for w in ws ] 102 | else: 103 | ws = [ [ws[l*2], ws[l*2+1]] for l in range(self.num_layers) ] 104 | 105 | # make input batch-first, separate into timeslice wise chunks 106 | input = input if self.batch_first else input.transpose(0, 1) 107 | os = [[input[:, i, :] for i in range(timesteps)] for d in range(directions)] 108 | if directions > 1: 109 | os[1].reverse() 110 | 111 | for time in range(timesteps): 112 | for layer in range(self.num_layers): 113 | for direction in range(directions): 114 | 115 | if self.bias: 116 | (w_ih, w_hh, b_ih, b_hh) = ws[layer][direction] 117 | else: 118 | (w_ih, w_hh) = ws[layer][direction] 119 | b_ih = None 120 | b_hh = None 121 | 122 | hy, cy = SubLSTMCellF(os[direction][time], hx[layer][direction], w_ih, w_hh, b_ih, b_hh) 123 | hx[layer][direction] = (hy, cy) 124 | os[direction][time] = hy 125 | 126 | if directions > 1: 127 | os[0][time] = T.cat([ os[d][time] for d in range(directions) ], -1) 128 | os[1][time] = os[0][time] 129 | 130 | output = T.stack([T.stack(o, 1) for o in os]) 131 | output = T.cat(output, -1) if self.bidirectional else output[0] 132 | output = output if self.batch_first else output.transpose(0, 1) 133 | 134 | if is_packed: 135 | output = pack(output, batch_sizes) 136 | return output, hx 137 | 138 | def __repr__(self): 139 | s = '{name}({input_size}, {hidden_size}' 140 | if self.num_layers != 1: 141 | s += ', num_layers={num_layers}' 142 | if self.bias is not True: 143 | s += ', bias={bias}' 144 | if self.batch_first is not False: 145 | s += ', batch_first={batch_first}' 146 | if self.dropout != 0: 147 | s += ', dropout={dropout}' 148 | if self.bidirectional is not False: 149 | s += ', bidirectional={bidirectional}' 150 | s += ')' 151 | return s.format(name=self.__class__.__name__, **self.__dict__) 152 | 153 | def __setstate__(self, d): 154 | super(SubLSTM, self).__setstate__(d) 155 | self.__dict__.setdefault('_data_ptrs', []) 156 | if 'all_weights' in d: 157 | self._all_weights = d['all_weights'] 158 | if isinstance(self._all_weights[0][0], str): 159 | return 160 | num_layers = self.num_layers 161 | num_directions = 2 if self.bidirectional else 1 162 | self._all_weights = [] 163 | for layer in range(num_layers): 164 | for direction in range(num_directions): 165 | suffix = '_reverse' if direction == 1 else '' 166 | weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', 'bias_hh_l{}{}'] 167 | weights = [x.format(layer, suffix) for x in weights] 168 | if self.bias: 169 | self._all_weights += [weights] 170 | else: 171 | self._all_weights += [weights[:2]] 172 | 173 | @property 174 | def all_weights(self): 175 | return [[getattr(self, weight) for weight in weights] for weights in self._all_weights] 176 | -------------------------------------------------------------------------------- 
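A note on the hidden state returned by `SubLSTM.forward` above: unlike `torch.nn.LSTM`, which stacks the final states into a pair of tensors, this implementation returns a nested Python list indexed by layer and then direction, each entry holding an `(h, c)` tuple (the tests index it as `hx[-1][-1][0]`). A minimal sketch of reading that structure back out, with arbitrarily chosen sizes:

```python
import torch as T
from torch.autograd import Variable
from subLSTM.nn import SubLSTM

batch_size, seq_len, input_size, hidden_size = 7, 5, 10, 20

x = Variable(T.randn(batch_size, seq_len, input_size))
rnn = SubLSTM(input_size, hidden_size, num_layers=2, batch_first=True)

output, hidden = rnn(x, None)   # output is (batch_size, seq_len, hidden_size)

# hidden[layer][direction] is an (h, c) tuple
for layer, directions in enumerate(hidden):
    for direction, (h, c) in enumerate(directions):
        # each of h and c is (batch_size, hidden_size)
        print(layer, direction, h.size(), c.size())
```

The same indexing applies in the bidirectional case, where each layer holds two `(h, c)` entries (forward and reverse).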
/tasks/word_language_model/README.md:
--------------------------------------------------------------------------------
 1 | # Word-level language modeling RNN
 2 | 
 3 | This example trains a multi-layer RNN (Elman, GRU, LSTM or subLSTM) on a language modeling task.
 4 | By default, the training script uses the provided PTB (Penn Treebank) dataset.
 5 | The trained model can then be used by the generate script to generate new text.
 6 | 
 7 | ```bash
 8 | # Train a subLSTM on PTB with CUDA, reaching a perplexity of 136.90 (15 epochs)
 9 | python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 100 --lr 0.001 --optim adam
10 | # Generate samples from the trained subLSTM model.
11 | python generate.py
12 | ```
13 | 
14 | The model uses the `nn.RNN` module (and its sister modules `nn.GRU`, `nn.LSTM` and `subLSTM.nn.SubLSTM`),
15 | which will automatically use the cuDNN backend if run on CUDA with cuDNN installed.
16 | 
17 | During training, if a keyboard interrupt (Ctrl-C) is received,
18 | training is stopped and the current model is evaluated against the test dataset.
19 | 
20 | The `main.py` script accepts the following arguments:
21 | 
22 | ```bash
23 | optional arguments:
24 |   -h, --help         show this help message and exit
25 |   --data DATA        location of the data corpus
26 |   --model MODEL      type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU,
27 |                      subLSTM)
28 |   --emsize EMSIZE    size of word embeddings
29 |   --nhid NHID        number of hidden units per layer
30 |   --nlayers NLAYERS  number of layers
31 |   --lr LR            initial learning rate
32 |   --clip CLIP        gradient clipping
33 |   --optim OPTIM      learning rule, supports
34 |                      adam|sparseadam|adamax|rmsprop|sgd|adagrad|adadelta
35 |   --epochs EPOCHS    upper epoch limit
36 |   --batch_size N     batch size
37 |   --bptt BPTT        sequence length
38 |   --dropout DROPOUT  dropout applied to layers (0 = no dropout)
39 |   --tied             tie the word embedding and softmax weights
40 |   --seed SEED        random seed
41 |   --cuda             use CUDA
42 |   --log-interval N   report interval
43 |   --save SAVE        path to save the final model
44 | ```
45 | 
--------------------------------------------------------------------------------
/tasks/word_language_model/data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | 
 4 | class Dictionary(object):
 5 |     def __init__(self):
 6 |         self.word2idx = {}
 7 |         self.idx2word = []
 8 | 
 9 |     def add_word(self, word):
10 |         if word not in self.word2idx:
11 |             self.idx2word.append(word)
12 |             self.word2idx[word] = len(self.idx2word) - 1
13 |         return self.word2idx[word]
14 | 
15 |     def __len__(self):
16 |         return len(self.idx2word)
17 | 
18 | 
19 | class Corpus(object):
20 |     def __init__(self, path):
21 |         self.dictionary = Dictionary()
22 |         self.train = self.tokenize(os.path.join(path, 'train.txt'))
23 |         self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
24 |         self.test = self.tokenize(os.path.join(path, 'test.txt'))
25 | 
26 |     def tokenize(self, path):
27 |         """Tokenizes a text file."""
28 |         assert os.path.exists(path)
29 |         # Add words to the dictionary
30 |         with open(path, 'r') as f:
31 |             tokens = 0
32 |             for line in f:
33 |                 words = line.split() + ['<eos>']
34 |                 tokens += len(words)
35 |                 for word in words:
36 |                     self.dictionary.add_word(word)
37 | 
38 |         # Tokenize file content
39 |         with open(path, 'r') as f:
40 |             ids = torch.LongTensor(tokens)
41 |             token = 0
42 |             for line in f:
43 |                 words = line.split() + ['<eos>']
44 |                 for word in words:
45 |                     ids[token] = self.dictionary.word2idx[word]
46 |                     token += 1
47 | 
48 |         return ids
49 | 
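# A minimal usage sketch (this mirrors how main.py consumes the module; './data/penn'
# is the PTB data shipped with this task):
#
#   corpus = Corpus('./data/penn')      # builds the vocabulary from train/valid/test
#   ntokens = len(corpus.dictionary)    # vocabulary size, used to size the embedding
#   ids = corpus.train                  # 1-D LongTensor of word ids, one per token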
-------------------------------------------------------------------------------- /tasks/word_language_model/generate.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Language Modeling on Penn Tree Bank 3 | # 4 | # This file generates new sentences sampled from the language model 5 | # 6 | ############################################################################### 7 | 8 | import argparse 9 | 10 | import torch 11 | from torch.autograd import Variable 12 | 13 | import data 14 | 15 | parser = argparse.ArgumentParser(description='PyTorch PTB Language Model') 16 | 17 | # Model parameters. 18 | parser.add_argument('--data', type=str, default='./data/penn', 19 | help='location of the data corpus') 20 | parser.add_argument('--checkpoint', type=str, default='./model.pt', 21 | help='model checkpoint to use') 22 | parser.add_argument('--outf', type=str, default='generated.txt', 23 | help='output file for generated text') 24 | parser.add_argument('--words', type=int, default='1000', 25 | help='number of words to generate') 26 | parser.add_argument('--seed', type=int, default=1111, 27 | help='random seed') 28 | parser.add_argument('--cuda', action='store_true', 29 | help='use CUDA') 30 | parser.add_argument('--temperature', type=float, default=1.0, 31 | help='temperature - higher will increase diversity') 32 | parser.add_argument('--log-interval', type=int, default=100, 33 | help='reporting interval') 34 | args = parser.parse_args() 35 | 36 | # Set the random seed manually for reproducibility. 37 | torch.manual_seed(args.seed) 38 | if torch.cuda.is_available(): 39 | if not args.cuda: 40 | print("WARNING: You have a CUDA device, so you should probably run with --cuda") 41 | else: 42 | torch.cuda.manual_seed(args.seed) 43 | 44 | if args.temperature < 1e-3: 45 | parser.error("--temperature has to be greater or equal 1e-3") 46 | 47 | with open(args.checkpoint, 'rb') as f: 48 | model = torch.load(f) 49 | model.eval() 50 | 51 | if args.cuda: 52 | model.cuda() 53 | else: 54 | model.cpu() 55 | 56 | corpus = data.Corpus(args.data) 57 | ntokens = len(corpus.dictionary) 58 | hidden = model.init_hidden(1) 59 | input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True) 60 | if args.cuda: 61 | input.data = input.data.cuda() 62 | 63 | with open(args.outf, 'w') as outf: 64 | for i in range(args.words): 65 | output, hidden = model(input, hidden) 66 | word_weights = output.squeeze().data.div(args.temperature).exp().cpu() 67 | word_idx = torch.multinomial(word_weights, 1)[0] 68 | input.data.fill_(word_idx) 69 | word = corpus.dictionary.idx2word[word_idx] 70 | 71 | outf.write(word + ('\n' if i % 20 == 19 else ' ')) 72 | 73 | if i % args.log_interval == 0: 74 | print('| Generated {}/{} words'.format(i, args.words)) 75 | -------------------------------------------------------------------------------- /tasks/word_language_model/main.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import argparse 3 | import time 4 | import math 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.autograd import Variable 9 | 10 | import data 11 | import model 12 | 13 | parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model') 14 | parser.add_argument('--data', type=str, default='./data/penn', 15 | help='location of the data corpus') 16 | parser.add_argument('--model', 
type=str, default='subLSTM', 17 | help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, subLSTM)') 18 | parser.add_argument('--emsize', type=int, default=200, 19 | help='size of word embeddings') 20 | parser.add_argument('--nhid', type=int, default=200, 21 | help='number of hidden units per layer') 22 | parser.add_argument('--nlayers', type=int, default=2, 23 | help='number of layers') 24 | parser.add_argument('--lr', type=float, default=0.0001, 25 | help='initial learning rate') 26 | parser.add_argument('--clip', type=float, default=0.25, 27 | help='gradient clipping') 28 | parser.add_argument('--optim', type=str, default='rmsprop', 29 | help='learning rule, supports adam|sparseadam|adamax|rmsprop|sgd|adagrad|adadelta') 30 | parser.add_argument('--epochs', type=int, default=40, 31 | help='upper epoch limit') 32 | parser.add_argument('--batch_size', type=int, default=20, metavar='N', 33 | help='batch size') 34 | parser.add_argument('--bptt', type=int, default=35, 35 | help='sequence length') 36 | parser.add_argument('--dropout', type=float, default=0.5, 37 | help='dropout applied to layers (0 = no dropout)') 38 | parser.add_argument('--tied', action='store_true', 39 | help='tie the word embedding and softmax weights') 40 | parser.add_argument('--seed', type=int, default=1111, 41 | help='random seed') 42 | parser.add_argument('--cuda', action='store_true', 43 | help='use CUDA') 44 | parser.add_argument('--log-interval', type=int, default=200, metavar='N', 45 | help='report interval') 46 | parser.add_argument('--save', type=str, default='model.pt', 47 | help='path to save the final model') 48 | args = parser.parse_args() 49 | # Set the random seed manually for reproducibility. 50 | torch.manual_seed(args.seed) 51 | if torch.cuda.is_available(): 52 | if not args.cuda: 53 | print("WARNING: You have a CUDA device, so you should probably run with --cuda") 54 | else: 55 | torch.cuda.manual_seed(args.seed) 56 | 57 | ############################################################################### 58 | # Load data 59 | ############################################################################### 60 | 61 | corpus = data.Corpus(args.data) 62 | 63 | # Starting from sequential data, batchify arranges the dataset into columns. 64 | # For instance, with the alphabet as the sequence and batch size 4, we'd get 65 | # ┌ a g m s ┐ 66 | # │ b h n t │ 67 | # │ c i o u │ 68 | # │ d j p v │ 69 | # │ e k q w │ 70 | # └ f l r x ┘. 71 | # These columns are treated as independent by the model, which means that the 72 | # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient 73 | # batch processing. 74 | 75 | def batchify(data, bsz): 76 | # Work out how cleanly we can divide the dataset into bsz parts. 77 | nbatch = data.size(0) // bsz 78 | # Trim off any extra elements that wouldn't cleanly fit (remainders). 79 | data = data.narrow(0, 0, nbatch * bsz) 80 | # Evenly divide the data across the bsz batches. 
 81 |     data = data.view(bsz, -1).t().contiguous()
 82 |     if args.cuda:
 83 |         data = data.cuda()
 84 |     return data
 85 | 
 86 | eval_batch_size = 10
 87 | train_data = batchify(corpus.train, args.batch_size)
 88 | val_data = batchify(corpus.valid, eval_batch_size)
 89 | test_data = batchify(corpus.test, eval_batch_size)
 90 | 
 91 | ###############################################################################
 92 | # Build the model
 93 | ###############################################################################
 94 | 
 95 | ntokens = len(corpus.dictionary)
 96 | model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
 97 | if args.cuda:
 98 |     model.cuda()
 99 | 
100 | criterion = nn.CrossEntropyLoss()
101 | 
102 | ###############################################################################
103 | # Training code
104 | ###############################################################################
105 | 
106 | def repackage_hidden(h):
107 |     """Wraps hidden states in new Variables, to detach them from their history."""
108 |     if h is None:
109 |         return None
110 |     if type(h) == Variable:
111 |         return Variable(h.data)
112 |     elif type(h) == list:
113 |         return [ repackage_hidden(x) for x in h ]
114 |     elif type(h) == tuple:
115 |         return tuple([ repackage_hidden(x) for x in h ])
116 |     else:
117 |         return tuple(repackage_hidden(v) for v in h)
118 | 
119 | 
120 | # get_batch subdivides the source data into chunks of length args.bptt.
121 | # If source is equal to the example output of the batchify function, with
122 | # a bptt-limit of 2, we'd get the following two Variables for i = 0:
123 | # ┌ a g m s ┐ ┌ b h n t ┐
124 | # └ b h n t ┘ └ c i o u ┘
125 | # Note that despite the name of the function, the subdivision of data is not
126 | # done along the batch dimension (i.e. dimension 1), since that was handled
127 | # by the batchify function. The chunks are along dimension 0, corresponding
128 | # to the seq_len dimension in the LSTM.
129 | 
130 | def get_batch(source, i, evaluation=False):
131 |     seq_len = min(args.bptt, len(source) - 1 - i)
132 |     data = Variable(source[i:i+seq_len], volatile=evaluation)
133 |     target = Variable(source[i+1:i+1+seq_len].view(-1))
134 |     return data, target
135 | 
136 | 
137 | def evaluate(data_source):
138 |     # Turn on evaluation mode which disables dropout.
139 | model.eval() 140 | total_loss = 0 141 | ntokens = len(corpus.dictionary) 142 | hidden = model.init_hidden(eval_batch_size) 143 | for i in range(0, data_source.size(0) - 1, args.bptt): 144 | data, targets = get_batch(data_source, i, evaluation=True) 145 | output, hidden = model(data, hidden) 146 | output_flat = output.view(-1, ntokens) 147 | total_loss += len(data) * criterion(output_flat, targets).data 148 | hidden = repackage_hidden(hidden) 149 | return total_loss[0] / len(data_source) 150 | 151 | if args.optim == 'adam': 152 | optimizer = optim.Adam(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 153 | if args.optim == 'sparseadam': 154 | optimizer = optim.SparseAdam(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 155 | if args.optim == 'adamax': 156 | optimizer = optim.Adamax(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 157 | elif args.optim == 'rmsprop': 158 | optimizer = optim.RMSprop(model.parameters(), lr=args.lr, eps=1e-10) # 0.0001 159 | elif args.optim == 'sgd': 160 | optimizer = optim.SGD(model.parameters(), lr=args.lr) # 0.01 161 | elif args.optim == 'adagrad': 162 | optimizer = optim.Adagrad(model.parameters(), lr=args.lr) 163 | elif args.optim == 'adadelta': 164 | optimizer = optim.Adadelta(model.parameters(), lr=args.lr) 165 | 166 | def train(): 167 | # Turn on training mode which enables dropout. 168 | model.train() 169 | total_loss = 0 170 | start_time = time.time() 171 | ntokens = len(corpus.dictionary) 172 | hidden = model.init_hidden(args.batch_size) 173 | for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)): 174 | data, targets = get_batch(train_data, i) 175 | # Starting each batch, we detach the hidden state from how it was previously produced. 176 | # If we didn't, the model would try backpropagating all the way to start of the dataset. 177 | hidden = repackage_hidden(hidden) 178 | optimizer.zero_grad() 179 | output, hidden = model(data, hidden) 180 | loss = criterion(output.view(-1, ntokens), targets) 181 | loss.backward() 182 | 183 | # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. 184 | torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) 185 | optimizer.step() 186 | 187 | total_loss += loss.data 188 | 189 | if batch % args.log_interval == 0 and batch > 0: 190 | cur_loss = total_loss[0] / args.log_interval 191 | elapsed = time.time() - start_time 192 | print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | ' 193 | 'loss {:5.2f} | ppl {:8.2f}'.format( 194 | epoch, batch, len(train_data) // args.bptt, lr, 195 | elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) 196 | total_loss = 0 197 | start_time = time.time() 198 | 199 | # Loop over epochs. 200 | lr = args.lr 201 | best_val_loss = None 202 | 203 | # At any point you can hit Ctrl + C to break out of training early. 204 | try: 205 | for epoch in range(1, args.epochs+1): 206 | epoch_start_time = time.time() 207 | train() 208 | val_loss = evaluate(val_data) 209 | print('-' * 89) 210 | print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 211 | 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), 212 | val_loss, math.exp(val_loss))) 213 | print('-' * 89) 214 | # Save the model if the validation loss is the best we've seen so far. 
215 | if not best_val_loss or val_loss < best_val_loss: 216 | with open(args.save, 'wb') as f: 217 | torch.save(model, f) 218 | best_val_loss = val_loss 219 | else: 220 | # Anneal the learning rate if no improvement has been seen in the validation dataset. 221 | lr /= 4.0 222 | except KeyboardInterrupt: 223 | print('-' * 89) 224 | print('Exiting from training early') 225 | 226 | # Load the best saved model. 227 | with open(args.save, 'rb') as f: 228 | model = torch.load(f) 229 | 230 | # Run on test data. 231 | test_loss = evaluate(test_data) 232 | print('=' * 89) 233 | print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( 234 | test_loss, math.exp(test_loss))) 235 | print('=' * 89) 236 | -------------------------------------------------------------------------------- /tasks/word_language_model/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.autograd import Variable 3 | 4 | from subLSTM.nn import SubLSTM 5 | 6 | 7 | class RNNModel(nn.Module): 8 | """Container module with an encoder, a recurrent module, and a decoder.""" 9 | 10 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): 11 | super(RNNModel, self).__init__() 12 | self.drop = nn.Dropout(dropout) 13 | self.encoder = nn.Embedding(ntoken, ninp) 14 | if rnn_type in ['LSTM', 'GRU']: 15 | self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) 16 | elif rnn_type == 'subLSTM': 17 | self.rnn = SubLSTM(ninp, nhid, nlayers, dropout=dropout) 18 | else: 19 | try: 20 | nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] 21 | except KeyError: 22 | raise ValueError( """An invalid option for `--model` was supplied, 23 | options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") 24 | self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) 25 | self.decoder = nn.Linear(nhid, ntoken) 26 | 27 | # Optionally tie weights as in: 28 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) 29 | # https://arxiv.org/abs/1608.05859 30 | # and 31 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 
2016) 32 | # https://arxiv.org/abs/1611.01462 33 | if tie_weights: 34 | if nhid != ninp: 35 | raise ValueError('When using the tied flag, nhid must be equal to emsize') 36 | self.decoder.weight = self.encoder.weight 37 | 38 | self.init_weights() 39 | 40 | self.rnn_type = rnn_type 41 | self.nhid = nhid 42 | self.nlayers = nlayers 43 | 44 | def init_weights(self): 45 | initrange = 0.1 46 | self.encoder.weight.data.uniform_(-initrange, initrange) 47 | self.decoder.bias.data.fill_(0) 48 | self.decoder.weight.data.uniform_(-initrange, initrange) 49 | 50 | def forward(self, input, hidden): 51 | emb = self.drop(self.encoder(input)) 52 | output, hidden = self.rnn(emb, hidden) 53 | output = self.drop(output) 54 | decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2))) 55 | return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden 56 | 57 | def init_hidden(self, bsz): 58 | weight = next(self.parameters()).data 59 | if self.rnn_type == 'LSTM': 60 | return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()), 61 | Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())) 62 | elif self.rnn_type == 'subLSTM': 63 | return None 64 | else: 65 | return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()) 66 | -------------------------------------------------------------------------------- /tasks/word_language_model/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | -------------------------------------------------------------------------------- /test/test_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pytest 4 | import numpy as np 5 | 6 | import torch.nn as nn 7 | import torch as T 8 | from torch.autograd import Variable as var 9 | import torch.nn.functional as F 10 | from torch.nn.utils import clip_grad_norm 11 | import torch.optim as optim 12 | import numpy as np 13 | 14 | import sys 15 | import os 16 | import math 17 | import time 18 | sys.path.insert(0, '.') 19 | 20 | from subLSTM.functional import SubLSTMCell as SubLSTMCellF 21 | from subLSTM.nn import SubLSTMCell 22 | 23 | 24 | def test_cell(): 25 | hidden_size = 20 26 | input_size = 10 27 | 28 | for bias in (True, False): 29 | input = var(T.randn(3, input_size)) 30 | hx = var(T.randn(3, hidden_size)) 31 | cx = var(T.randn(3, hidden_size)) 32 | 33 | cell = SubLSTMCell(input_size, hidden_size, bias=bias) 34 | 35 | for i in range(6): 36 | (hx, cx) = cell(input, (hx, cx)) 37 | 38 | hx.sum().backward() 39 | assert hx.size() == T.Size([3, hidden_size]) 40 | assert cx.size() == T.Size([3, hidden_size]) 41 | -------------------------------------------------------------------------------- /test/test_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pytest 4 | import numpy as np 5 | 6 | import torch.nn as nn 7 | import torch as T 8 | from torch.autograd import Variable as var 9 | import torch.nn.functional as F 10 | from torch.nn.utils import clip_grad_norm 11 | import torch.optim as optim 12 | import numpy as np 13 | 14 | import sys 15 | import os 16 | import math 17 | import time 18 | sys.path.insert(0, '.') 19 | 20 | from subLSTM.functional import SubLSTMCell as SubLSTMCellF 21 | from subLSTM.nn import SubLSTMCell 22 | 23 | 24 | def test_function(): 25 | hidden_size = 20 26 | input_size = 10 27 | 28 | for bias in (True, False): 29 | weight_ih = 
T.nn.Parameter(T.Tensor(4 * hidden_size, input_size))
30 |         weight_hh = T.nn.Parameter(T.Tensor(4 * hidden_size, hidden_size))
31 |         bias_ih = T.nn.Parameter(T.Tensor(4 * hidden_size)) if bias else None
32 |         bias_hh = T.nn.Parameter(T.Tensor(4 * hidden_size)) if bias else None
33 | 
34 |         input = var(T.randn(3, input_size))
35 |         hx = var(T.randn(3, hidden_size))
36 |         cx = var(T.randn(3, hidden_size))
37 |         cell = SubLSTMCellF
38 |         for i in range(6):
39 |             hx, cx = cell(input, (hx, cx), weight_ih, weight_hh, bias_ih, bias_hh)
40 | 
41 |         hx.sum().backward()
42 | 
43 |         assert hx.size() == T.Size([3, hidden_size])
44 |         assert cx.size() == T.Size([3, hidden_size])
45 | 
--------------------------------------------------------------------------------
/test/test_rnn.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import pytest
 4 | import numpy as np
 5 | 
 6 | import torch.nn as nn
 7 | import torch as T
 8 | from torch.autograd import Variable as var
 9 | import torch.nn.functional as F
10 | from torch.nn.utils import clip_grad_norm
11 | import torch.optim as optim
12 | import numpy as np
13 | 
14 | import sys
15 | import os
16 | import math
17 | import time
18 | sys.path.insert(0, '.')
19 | 
20 | from subLSTM.nn import SubLSTM
21 | 
22 | 
23 | def test_rnn():
24 |     hidden_size = 20
25 |     input_size = 10
26 |     seq_len = 5
27 |     batch_size = 7
28 | 
29 |     for bias in (True, False):
30 |         for batch_first in (True, False):
31 |             input = var(T.randn(batch_size, seq_len, input_size)) if batch_first else var(T.randn(seq_len, batch_size, input_size))
32 |             hx = None
33 |             rnn = SubLSTM(input_size, hidden_size, num_layers=2, bias=bias, batch_first=batch_first)
34 | 
35 |             outputs = []
36 |             for i in range(6):
37 |                 output, hx = rnn(input, hx)
38 |                 outputs.append(output)
39 | 
40 |             T.stack(outputs).sum().backward()
41 | 
42 |             assert hx[-1][-1][0].size() == T.Size([batch_size, hidden_size])
43 |             assert hx[-1][-1][1].size() == T.Size([batch_size, hidden_size])
44 | 
45 | 
46 | 
47 | def test_rnn_bidirectional():
48 |     hidden_size = 20
49 |     input_size = 10
50 |     seq_len = 5
51 |     batch_size = 7
52 | 
53 |     for bias in (True, False):
54 |         for batch_first in (True, False):
55 |             input = var(T.randn(batch_size, seq_len, input_size)) if batch_first else var(T.randn(seq_len, batch_size, input_size))
56 |             hx = None
57 |             rnn = SubLSTM(input_size, hidden_size, num_layers=3, bias=bias, batch_first=batch_first, bidirectional=True)
58 | 
59 |             outputs = []
60 |             for i in range(6):
61 |                 output, hx = rnn(input, hx)
62 |                 outputs.append(output)
63 | 
64 |             T.stack(outputs).sum().backward()
65 | 
66 |             assert hx[-1][-1][0].size() == T.Size([batch_size, hidden_size])
67 |             assert hx[-1][-1][1].size() == T.Size([batch_size, hidden_size])
68 | 
69 | 
--------------------------------------------------------------------------------