├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── release.sh ├── setup.cfg ├── setup.py ├── subLSTM ├── __init__.py ├── functional │ ├── __init__.py │ └── cell.py └── nn │ ├── __init__.py │ ├── cell.py │ └── rnn.py ├── tasks └── word_language_model │ ├── README.md │ ├── data.py │ ├── data │ └── penn │ │ ├── test.txt │ │ ├── train.txt │ │ └── valid.txt │ ├── generate.py │ ├── main.py │ ├── model.py │ └── requirements.txt └── test ├── test_cell.py ├── test_function.py └── test_rnn.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | __pycache__/ 3 | .pypirc 4 | pred.txt 5 | multi-bleu.perl 6 | *.pt 7 | *.pyc 8 | #.* 9 | .idea 10 | *.sublime-* 11 | .DS_Store 12 | data/ 13 | build/ 14 | venv/ 15 | __pycache__/ 16 | *.lang 17 | *.log 18 | .cache/ 19 | dist/ 20 | dnc.egg-info/ 21 | tasks/checkpoints/ 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | # command to install dependencies 5 | install: 6 | - pip install http://download.pytorch.org/whl/cu75/torch-0.2.0.post3-cp36-cp36m-manylinux1_x86_64.whl 7 | - pip install numpy 8 | - pip install visdom 9 | # command to run tests 10 | script: 11 | - pytest -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Russi Chatterjee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # subtractive LSTM (subLSTM), for Pytorch 2 | 3 | [![Build Status](https://travis-ci.org/ixaxaar/pytorch-sublstm.svg?branch=master)](https://travis-ci.org/ixaxaar/pytorch-sublstm) [![PyPI version](https://badge.fury.io/py/pytorch-sublstm.svg)](https://badge.fury.io/py/pytorch-sublstm) 4 | 5 | This is an implementation of subLSTM described in the paper [Cortical microcircuits as gated-recurrent neural networks, Rui Ponte Costa et al.](https://arxiv.org/abs/1711.02448) 6 | 7 | ## Install 8 | 9 | ```bash 10 | pip install pytorch-sublstm 11 | ``` 12 | 13 | 14 | ## Usage 15 | 16 | **Parameters**: 17 | 18 | Following are the constructor parameters: 19 | 20 | | Argument | Default | Description | 21 | | --- | --- | --- | 22 | | input_size | `None` | Size of the input vectors | 23 | | hidden_size | `None` | Size of hidden units | 24 | | num_layers | `1` | Number of layers in the network | 25 | | bias | `True` | Bias | 26 | | batch_first | `False` | Whether data is fed batch first | 27 | | dropout | `0` | Dropout between layers in the network | 28 | | bidirectional | `False` | If the network is bidirectional | 29 | 30 | 31 | ### Example usage: 32 | 33 | #### nn Interface 34 | ```python 35 | import torch 36 | from torch.autograd import Variable 37 | from subLSTM.nn import SubLSTM 38 | 39 | hidden_size = 20 40 | input_size = 10 41 | seq_len = 5 42 | batch_size = 7 43 | hidden = None 44 | 45 | input = Variable(torch.randn(batch_size, seq_len, input_size)) 46 | 47 | rnn = SubLSTM(input_size, hidden_size, num_layers=2, bias=True, batch_first=True) 48 | 49 | # forward pass 50 | output, hidden = rnn(input, hidden) 51 | ``` 52 | 53 | #### Cell Interface 54 | 55 | ```python 56 | import torch 57 | from torch.autograd import Variable 58 | from subLSTM.nn import SubLSTMCell 59 | 60 | hidden_size = 20 61 | input_size = 10 62 | seq_len = 5 63 | batch_size = 7 64 | hidden = None 65 | 66 | hx = Variable(torch.randn(batch_size, hidden_size)) 67 | cx = Variable(torch.randn(batch_size, hidden_size)) 68 | 69 | input = Variable(torch.randn(batch_size, input_size)) 70 | 71 | cell = SubLSTMCell(input_size, hidden_size, bias=True) 72 | (hx, cx) = cell(input, (hx, cx)) 73 | ``` 74 | 75 | ### Tasks: 76 | 77 | A language modeling task is included [here](./tasks/word_language_model/). 78 | Refer to its [README](./tasks/word_language_model/README.md) for more info. 79 | 80 | 81 | ### Attributions: 82 | 83 | A lot of the code is recycled from [pytorch](https://pytorch.org) 84 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | rm -rf dist/ 4 | python3 setup.py sdist 5 | python3 setup.py bdist_wheel 6 | twine upload dist/* 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """A setuptools based setup module. 
4 | See: 5 | https://packaging.python.org/en/latest/distributing.html 6 | https://github.com/pypa/sampleproject 7 | """ 8 | 9 | # Always prefer setuptools over distutils 10 | from setuptools import setup, find_packages 11 | # To use a consistent encoding 12 | from codecs import open 13 | from os import path 14 | 15 | here = path.abspath(path.dirname(__file__)) 16 | 17 | # Get the long description from the README file 18 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 19 | long_description = f.read() 20 | 21 | setup( 22 | name='pytorch-sublstm', 23 | 24 | version='0.0.2', 25 | 26 | description='Differentiable Neural Computer, for Pytorch', 27 | long_description=long_description, 28 | 29 | # The project's main homepage. 30 | url='https://github.com/ixaxaar/pytorch-sublstm', 31 | 32 | # Author details 33 | author='Russi Chatterjee', 34 | author_email='root@ixaxaar.in', 35 | 36 | # Choose your license 37 | license='MIT', 38 | 39 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 40 | classifiers=[ 41 | 'Development Status :: 3 - Alpha', 42 | 43 | 'Intended Audience :: Science/Research', 44 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 45 | 46 | 'License :: OSI Approved :: MIT License', 47 | 48 | 'Programming Language :: Python :: 3', 49 | 'Programming Language :: Python :: 3.3', 50 | 'Programming Language :: Python :: 3.4', 51 | 'Programming Language :: Python :: 3.5', 52 | 'Programming Language :: Python :: 3.6', 53 | ], 54 | 55 | keywords='cortical microcircuit pytorch sublstm', 56 | 57 | packages=find_packages(exclude=['contrib', 'docs', 'tests', 'tasks']), 58 | 59 | install_requires=['torch', 'numpy'], 60 | 61 | extras_require={ 62 | 'dev': ['check-manifest'], 63 | 'test': ['coverage'], 64 | }, 65 | 66 | python_requires='>=3', 67 | ) 68 | -------------------------------------------------------------------------------- /subLSTM/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | -------------------------------------------------------------------------------- /subLSTM/functional/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from .cell import SubLSTMCell 4 | -------------------------------------------------------------------------------- /subLSTM/functional/cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch.nn as nn 4 | import torch as T 5 | import torch.nn.functional as F 6 | 7 | 8 | def SubLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): 9 | 10 | hx, cx = hidden 11 | gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh) 12 | 13 | ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) 14 | 15 | ingate = F.sigmoid(ingate) 16 | forgetgate = F.sigmoid(forgetgate) 17 | cellgate = F.sigmoid(cellgate) 18 | outgate = F.sigmoid(outgate) 19 | 20 | cy = (forgetgate * cx) + (cellgate - ingate) 21 | hy = F.sigmoid(cy) - outgate 22 | 23 | return hy, cy 24 | -------------------------------------------------------------------------------- /subLSTM/nn/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from .cell import SubLSTMCell 4 | from .rnn import SubLSTM 5 | -------------------------------------------------------------------------------- /subLSTM/nn/cell.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch.nn as nn 4 | import torch as T 5 | import torch.nn.functional as F 6 | 7 | from torch.nn.modules.rnn import RNNCellBase 8 | from subLSTM.functional import SubLSTMCell as SubLSTMCellF 9 | 10 | import math 11 | 12 | 13 | class SubLSTMCell(RNNCellBase): 14 | r"""A long sub-short-term memory (subLSTM) cell, as described in the paper: 15 | https://arxiv.org/abs/1711.02448 16 | 17 | .. math:: 18 | 19 | \begin{array}{ll} 20 | i = \mathrm{sigmoid}(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ 21 | f = \mathrm{sigmoid}(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ 22 | g = \mathrm{sigmoid}(W_{ig} x + b_{ig} + W_{hc} h + b_{hg}) \\ 23 | o = \mathrm{sigmoid}(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ 24 | c' = f * c + g - i \\ 25 | h' = \mathrm{sigmoid}(c') - o \\ 26 | \end{array} 27 | 28 | Args: 29 | input_size: The number of expected features in the input x 30 | hidden_size: The number of features in the hidden state h 31 | bias: If `False`, then the layer does not use bias weights `b_ih` and 32 | `b_hh`. Default: True 33 | 34 | Inputs: input, (h_0, c_0) 35 | - **input** (batch, input_size): tensor containing input features 36 | - **h_0** (batch, hidden_size): tensor containing the initial hidden 37 | state for each element in the batch. 38 | - **c_0** (batch. hidden_size): tensor containing the initial cell state 39 | for each element in the batch. 40 | 41 | Outputs: h_1, c_1 42 | - **h_1** (batch, hidden_size): tensor containing the next hidden state 43 | for each element in the batch 44 | - **c_1** (batch, hidden_size): tensor containing the next cell state 45 | for each element in the batch 46 | 47 | Attributes: 48 | weight_ih: the learnable input-hidden weights, of shape 49 | `(4*hidden_size x input_size)` 50 | weight_hh: the learnable hidden-hidden weights, of shape 51 | `(4*hidden_size x hidden_size)` 52 | bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)` 53 | bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)` 54 | 55 | Examples:: 56 | 57 | >>> rnn = nn.SubLSTMCell(10, 20) 58 | >>> input = Variable(torch.randn(6, 3, 10)) 59 | >>> hx = Variable(torch.randn(3, 20)) 60 | >>> cx = Variable(torch.randn(3, 20)) 61 | >>> output = [] 62 | >>> for i in range(6): 63 | ... hx, cx = rnn(input[i], (hx, cx)) 64 | ... 
output.append(hx) 65 | """ 66 | 67 | def __init__(self, input_size, hidden_size, bias=True): 68 | super(SubLSTMCell, self).__init__() 69 | self.input_size = input_size 70 | self.hidden_size = hidden_size 71 | self.bias = bias 72 | self.weight_ih = nn.Parameter(T.Tensor(4 * hidden_size, input_size)) 73 | self.weight_hh = nn.Parameter(T.Tensor(4 * hidden_size, hidden_size)) 74 | if bias: 75 | self.bias_ih = nn.Parameter(T.Tensor(4 * hidden_size)) 76 | self.bias_hh = nn.Parameter(T.Tensor(4 * hidden_size)) 77 | else: 78 | self.register_parameter('bias_ih', None) 79 | self.register_parameter('bias_hh', None) 80 | self.reset_parameters() 81 | 82 | def reset_parameters(self): 83 | stdv = 1.0 / math.sqrt(self.hidden_size) 84 | for weight in self.parameters(): 85 | weight.data.uniform_(-stdv, stdv) 86 | 87 | def forward(self, input, hx): 88 | return SubLSTMCellF( 89 | input, hx, 90 | self.weight_ih, self.weight_hh, 91 | self.bias_ih, self.bias_hh, 92 | ) 93 | -------------------------------------------------------------------------------- /subLSTM/nn/rnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch.nn as nn 4 | import torch as T 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable as var 7 | from torch.nn import Module 8 | 9 | from torch.nn.modules.rnn import RNNCellBase 10 | from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence as pack, pad_packed_sequence as pad 11 | 12 | from subLSTM.functional import SubLSTMCell as SubLSTMCellF 13 | 14 | import math 15 | 16 | 17 | class SubLSTM(Module): 18 | 19 | def __init__( 20 | self, 21 | input_size, 22 | hidden_size, 23 | num_layers=1, 24 | bias=True, 25 | batch_first=False, 26 | dropout=0, 27 | bidirectional=False 28 | ): 29 | super(SubLSTM, self).__init__() 30 | self.input_size = input_size 31 | self.hidden_size = hidden_size 32 | self.num_layers = num_layers 33 | self.bias = bias 34 | self.batch_first = batch_first 35 | self.dropout = dropout 36 | self.dropout_state = {} 37 | self.bidirectional = bidirectional 38 | num_directions = 2 if bidirectional else 1 39 | 40 | gate_size = 4 * hidden_size 41 | 42 | self._all_weights = [] 43 | for layer in range(num_layers): 44 | for direction in range(num_directions): 45 | layer_input_size = input_size if layer == 0 else hidden_size * num_directions 46 | 47 | w_ih = nn.Parameter(T.Tensor(gate_size, layer_input_size)) 48 | w_hh = nn.Parameter(T.Tensor(gate_size, hidden_size)) 49 | b_ih = nn.Parameter(T.Tensor(gate_size)) 50 | b_hh = nn.Parameter(T.Tensor(gate_size)) 51 | layer_params = (w_ih, w_hh, b_ih, b_hh) 52 | 53 | suffix = '_reverse' if direction == 1 else '' 54 | param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}'] 55 | if bias: 56 | param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}'] 57 | param_names = [x.format(layer, suffix) for x in param_names] 58 | 59 | for name, param in zip(param_names, layer_params): 60 | setattr(self, name, param) 61 | self._all_weights.append(param_names) 62 | 63 | self.flatten_parameters() 64 | self.reset_parameters() 65 | 66 | def flatten_parameters(self): 67 | pass 68 | 69 | def _apply(self, fn): 70 | ret = super(SubLSTM, self)._apply(fn) 71 | self.flatten_parameters() 72 | return ret 73 | 74 | def reset_parameters(self): 75 | stdv = 1.0 / math.sqrt(self.hidden_size) 76 | for weight in self.parameters(): 77 | weight.data.uniform_(-stdv, stdv) 78 | 79 | def forward(self, input, hx=None): 80 | timesteps = input.size(1) if self.batch_first else input.size(0) 
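        # `input` is (seq_len, batch, input_size) unless batch_first=True, in which case it is
        # (batch, seq_len, input_size). When a hidden state is passed in, it uses the layout
        # built below: a list indexed [layer][direction] whose entries are (h, c) tuples of
        # (batch, hidden_size) tensors, exactly as returned by a previous call to forward.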
81 | directions = 2 if self.bidirectional else 1 82 | is_packed = isinstance(input, PackedSequence) 83 | 84 | if is_packed: 85 | input, batch_sizes = pad(input) 86 | max_batch_size = batch_sizes[0] 87 | else: 88 | batch_sizes = None 89 | max_batch_size = input.size(0) if self.batch_first else input.size(1) 90 | 91 | # layer * direction 92 | if hx is None: 93 | num_directions = 2 if self.bidirectional else 1 94 | hx = var(input.data.new(max_batch_size, self.hidden_size).zero_(), requires_grad=False) 95 | hx = (hx, hx) 96 | hx = [[hx for x in range(directions)] for d in range(self.num_layers)] 97 | 98 | # make weights indexable with layer -> direction 99 | ws = self.all_weights 100 | if directions == 1: 101 | ws = [ [w] for w in ws ] 102 | else: 103 | ws = [ [ws[l*2], ws[l*2+1]] for l in range(self.num_layers) ] 104 | 105 | # make input batch-first, separate into timeslice wise chunks 106 | input = input if self.batch_first else input.transpose(0, 1) 107 | os = [[input[:, i, :] for i in range(timesteps)] for d in range(directions)] 108 | if directions > 1: 109 | os[1].reverse() 110 | 111 | for time in range(timesteps): 112 | for layer in range(self.num_layers): 113 | for direction in range(directions): 114 | 115 | if self.bias: 116 | (w_ih, w_hh, b_ih, b_hh) = ws[layer][direction] 117 | else: 118 | (w_ih, w_hh) = ws[layer][direction] 119 | b_ih = None 120 | b_hh = None 121 | 122 | hy, cy = SubLSTMCellF(os[direction][time], hx[layer][direction], w_ih, w_hh, b_ih, b_hh) 123 | hx[layer][direction] = (hy, cy) 124 | os[direction][time] = hy 125 | 126 | if directions > 1: 127 | os[0][time] = T.cat([ os[d][time] for d in range(directions) ], -1) 128 | os[1][time] = os[0][time] 129 | 130 | output = T.stack([T.stack(o, 1) for o in os]) 131 | output = T.cat(output, -1) if self.bidirectional else output[0] 132 | output = output if self.batch_first else output.transpose(0, 1) 133 | 134 | if is_packed: 135 | output = pack(output, batch_sizes) 136 | return output, hx 137 | 138 | def __repr__(self): 139 | s = '{name}({input_size}, {hidden_size}' 140 | if self.num_layers != 1: 141 | s += ', num_layers={num_layers}' 142 | if self.bias is not True: 143 | s += ', bias={bias}' 144 | if self.batch_first is not False: 145 | s += ', batch_first={batch_first}' 146 | if self.dropout != 0: 147 | s += ', dropout={dropout}' 148 | if self.bidirectional is not False: 149 | s += ', bidirectional={bidirectional}' 150 | s += ')' 151 | return s.format(name=self.__class__.__name__, **self.__dict__) 152 | 153 | def __setstate__(self, d): 154 | super(SubLSTM, self).__setstate__(d) 155 | self.__dict__.setdefault('_data_ptrs', []) 156 | if 'all_weights' in d: 157 | self._all_weights = d['all_weights'] 158 | if isinstance(self._all_weights[0][0], str): 159 | return 160 | num_layers = self.num_layers 161 | num_directions = 2 if self.bidirectional else 1 162 | self._all_weights = [] 163 | for layer in range(num_layers): 164 | for direction in range(num_directions): 165 | suffix = '_reverse' if direction == 1 else '' 166 | weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', 'bias_hh_l{}{}'] 167 | weights = [x.format(layer, suffix) for x in weights] 168 | if self.bias: 169 | self._all_weights += [weights] 170 | else: 171 | self._all_weights += [weights[:2]] 172 | 173 | @property 174 | def all_weights(self): 175 | return [[getattr(self, weight) for weight in weights] for weights in self._all_weights] 176 | -------------------------------------------------------------------------------- 
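A note on the hidden state returned by `SubLSTM.forward` above: unlike `torch.nn.LSTM`, which stacks the final states into a pair of tensors, this implementation returns a nested Python list indexed by layer and then direction, each entry holding an `(h, c)` tuple (the tests index it as `hx[-1][-1][0]`). A minimal sketch of reading that structure back out, with arbitrarily chosen sizes:

```python
import torch as T
from torch.autograd import Variable
from subLSTM.nn import SubLSTM

batch_size, seq_len, input_size, hidden_size = 7, 5, 10, 20

x = Variable(T.randn(batch_size, seq_len, input_size))
rnn = SubLSTM(input_size, hidden_size, num_layers=2, batch_first=True)

output, hidden = rnn(x, None)   # output is (batch_size, seq_len, hidden_size)

# hidden[layer][direction] is an (h, c) tuple
for layer, directions in enumerate(hidden):
    for direction, (h, c) in enumerate(directions):
        # each of h and c is (batch_size, hidden_size)
        print(layer, direction, h.size(), c.size())
```

The same indexing applies in the bidirectional case, where each layer holds two `(h, c)` entries (forward and reverse).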
/tasks/word_language_model/README.md:
--------------------------------------------------------------------------------
 1 | # Word-level language modeling RNN
 2 | 
 3 | This example trains a multi-layer RNN (Elman, GRU, LSTM or subLSTM) on a language modeling task.
 4 | By default, the training script uses the provided PTB (Penn Treebank) dataset.
 5 | The trained model can then be used by the generate script to generate new text.
 6 | 
 7 | ```bash
 8 | # Train a subLSTM on PTB with CUDA, reaching a perplexity of 136.90 (15 epochs)
 9 | python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 100 --lr 0.001 --optim adam
10 | # Generate samples from the trained subLSTM model.
11 | python generate.py
12 | ```
13 | 
14 | The model uses the `nn.RNN` module (and its sister modules `nn.GRU`, `nn.LSTM` and `subLSTM.nn.SubLSTM`),
15 | which will automatically use the cuDNN backend if run on CUDA with cuDNN installed.
16 | 
17 | During training, if a keyboard interrupt (Ctrl-C) is received,
18 | training is stopped and the current model is evaluated against the test dataset.
19 | 
20 | The `main.py` script accepts the following arguments:
21 | 
22 | ```bash
23 | optional arguments:
24 |   -h, --help         show this help message and exit
25 |   --data DATA        location of the data corpus
26 |   --model MODEL      type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU,
27 |                      subLSTM)
28 |   --emsize EMSIZE    size of word embeddings
29 |   --nhid NHID        number of hidden units per layer
30 |   --nlayers NLAYERS  number of layers
31 |   --lr LR            initial learning rate
32 |   --clip CLIP        gradient clipping
33 |   --optim OPTIM      learning rule, supports
34 |                      adam|sparseadam|adamax|rmsprop|sgd|adagrad|adadelta
35 |   --epochs EPOCHS    upper epoch limit
36 |   --batch_size N     batch size
37 |   --bptt BPTT        sequence length
38 |   --dropout DROPOUT  dropout applied to layers (0 = no dropout)
39 |   --tied             tie the word embedding and softmax weights
40 |   --seed SEED        random seed
41 |   --cuda             use CUDA
42 |   --log-interval N   report interval
43 |   --save SAVE        path to save the final model
44 | ```
45 | 
--------------------------------------------------------------------------------
/tasks/word_language_model/data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | 
 4 | class Dictionary(object):
 5 |     def __init__(self):
 6 |         self.word2idx = {}
 7 |         self.idx2word = []
 8 | 
 9 |     def add_word(self, word):
10 |         if word not in self.word2idx:
11 |             self.idx2word.append(word)
12 |             self.word2idx[word] = len(self.idx2word) - 1
13 |         return self.word2idx[word]
14 | 
15 |     def __len__(self):
16 |         return len(self.idx2word)
17 | 
18 | 
19 | class Corpus(object):
20 |     def __init__(self, path):
21 |         self.dictionary = Dictionary()
22 |         self.train = self.tokenize(os.path.join(path, 'train.txt'))
23 |         self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
24 |         self.test = self.tokenize(os.path.join(path, 'test.txt'))
25 | 
26 |     def tokenize(self, path):
27 |         """Tokenizes a text file."""
28 |         assert os.path.exists(path)
29 |         # Add words to the dictionary
30 |         with open(path, 'r') as f:
31 |             tokens = 0
32 |             for line in f:
33 |                 words = line.split() + ['<eos>']
34 |                 tokens += len(words)
35 |                 for word in words:
36 |                     self.dictionary.add_word(word)
37 | 
38 |         # Tokenize file content
39 |         with open(path, 'r') as f:
40 |             ids = torch.LongTensor(tokens)
41 |             token = 0
42 |             for line in f:
43 |                 words = line.split() + ['<eos>']
44 |                 for word in words:
45 |                     ids[token] = self.dictionary.word2idx[word]
46 |                     token += 1
47 | 
48 |         return ids
49 | 
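# A minimal usage sketch (this mirrors how main.py consumes the module; './data/penn'
# is the PTB data shipped with this task):
#
#   corpus = Corpus('./data/penn')      # builds the vocabulary from train/valid/test
#   ntokens = len(corpus.dictionary)    # vocabulary size, used to size the embedding
#   ids = corpus.train                  # 1-D LongTensor of word ids, one per token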
-------------------------------------------------------------------------------- /tasks/word_language_model/generate.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Language Modeling on Penn Tree Bank 3 | # 4 | # This file generates new sentences sampled from the language model 5 | # 6 | ############################################################################### 7 | 8 | import argparse 9 | 10 | import torch 11 | from torch.autograd import Variable 12 | 13 | import data 14 | 15 | parser = argparse.ArgumentParser(description='PyTorch PTB Language Model') 16 | 17 | # Model parameters. 18 | parser.add_argument('--data', type=str, default='./data/penn', 19 | help='location of the data corpus') 20 | parser.add_argument('--checkpoint', type=str, default='./model.pt', 21 | help='model checkpoint to use') 22 | parser.add_argument('--outf', type=str, default='generated.txt', 23 | help='output file for generated text') 24 | parser.add_argument('--words', type=int, default='1000', 25 | help='number of words to generate') 26 | parser.add_argument('--seed', type=int, default=1111, 27 | help='random seed') 28 | parser.add_argument('--cuda', action='store_true', 29 | help='use CUDA') 30 | parser.add_argument('--temperature', type=float, default=1.0, 31 | help='temperature - higher will increase diversity') 32 | parser.add_argument('--log-interval', type=int, default=100, 33 | help='reporting interval') 34 | args = parser.parse_args() 35 | 36 | # Set the random seed manually for reproducibility. 37 | torch.manual_seed(args.seed) 38 | if torch.cuda.is_available(): 39 | if not args.cuda: 40 | print("WARNING: You have a CUDA device, so you should probably run with --cuda") 41 | else: 42 | torch.cuda.manual_seed(args.seed) 43 | 44 | if args.temperature < 1e-3: 45 | parser.error("--temperature has to be greater or equal 1e-3") 46 | 47 | with open(args.checkpoint, 'rb') as f: 48 | model = torch.load(f) 49 | model.eval() 50 | 51 | if args.cuda: 52 | model.cuda() 53 | else: 54 | model.cpu() 55 | 56 | corpus = data.Corpus(args.data) 57 | ntokens = len(corpus.dictionary) 58 | hidden = model.init_hidden(1) 59 | input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True) 60 | if args.cuda: 61 | input.data = input.data.cuda() 62 | 63 | with open(args.outf, 'w') as outf: 64 | for i in range(args.words): 65 | output, hidden = model(input, hidden) 66 | word_weights = output.squeeze().data.div(args.temperature).exp().cpu() 67 | word_idx = torch.multinomial(word_weights, 1)[0] 68 | input.data.fill_(word_idx) 69 | word = corpus.dictionary.idx2word[word_idx] 70 | 71 | outf.write(word + ('\n' if i % 20 == 19 else ' ')) 72 | 73 | if i % args.log_interval == 0: 74 | print('| Generated {}/{} words'.format(i, args.words)) 75 | -------------------------------------------------------------------------------- /tasks/word_language_model/main.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import argparse 3 | import time 4 | import math 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.autograd import Variable 9 | 10 | import data 11 | import model 12 | 13 | parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model') 14 | parser.add_argument('--data', type=str, default='./data/penn', 15 | help='location of the data corpus') 16 | parser.add_argument('--model', 
type=str, default='subLSTM', 17 | help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, subLSTM)') 18 | parser.add_argument('--emsize', type=int, default=200, 19 | help='size of word embeddings') 20 | parser.add_argument('--nhid', type=int, default=200, 21 | help='number of hidden units per layer') 22 | parser.add_argument('--nlayers', type=int, default=2, 23 | help='number of layers') 24 | parser.add_argument('--lr', type=float, default=0.0001, 25 | help='initial learning rate') 26 | parser.add_argument('--clip', type=float, default=0.25, 27 | help='gradient clipping') 28 | parser.add_argument('--optim', type=str, default='rmsprop', 29 | help='learning rule, supports adam|sparseadam|adamax|rmsprop|sgd|adagrad|adadelta') 30 | parser.add_argument('--epochs', type=int, default=40, 31 | help='upper epoch limit') 32 | parser.add_argument('--batch_size', type=int, default=20, metavar='N', 33 | help='batch size') 34 | parser.add_argument('--bptt', type=int, default=35, 35 | help='sequence length') 36 | parser.add_argument('--dropout', type=float, default=0.5, 37 | help='dropout applied to layers (0 = no dropout)') 38 | parser.add_argument('--tied', action='store_true', 39 | help='tie the word embedding and softmax weights') 40 | parser.add_argument('--seed', type=int, default=1111, 41 | help='random seed') 42 | parser.add_argument('--cuda', action='store_true', 43 | help='use CUDA') 44 | parser.add_argument('--log-interval', type=int, default=200, metavar='N', 45 | help='report interval') 46 | parser.add_argument('--save', type=str, default='model.pt', 47 | help='path to save the final model') 48 | args = parser.parse_args() 49 | # Set the random seed manually for reproducibility. 50 | torch.manual_seed(args.seed) 51 | if torch.cuda.is_available(): 52 | if not args.cuda: 53 | print("WARNING: You have a CUDA device, so you should probably run with --cuda") 54 | else: 55 | torch.cuda.manual_seed(args.seed) 56 | 57 | ############################################################################### 58 | # Load data 59 | ############################################################################### 60 | 61 | corpus = data.Corpus(args.data) 62 | 63 | # Starting from sequential data, batchify arranges the dataset into columns. 64 | # For instance, with the alphabet as the sequence and batch size 4, we'd get 65 | # ┌ a g m s ┐ 66 | # │ b h n t │ 67 | # │ c i o u │ 68 | # │ d j p v │ 69 | # │ e k q w │ 70 | # └ f l r x ┘. 71 | # These columns are treated as independent by the model, which means that the 72 | # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient 73 | # batch processing. 74 | 75 | def batchify(data, bsz): 76 | # Work out how cleanly we can divide the dataset into bsz parts. 77 | nbatch = data.size(0) // bsz 78 | # Trim off any extra elements that wouldn't cleanly fit (remainders). 79 | data = data.narrow(0, 0, nbatch * bsz) 80 | # Evenly divide the data across the bsz batches. 
 81 |     data = data.view(bsz, -1).t().contiguous()
 82 |     if args.cuda:
 83 |         data = data.cuda()
 84 |     return data
 85 | 
 86 | eval_batch_size = 10
 87 | train_data = batchify(corpus.train, args.batch_size)
 88 | val_data = batchify(corpus.valid, eval_batch_size)
 89 | test_data = batchify(corpus.test, eval_batch_size)
 90 | 
 91 | ###############################################################################
 92 | # Build the model
 93 | ###############################################################################
 94 | 
 95 | ntokens = len(corpus.dictionary)
 96 | model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
 97 | if args.cuda:
 98 |     model.cuda()
 99 | 
100 | criterion = nn.CrossEntropyLoss()
101 | 
102 | ###############################################################################
103 | # Training code
104 | ###############################################################################
105 | 
106 | def repackage_hidden(h):
107 |     """Wraps hidden states in new Variables, to detach them from their history."""
108 |     if h is None:
109 |         return None
110 |     if type(h) == Variable:
111 |         return Variable(h.data)
112 |     elif type(h) == list:
113 |         return [ repackage_hidden(x) for x in h ]
114 |     elif type(h) == tuple:
115 |         return tuple([ repackage_hidden(x) for x in h ])
116 |     else:
117 |         return tuple(repackage_hidden(v) for v in h)
118 | 
119 | 
120 | # get_batch subdivides the source data into chunks of length args.bptt.
121 | # If source is equal to the example output of the batchify function, with
122 | # a bptt-limit of 2, we'd get the following two Variables for i = 0:
123 | # ┌ a g m s ┐ ┌ b h n t ┐
124 | # └ b h n t ┘ └ c i o u ┘
125 | # Note that despite the name of the function, the subdivision of data is not
126 | # done along the batch dimension (i.e. dimension 1), since that was handled
127 | # by the batchify function. The chunks are along dimension 0, corresponding
128 | # to the seq_len dimension in the LSTM.
129 | 
130 | def get_batch(source, i, evaluation=False):
131 |     seq_len = min(args.bptt, len(source) - 1 - i)
132 |     data = Variable(source[i:i+seq_len], volatile=evaluation)
133 |     target = Variable(source[i+1:i+1+seq_len].view(-1))
134 |     return data, target
135 | 
136 | 
137 | def evaluate(data_source):
138 |     # Turn on evaluation mode which disables dropout.
139 | model.eval() 140 | total_loss = 0 141 | ntokens = len(corpus.dictionary) 142 | hidden = model.init_hidden(eval_batch_size) 143 | for i in range(0, data_source.size(0) - 1, args.bptt): 144 | data, targets = get_batch(data_source, i, evaluation=True) 145 | output, hidden = model(data, hidden) 146 | output_flat = output.view(-1, ntokens) 147 | total_loss += len(data) * criterion(output_flat, targets).data 148 | hidden = repackage_hidden(hidden) 149 | return total_loss[0] / len(data_source) 150 | 151 | if args.optim == 'adam': 152 | optimizer = optim.Adam(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 153 | if args.optim == 'sparseadam': 154 | optimizer = optim.SparseAdam(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 155 | if args.optim == 'adamax': 156 | optimizer = optim.Adamax(model.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 157 | elif args.optim == 'rmsprop': 158 | optimizer = optim.RMSprop(model.parameters(), lr=args.lr, eps=1e-10) # 0.0001 159 | elif args.optim == 'sgd': 160 | optimizer = optim.SGD(model.parameters(), lr=args.lr) # 0.01 161 | elif args.optim == 'adagrad': 162 | optimizer = optim.Adagrad(model.parameters(), lr=args.lr) 163 | elif args.optim == 'adadelta': 164 | optimizer = optim.Adadelta(model.parameters(), lr=args.lr) 165 | 166 | def train(): 167 | # Turn on training mode which enables dropout. 168 | model.train() 169 | total_loss = 0 170 | start_time = time.time() 171 | ntokens = len(corpus.dictionary) 172 | hidden = model.init_hidden(args.batch_size) 173 | for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)): 174 | data, targets = get_batch(train_data, i) 175 | # Starting each batch, we detach the hidden state from how it was previously produced. 176 | # If we didn't, the model would try backpropagating all the way to start of the dataset. 177 | hidden = repackage_hidden(hidden) 178 | optimizer.zero_grad() 179 | output, hidden = model(data, hidden) 180 | loss = criterion(output.view(-1, ntokens), targets) 181 | loss.backward() 182 | 183 | # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. 184 | torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) 185 | optimizer.step() 186 | 187 | total_loss += loss.data 188 | 189 | if batch % args.log_interval == 0 and batch > 0: 190 | cur_loss = total_loss[0] / args.log_interval 191 | elapsed = time.time() - start_time 192 | print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | ' 193 | 'loss {:5.2f} | ppl {:8.2f}'.format( 194 | epoch, batch, len(train_data) // args.bptt, lr, 195 | elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) 196 | total_loss = 0 197 | start_time = time.time() 198 | 199 | # Loop over epochs. 200 | lr = args.lr 201 | best_val_loss = None 202 | 203 | # At any point you can hit Ctrl + C to break out of training early. 204 | try: 205 | for epoch in range(1, args.epochs+1): 206 | epoch_start_time = time.time() 207 | train() 208 | val_loss = evaluate(val_data) 209 | print('-' * 89) 210 | print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 211 | 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), 212 | val_loss, math.exp(val_loss))) 213 | print('-' * 89) 214 | # Save the model if the validation loss is the best we've seen so far. 
215 | if not best_val_loss or val_loss < best_val_loss: 216 | with open(args.save, 'wb') as f: 217 | torch.save(model, f) 218 | best_val_loss = val_loss 219 | else: 220 | # Anneal the learning rate if no improvement has been seen in the validation dataset. 221 | lr /= 4.0 222 | except KeyboardInterrupt: 223 | print('-' * 89) 224 | print('Exiting from training early') 225 | 226 | # Load the best saved model. 227 | with open(args.save, 'rb') as f: 228 | model = torch.load(f) 229 | 230 | # Run on test data. 231 | test_loss = evaluate(test_data) 232 | print('=' * 89) 233 | print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( 234 | test_loss, math.exp(test_loss))) 235 | print('=' * 89) 236 | -------------------------------------------------------------------------------- /tasks/word_language_model/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.autograd import Variable 3 | 4 | from subLSTM.nn import SubLSTM 5 | 6 | 7 | class RNNModel(nn.Module): 8 | """Container module with an encoder, a recurrent module, and a decoder.""" 9 | 10 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): 11 | super(RNNModel, self).__init__() 12 | self.drop = nn.Dropout(dropout) 13 | self.encoder = nn.Embedding(ntoken, ninp) 14 | if rnn_type in ['LSTM', 'GRU']: 15 | self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) 16 | elif rnn_type == 'subLSTM': 17 | self.rnn = SubLSTM(ninp, nhid, nlayers, dropout=dropout) 18 | else: 19 | try: 20 | nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] 21 | except KeyError: 22 | raise ValueError( """An invalid option for `--model` was supplied, 23 | options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") 24 | self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) 25 | self.decoder = nn.Linear(nhid, ntoken) 26 | 27 | # Optionally tie weights as in: 28 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) 29 | # https://arxiv.org/abs/1608.05859 30 | # and 31 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 
2016) 32 | # https://arxiv.org/abs/1611.01462 33 | if tie_weights: 34 | if nhid != ninp: 35 | raise ValueError('When using the tied flag, nhid must be equal to emsize') 36 | self.decoder.weight = self.encoder.weight 37 | 38 | self.init_weights() 39 | 40 | self.rnn_type = rnn_type 41 | self.nhid = nhid 42 | self.nlayers = nlayers 43 | 44 | def init_weights(self): 45 | initrange = 0.1 46 | self.encoder.weight.data.uniform_(-initrange, initrange) 47 | self.decoder.bias.data.fill_(0) 48 | self.decoder.weight.data.uniform_(-initrange, initrange) 49 | 50 | def forward(self, input, hidden): 51 | emb = self.drop(self.encoder(input)) 52 | output, hidden = self.rnn(emb, hidden) 53 | output = self.drop(output) 54 | decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2))) 55 | return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden 56 | 57 | def init_hidden(self, bsz): 58 | weight = next(self.parameters()).data 59 | if self.rnn_type == 'LSTM': 60 | return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()), 61 | Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())) 62 | elif self.rnn_type == 'subLSTM': 63 | return None 64 | else: 65 | return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()) 66 | -------------------------------------------------------------------------------- /tasks/word_language_model/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | -------------------------------------------------------------------------------- /test/test_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pytest 4 | import numpy as np 5 | 6 | import torch.nn as nn 7 | import torch as T 8 | from torch.autograd import Variable as var 9 | import torch.nn.functional as F 10 | from torch.nn.utils import clip_grad_norm 11 | import torch.optim as optim 12 | import numpy as np 13 | 14 | import sys 15 | import os 16 | import math 17 | import time 18 | sys.path.insert(0, '.') 19 | 20 | from subLSTM.functional import SubLSTMCell as SubLSTMCellF 21 | from subLSTM.nn import SubLSTMCell 22 | 23 | 24 | def test_cell(): 25 | hidden_size = 20 26 | input_size = 10 27 | 28 | for bias in (True, False): 29 | input = var(T.randn(3, input_size)) 30 | hx = var(T.randn(3, hidden_size)) 31 | cx = var(T.randn(3, hidden_size)) 32 | 33 | cell = SubLSTMCell(input_size, hidden_size, bias=bias) 34 | 35 | for i in range(6): 36 | (hx, cx) = cell(input, (hx, cx)) 37 | 38 | hx.sum().backward() 39 | assert hx.size() == T.Size([3, hidden_size]) 40 | assert cx.size() == T.Size([3, hidden_size]) 41 | -------------------------------------------------------------------------------- /test/test_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pytest 4 | import numpy as np 5 | 6 | import torch.nn as nn 7 | import torch as T 8 | from torch.autograd import Variable as var 9 | import torch.nn.functional as F 10 | from torch.nn.utils import clip_grad_norm 11 | import torch.optim as optim 12 | import numpy as np 13 | 14 | import sys 15 | import os 16 | import math 17 | import time 18 | sys.path.insert(0, '.') 19 | 20 | from subLSTM.functional import SubLSTMCell as SubLSTMCellF 21 | from subLSTM.nn import SubLSTMCell 22 | 23 | 24 | def test_function(): 25 | hidden_size = 20 26 | input_size = 10 27 | 28 | for bias in (True, False): 29 | weight_ih = 
T.nn.Parameter(T.Tensor(4 * hidden_size, input_size))
30 |         weight_hh = T.nn.Parameter(T.Tensor(4 * hidden_size, hidden_size))
31 |         bias_ih = T.nn.Parameter(T.Tensor(4 * hidden_size)) if bias else None
32 |         bias_hh = T.nn.Parameter(T.Tensor(4 * hidden_size)) if bias else None
33 | 
34 |         input = var(T.randn(3, input_size))
35 |         hx = var(T.randn(3, hidden_size))
36 |         cx = var(T.randn(3, hidden_size))
37 |         cell = SubLSTMCellF
38 |         for i in range(6):
39 |             hx, cx = cell(input, (hx, cx), weight_ih, weight_hh, bias_ih, bias_hh)
40 | 
41 |         hx.sum().backward()
42 | 
43 |         assert hx.size() == T.Size([3, hidden_size])
44 |         assert cx.size() == T.Size([3, hidden_size])
45 | 
--------------------------------------------------------------------------------
/test/test_rnn.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import pytest
 4 | import numpy as np
 5 | 
 6 | import torch.nn as nn
 7 | import torch as T
 8 | from torch.autograd import Variable as var
 9 | import torch.nn.functional as F
10 | from torch.nn.utils import clip_grad_norm
11 | import torch.optim as optim
12 | import numpy as np
13 | 
14 | import sys
15 | import os
16 | import math
17 | import time
18 | sys.path.insert(0, '.')
19 | 
20 | from subLSTM.nn import SubLSTM
21 | 
22 | 
23 | def test_rnn():
24 |     hidden_size = 20
25 |     input_size = 10
26 |     seq_len = 5
27 |     batch_size = 7
28 | 
29 |     for bias in (True, False):
30 |         for batch_first in (True, False):
31 |             input = var(T.randn(batch_size, seq_len, input_size)) if batch_first else var(T.randn(seq_len, batch_size, input_size))
32 |             hx = None
33 |             rnn = SubLSTM(input_size, hidden_size, num_layers=2, bias=bias, batch_first=batch_first)
34 | 
35 |             outputs = []
36 |             for i in range(6):
37 |                 output, hx = rnn(input, hx)
38 |                 outputs.append(output)
39 | 
40 |             T.stack(outputs).sum().backward()
41 | 
42 |             assert hx[-1][-1][0].size() == T.Size([batch_size, hidden_size])
43 |             assert hx[-1][-1][1].size() == T.Size([batch_size, hidden_size])
44 | 
45 | 
46 | 
47 | def test_rnn_bidirectional():
48 |     hidden_size = 20
49 |     input_size = 10
50 |     seq_len = 5
51 |     batch_size = 7
52 | 
53 |     for bias in (True, False):
54 |         for batch_first in (True, False):
55 |             input = var(T.randn(batch_size, seq_len, input_size)) if batch_first else var(T.randn(seq_len, batch_size, input_size))
56 |             hx = None
57 |             rnn = SubLSTM(input_size, hidden_size, num_layers=3, bias=bias, batch_first=batch_first, bidirectional=True)
58 | 
59 |             outputs = []
60 |             for i in range(6):
61 |                 output, hx = rnn(input, hx)
62 |                 outputs.append(output)
63 | 
64 |             T.stack(outputs).sum().backward()
65 | 
66 |             assert hx[-1][-1][0].size() == T.Size([batch_size, hidden_size])
67 |             assert hx[-1][-1][1].size() == T.Size([batch_size, hidden_size])
68 | 
69 | 
--------------------------------------------------------------------------------