├── wordPTB ├── data │ ├── .gitgnore~ │ └── .gitgnore ├── results │ └── wPTB.PNG ├── README.md ├── IndRNN.py ├── BatchNorm_step_timefirst.py ├── reader.py ├── bn_eachstep_withdrop_timefirst.py └── IndRNN_onlyrecurrent.py ├── action recognition ├── __init__.py ├── README.md ├── opts.py ├── Indrnn_action_network.py ├── IndRNN.py ├── data_reader_numpy_test.py ├── Indrnn_action_train.py ├── data_reader_numpy_witheval.py └── IndRNN_onlyrecurrent.py ├── .gitignore ├── cPTB ├── data │ └── .gitignore ├── results │ └── cPTB.PNG ├── README.md ├── IndRNN.py ├── BatchNorm_step_timefirst.py ├── reader.py ├── bn_eachstep_withdrop_timefirst.py ├── penntree_charlevel_rernn.py └── IndRNN_onlyrecurrent.py ├── Independently Recurrent Neural Network (IndRNN) Building A Longer and Deeper RNN.pdf ├── IndRNN.py ├── adding ├── IndRNN.py └── adding.py ├── mnist ├── IndRNN.py ├── pixelmnist.py ├── Data_gen.py ├── Data_gen_permute.py └── IndRNN_onlyrecurrent.py ├── README.md └── IndRNN_onlyrecurrent.py /wordPTB/data/.gitgnore~: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /action recognition/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | -------------------------------------------------------------------------------- /cPTB/data/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /wordPTB/data/.gitgnore: -------------------------------------------------------------------------------- 1 | # 
Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /cPTB/results/cPTB.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunnydreamrain/IndRNN_Theano_Lasagne/HEAD/cPTB/results/cPTB.PNG -------------------------------------------------------------------------------- /wordPTB/results/wPTB.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunnydreamrain/IndRNN_Theano_Lasagne/HEAD/wordPTB/results/wPTB.PNG -------------------------------------------------------------------------------- /Independently Recurrent Neural Network (IndRNN) Building A Longer and Deeper RNN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunnydreamrain/IndRNN_Theano_Lasagne/HEAD/Independently Recurrent Neural Network (IndRNN) Building A Longer and Deeper RNN.pdf -------------------------------------------------------------------------------- /action recognition/README.md: -------------------------------------------------------------------------------- 1 | ## The skeleton-based Action Recognition example 2 | ### Usage 3 | 1, First, ready the data. Two ways. 4 | (1) Use your own data reader. Change the code at [Indrnn_action_train.py](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/blob/master/action%20recognition/Indrnn_action_train.py#L69) 5 | (2) Use the provided data reader. Generate the data ndarray. Download the NTU RGB+D dataset, change the skeleton into a ndarray, and keep the length and label of each data entry. 6 | 2, Run the code. Add the Theano flags if using GPU. 
def train_opts(parser):
    """Register the command-line options of the action-recognition example.

    Every option is added to *parser* in place; nothing is parsed here.

    Args:
        parser: the ``argparse.ArgumentParser`` (or argument group) that
            receives the options.

    Returns:
        None.
    """
    # Optimisation and network size.
    parser.add_argument('--lr', type=np.float32, default=2e-4, help='lr')
    parser.add_argument('--batch_size', type=int, default=128, help='batch_size')
    parser.add_argument('--seq_len', type=int, default=20)
    parser.add_argument('--num_layers', type=int, default=6, help='num_layers')
    parser.add_argument('--hidden_units', type=int, default=512)
    parser.add_argument('--test_CV', action='store_true', default=False,
                        help='use the CV test setting instead of the default CS test setting')
    parser.add_argument('--use_weightdecay_nohiddenW', action='store_true', default=False)
    # The help text used to read 'lr' (copy-paste slip); it is the decay rate.
    parser.add_argument('--decayrate', type=np.float32, default=1e-4, help='weight decay rate')

    parser.add_argument('--use_bn_afterrnn', action='store_true', default=False)

    # Initialisation of the input-to-hidden weights.
    parser.add_argument('--ini_in2hid', type=np.float32, default=0.002)

    # Constraint on the recurrent weights.
    parser.add_argument('--constrain_U', action='store_true', default=False)
    parser.add_argument('--MAG', type=np.float32, default=5.0)

    parser.add_argument('--rotation_aug', action='store_true', default=False)
    parser.add_argument('--eval_fold', type=int, default=5)
    parser.add_argument('--ini_b', type=np.float32, default=0.0)
    parser.add_argument('--end_rate', type=np.float32, default=1e-6)

    # Dropout.
    parser.add_argument('--use_dropout', action='store_true', default=False)
    # The help text used to read 'lr' (copy-paste slip); it is the dropout rate.
    parser.add_argument('--droprate', type=np.float32, default=0.1, help='dropout rate')
    parser.add_argument('--rec_drop', action='store_true', default=False)
    parser.add_argument('--drop_layers', type=int, default=1)
    parser.add_argument('--conv_drop', action='store_true', default=False)
8 | 9 | >> Stacked IndRNN: `python -u penntree_charlevel_rernn.py --data_aug --hidden_units 2000 --use_dropout --num_layers 6 --droprate 0.25 --seq_len 150 --use_weightdecay_nohiddenW` 10 | 11 | >> Residual IndRNN: `python -u penntree_charlevel_rernn.py --data_aug --hidden_units 2000 --use_residual --num_layers 11 --use_dropout --droprate 0.3 --seq_len 150 --use_weightdecay_nohiddenW` 12 | >> The example code provides a very basic implementation of the residual IndRNN, where the number of units is the same in all the IndRNN layers and the left branch is fixed to be 1 without any further weight processing. Other network architectures can be explored, which may provide better results. 13 | 14 | >> For this task, output is provided at each time step and can only use the information before the current time step. Therefore, the statistics (mean and variance) of the batch normalization (BN) are obtained for each time step. BN is applied before the activation, which is more robust than putting it after the activation. The main reason is that the outputs of all the IndRNN layers at the last time step are further used as the initialization of the next batch. By putting BN before the activation (which is also before the recurrent accumulation), the statistics of BN are more stable than when BN is put after the activation. 15 | 16 | >> `data_aug` here only provides a different start for each training epoch to provide stable statistics for BN. 17 | 18 | 3, Longer sequences perform better in our experiments, showing that longer dependencies help. 19 | 20 | ![alt text](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/blob/master/cPTB/results/cPTB.PNG) 21 | -------------------------------------------------------------------------------- /wordPTB/README.md: -------------------------------------------------------------------------------- 1 | ## The language modeling example using word-level Penn Treebank (PTB-w) 2 | ### Usage 3 | 1, First, download the data and add it to the `data` folder. 
4 | >> The PTB dataset used comes from Tomas Mikolov's webpage: 5 | >> http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 6 | 7 | 2, Run the code using the generally stacked IndRNN or the residual IndRNN. 8 | 9 | >> Residual IndRNN: `python -u penntree_wordlevel_rernn_WT.py --word_level --w_tying --data_aug --lr 5e-4 --seq_len 50 --use_residual --num_layers 11 --hidden_units 2000 --batch_size 128 --use_dropout --drop_embedding --droprate_last 0.8 --droprate 0.35 --ini_normal --U_std 0.2 --U_mean 0.4 --ini_last 0.03` 10 | >> The example code provides a very basic implementation of the residual IndRNN, where the number of units is the same in all the IndRNN layers and the left branch is fixed to be 1 without any further weight processing. Other network architectures can be explored, which may provide better results. 11 | 12 | >> For this task, output is provided at each time step and can only use the information before the current time step. Therefore, the statistics (mean and variance) of the batch normalization (BN) are obtained for each time step. BN is applied before the activation, which is more robust than putting it after the activation. The main reason is that the outputs of all the IndRNN layers at the last time step are further used as the initialization of the next batch. By putting BN before the activation (which is also before the recurrent accumulation), the statistics of BN are more stable than when BN is put after the activation. 13 | 14 | >> `data_aug` here only provides a different start for each training epoch to provide stable statistics for BN. 15 | 16 | 3, This is a rather small dataset for word-level language modelling, so initialization matters. The example running configuration may not be the best, but it shows better performance than the existing models (except the neural architecture search, which constructs new models while learning). 
def build_indrnn_network(X_sym):
    """Build the stacked IndRNN classifier and return its layers by name.

    The network reads ``X_sym`` through an ``InputLayer`` of shape
    ``(batch_size, seq_len, indim, 3)``, flattens the last two axes and
    moves the time axis to the front.  Each of the ``num_layers`` stages then
    applies a dense projection to every time step, an optional dropout,
    batch normalisation either before or after the recurrent layer
    (``args.use_bn_afterrnn``), the recurrent layer itself and an optional
    dropout.  The output at the last time step feeds a softmax
    ``DenseLayer`` with ``outputclass`` units.

    Args:
        X_sym: the symbolic input variable handed to the ``InputLayer``.

    Returns:
        dict mapping layer names (``'input0'``, ``'input'``, ``'rnn0'`` ...
        ``'rnn<num_layers>'``, ``'out'``) to the Lasagne layers.
    """
    net = {}
    net['input0'] = InputLayer((batch_size, seq_len, indim, 3), X_sym)
    net['input'] = ReshapeLayer(net['input0'], (batch_size, seq_len, indim * 3))
    # Swap the first two axes: (batch, time, feat) -> (time, batch, feat).
    net['rnn0'] = DimshuffleLayer(net['input'], (1, 0, 2))

    for depth in range(1, num_layers + 1):
        below_key = 'rnn%d' % (depth - 1)
        this_key = 'rnn%d' % depth

        # Only the last recurrent layer gets a non-zero lower bound for the
        # initial recurrent weights.
        rec_low = U_lowbound if depth == num_layers else 0

        # Dense projection applied to every time step: fold time into the
        # batch axis, project, then unfold again.
        feed = ReshapeLayer(net[below_key], (batch_size * seq_len, -1))
        feed = DenseLayer(feed, hidden_units, W=ini_W,
                          b=lasagne.init.Constant(args.ini_b),
                          nonlinearity=None)
        feed = ReshapeLayer(feed, (seq_len, batch_size, -1))
        if args.conv_drop:
            feed = DropoutLayer(feed, p=droprate, shared_axes=(0,))
        # The entry of the layer below is overwritten by its projected version.
        net[below_key] = feed

        top = feed
        if not args.use_bn_afterrnn:
            top = BatchNormLayer(top, beta=lasagne.init.Constant(args.ini_b),
                                 axes=(0, 1))

        top = rnnmodel(top, hidden_units,
                       W_hid_to_hid=Uniform(range=(rec_low, U_bound)),
                       nonlinearity=act, only_return_final=False,
                       grad_clipping=gradclipvalue)

        if args.use_bn_afterrnn:
            top = BatchNormLayer(top, axes=(0, 1))
        if args.use_dropout and depth % args.drop_layers == 0:
            top = DropoutLayer(top, p=droprate, shared_axes=(0,))
        net[this_key] = top

    last_key = 'rnn%d' % num_layers
    # Keep only the last time step (axis 0 is time).
    net[last_key] = lasagne.layers.SliceLayer(net[last_key], indices=-1, axis=0)
    net['out'] = DenseLayer(net[last_key], outputclass, nonlinearity=softmax)
    return net
class MulLayer(lasagne.layers.Layer):
    """Layer that multiplies its input element-wise by a learned vector ``W``.

    ``W`` has one entry per element of axis 1 of the incoming shape, so each
    feature is scaled by its own weight.
    """

    def __init__(self, incoming, W=lasagne.init.Normal(0.01), **kwargs):
        super(MulLayer, self).__init__(incoming, **kwargs)
        feature_count = self.input_shape[1]
        self.W = self.add_param(W, (feature_count,), name='W')

    def get_output_shape_for(self, input_shape):
        # Scaling leaves the shape untouched.
        return input_shape

    def get_output_for(self, input, **kwargs):
        scaled = input * self.W
        return scaled


class IndRNNLayer(CustomRecurrentLayer):
    """IndRNN layer assembled from a ``CustomRecurrentLayer``.

    The input-to-hidden connection is a ``DenseLayer`` without nonlinearity;
    the hidden-to-hidden connection is a ``MulLayer``, i.e. the previous
    hidden state is multiplied element-wise by ``W_hid_to_hid`` instead of
    going through a full matrix.  All remaining arguments are forwarded
    unchanged to ``CustomRecurrentLayer``.
    """

    def __init__(self, incoming, num_units,
                 W_in_to_hid=init.Uniform(),
                 W_hid_to_hid=init.Uniform(),
                 b=init.Constant(0.),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):
        # ``incoming`` is either a shape tuple or a layer exposing its shape.
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # The child layers get a prefixed name of their own, so 'name' must
        # not be forwarded to them with the other keyword arguments.
        basename = (kwargs['name'] + '.') if 'name' in kwargs else ''
        layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
                            if key != 'name')

        # The dense layer sees one time step at a time, hence the first two
        # axes of the incoming shape are replaced by a single free axis.
        step_input = InputLayer((None,) + input_shape[2:])
        in_to_hid = DenseLayer(step_input, num_units,
                               W=W_in_to_hid, b=b,
                               nonlinearity=None,
                               name=basename + 'input_to_hidden',
                               **layer_kwargs)

        # The recurrent branch receives the previous hidden state, which has
        # ``num_units`` features.
        previous_hidden = InputLayer((None, num_units))
        hid_to_hid = MulLayer(previous_hidden,
                              W=W_hid_to_hid,
                              name=basename + 'hidden_to_hidden',
                              **layer_kwargs)

        # Expose the child parameters directly on this layer.
        self.W_in_to_hid = in_to_hid.W
        self.W_hid_to_hid = hid_to_hid.W
        self.b = in_to_hid.b

        super(IndRNNLayer, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
class MulLayer(lasagne.layers.Layer):
    """Layer that multiplies its input element-wise by a learned vector ``W``.

    ``W`` has one entry per element of axis 1 of the incoming shape, so each
    feature is scaled by its own weight.
    """

    def __init__(self, incoming, W=lasagne.init.Normal(0.01), **kwargs):
        super(MulLayer, self).__init__(incoming, **kwargs)
        feature_count = self.input_shape[1]
        self.W = self.add_param(W, (feature_count,), name='W')

    def get_output_shape_for(self, input_shape):
        # Scaling leaves the shape untouched.
        return input_shape

    def get_output_for(self, input, **kwargs):
        scaled = input * self.W
        return scaled


class IndRNNLayer(CustomRecurrentLayer):
    """IndRNN layer assembled from a ``CustomRecurrentLayer``.

    The input-to-hidden connection is a ``DenseLayer`` without nonlinearity;
    the hidden-to-hidden connection is a ``MulLayer``, i.e. the previous
    hidden state is multiplied element-wise by ``W_hid_to_hid`` instead of
    going through a full matrix.  All remaining arguments are forwarded
    unchanged to ``CustomRecurrentLayer``.
    """

    def __init__(self, incoming, num_units,
                 W_in_to_hid=init.Uniform(),
                 W_hid_to_hid=init.Uniform(),
                 b=init.Constant(0.),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):
        # ``incoming`` is either a shape tuple or a layer exposing its shape.
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # The child layers get a prefixed name of their own, so 'name' must
        # not be forwarded to them with the other keyword arguments.
        basename = (kwargs['name'] + '.') if 'name' in kwargs else ''
        layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
                            if key != 'name')

        # The dense layer sees one time step at a time, hence the first two
        # axes of the incoming shape are replaced by a single free axis.
        step_input = InputLayer((None,) + input_shape[2:])
        in_to_hid = DenseLayer(step_input, num_units,
                               W=W_in_to_hid, b=b,
                               nonlinearity=None,
                               name=basename + 'input_to_hidden',
                               **layer_kwargs)

        # The recurrent branch receives the previous hidden state, which has
        # ``num_units`` features.
        previous_hidden = InputLayer((None, num_units))
        hid_to_hid = MulLayer(previous_hidden,
                              W=W_hid_to_hid,
                              name=basename + 'hidden_to_hidden',
                              **layer_kwargs)

        # Expose the child parameters directly on this layer.
        self.W_in_to_hid = in_to_hid.W
        self.W_hid_to_hid = hid_to_hid.W
        self.b = in_to_hid.b

        super(IndRNNLayer, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
class MulLayer(lasagne.layers.Layer):
    """Layer that multiplies its input element-wise by a learned vector ``W``.

    ``W`` has one entry per element of axis 1 of the incoming shape, so each
    feature is scaled by its own weight.
    """

    def __init__(self, incoming, W=lasagne.init.Normal(0.01), **kwargs):
        super(MulLayer, self).__init__(incoming, **kwargs)
        feature_count = self.input_shape[1]
        self.W = self.add_param(W, (feature_count,), name='W')

    def get_output_shape_for(self, input_shape):
        # Scaling leaves the shape untouched.
        return input_shape

    def get_output_for(self, input, **kwargs):
        scaled = input * self.W
        return scaled


class IndRNNLayer(CustomRecurrentLayer):
    """IndRNN layer assembled from a ``CustomRecurrentLayer``.

    The input-to-hidden connection is a ``DenseLayer`` without nonlinearity;
    the hidden-to-hidden connection is a ``MulLayer``, i.e. the previous
    hidden state is multiplied element-wise by ``W_hid_to_hid`` instead of
    going through a full matrix.  All remaining arguments are forwarded
    unchanged to ``CustomRecurrentLayer``.
    """

    def __init__(self, incoming, num_units,
                 W_in_to_hid=init.Uniform(),
                 W_hid_to_hid=init.Uniform(),
                 b=init.Constant(0.),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):
        # ``incoming`` is either a shape tuple or a layer exposing its shape.
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # The child layers get a prefixed name of their own, so 'name' must
        # not be forwarded to them with the other keyword arguments.
        basename = (kwargs['name'] + '.') if 'name' in kwargs else ''
        layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
                            if key != 'name')

        # The dense layer sees one time step at a time, hence the first two
        # axes of the incoming shape are replaced by a single free axis.
        step_input = InputLayer((None,) + input_shape[2:])
        in_to_hid = DenseLayer(step_input, num_units,
                               W=W_in_to_hid, b=b,
                               nonlinearity=None,
                               name=basename + 'input_to_hidden',
                               **layer_kwargs)

        # The recurrent branch receives the previous hidden state, which has
        # ``num_units`` features.
        previous_hidden = InputLayer((None, num_units))
        hid_to_hid = MulLayer(previous_hidden,
                              W=W_hid_to_hid,
                              name=basename + 'hidden_to_hidden',
                              **layer_kwargs)

        # Expose the child parameters directly on this layer.
        self.W_in_to_hid = in_to_hid.W
        self.W_hid_to_hid = hid_to_hid.W
        self.b = in_to_hid.b

        super(IndRNNLayer, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
class IndRNNLayer(CustomRecurrentLayer):
    """IndRNN layer assembled from a ``CustomRecurrentLayer``.

    The input-to-hidden connection is a ``DenseLayer`` without nonlinearity;
    the hidden-to-hidden connection is a ``MulLayer``, i.e. the previous
    hidden state is multiplied element-wise by ``W_hid_to_hid`` instead of
    going through a full matrix.  All remaining arguments are forwarded
    unchanged to ``CustomRecurrentLayer``.
    """

    def __init__(self, incoming, num_units,
                 W_in_to_hid=init.Uniform(),
                 W_hid_to_hid=init.Uniform(),
                 b=init.Constant(0.),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):
        # ``incoming`` is either a shape tuple or a layer exposing its shape.
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # The child layers get a prefixed name of their own, so 'name' must
        # not be forwarded to them with the other keyword arguments.
        basename = (kwargs['name'] + '.') if 'name' in kwargs else ''
        layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
                            if key != 'name')

        # The dense layer sees one time step at a time, hence the first two
        # axes of the incoming shape are replaced by a single free axis.
        step_input = InputLayer((None,) + input_shape[2:])
        in_to_hid = DenseLayer(step_input, num_units,
                               W=W_in_to_hid, b=b,
                               nonlinearity=None,
                               name=basename + 'input_to_hidden',
                               **layer_kwargs)

        # The recurrent branch receives the previous hidden state, which has
        # ``num_units`` features.
        previous_hidden = InputLayer((None, num_units))
        hid_to_hid = MulLayer(previous_hidden,
                              W=W_hid_to_hid,
                              name=basename + 'hidden_to_hidden',
                              **layer_kwargs)

        # Expose the child parameters directly on this layer.
        self.W_in_to_hid = in_to_hid.W
        self.W_hid_to_hid = hid_to_hid.W
        self.b = in_to_hid.b

        super(IndRNNLayer, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
class MulLayer(lasagne.layers.Layer):
    """Layer that multiplies its input element-wise by a learned vector ``W``.

    ``W`` has one entry per element of axis 1 of the incoming shape, so each
    feature is scaled by its own weight.
    """

    def __init__(self, incoming, W=lasagne.init.Normal(0.01), **kwargs):
        super(MulLayer, self).__init__(incoming, **kwargs)
        feature_count = self.input_shape[1]
        self.W = self.add_param(W, (feature_count,), name='W')

    def get_output_shape_for(self, input_shape):
        # Scaling leaves the shape untouched.
        return input_shape

    def get_output_for(self, input, **kwargs):
        scaled = input * self.W
        return scaled


class IndRNNLayer(CustomRecurrentLayer):
    """IndRNN layer assembled from a ``CustomRecurrentLayer``.

    The input-to-hidden connection is a ``DenseLayer`` without nonlinearity;
    the hidden-to-hidden connection is a ``MulLayer``, i.e. the previous
    hidden state is multiplied element-wise by ``W_hid_to_hid`` instead of
    going through a full matrix.  All remaining arguments are forwarded
    unchanged to ``CustomRecurrentLayer``.
    """

    def __init__(self, incoming, num_units,
                 W_in_to_hid=init.Uniform(),
                 W_hid_to_hid=init.Uniform(),
                 b=init.Constant(0.),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):
        # ``incoming`` is either a shape tuple or a layer exposing its shape.
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # The child layers get a prefixed name of their own, so 'name' must
        # not be forwarded to them with the other keyword arguments.
        basename = (kwargs['name'] + '.') if 'name' in kwargs else ''
        layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
                            if key != 'name')

        # The dense layer sees one time step at a time, hence the first two
        # axes of the incoming shape are replaced by a single free axis.
        step_input = InputLayer((None,) + input_shape[2:])
        in_to_hid = DenseLayer(step_input, num_units,
                               W=W_in_to_hid, b=b,
                               nonlinearity=None,
                               name=basename + 'input_to_hidden',
                               **layer_kwargs)

        # The recurrent branch receives the previous hidden state, which has
        # ``num_units`` features.
        previous_hidden = InputLayer((None, num_units))
        hid_to_hid = MulLayer(previous_hidden,
                              W=W_hid_to_hid,
                              name=basename + 'hidden_to_hidden',
                              **layer_kwargs)

        # Expose the child parameters directly on this layer.
        self.W_in_to_hid = in_to_hid.W
        self.W_hid_to_hid = hid_to_hid.W
        self.b = in_to_hid.b

        super(IndRNNLayer, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
class MulLayer(lasagne.layers.Layer):
    """Scale every input feature by its own learnable weight.

    The layer owns a single parameter vector ``W`` with one entry per
    feature (``input_shape[1]``) and multiplies the input by it
    element-wise. ``IndRNNLayer`` uses it as the hidden-to-hidden
    connection in place of a dense recurrent matrix.

    Parameters
    ----------
    incoming : Layer or shape tuple
        The layer (or expected input shape) feeding this layer.
    W : initializer, array or shared variable
        Initial value for the per-feature weights.
    """

    def __init__(self, incoming, W=lasagne.init.Normal(0.01), **kwargs):
        super(MulLayer, self).__init__(incoming, **kwargs)
        feature_count = self.input_shape[1]
        self.W = self.add_param(W, (feature_count,), name='W')

    def get_output_shape_for(self, input_shape):
        # Element-wise scaling never changes the shape.
        return input_shape

    def get_output_for(self, input, **kwargs):
        scaled = input * self.W
        return scaled


class IndRNNLayer(CustomRecurrentLayer):
    """Recurrent layer whose hidden-to-hidden connection is element-wise.

    Built on ``CustomRecurrentLayer``: the input-to-hidden connection is a
    linear ``DenseLayer`` (weights ``W_in_to_hid``, bias ``b``) and the
    hidden-to-hidden connection is a ``MulLayer`` (weight vector
    ``W_hid_to_hid``). All remaining arguments are forwarded unchanged to
    ``CustomRecurrentLayer``.

    Parameters
    ----------
    incoming : Layer or shape tuple
        Input layer, or the expected input shape.
    num_units : int
        Number of hidden units.
    W_in_to_hid, W_hid_to_hid, b : initializers, arrays or shared variables
        Initial values for the input weights, the recurrent weight vector
        and the bias.
    nonlinearity, hid_init, backwards, learn_init, gradient_steps,
    grad_clipping, unroll_scan, precompute_input, mask_input,
    only_return_final
        Passed straight through to ``CustomRecurrentLayer``.
    """

    def __init__(self, incoming, num_units,
                 W_in_to_hid=init.Uniform(),
                 W_hid_to_hid=init.Uniform(),
                 b=init.Constant(0.),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # Child layers get "<name>." as a prefix when a name was supplied,
        # and never receive the 'name' keyword itself.
        basename = kwargs['name'] + '.' if 'name' in kwargs else ''
        layer_kwargs = {key: arg for key, arg in kwargs.items()
                        if key != 'name'}

        # The dense layer sees one time step at a time, so the second
        # (time) dimension is dropped from its input shape.
        step_input = InputLayer((None,) + input_shape[2:])
        in_to_hid = DenseLayer(step_input, num_units,
                               W=W_in_to_hid, b=b,
                               nonlinearity=None,
                               name=basename + 'input_to_hidden',
                               **layer_kwargs)

        # The recurrent connection recycles the previous hidden state,
        # hence num_units input features.
        previous_hidden = InputLayer((None, num_units))
        hid_to_hid = MulLayer(previous_hidden,
                              W=W_hid_to_hid,
                              name=basename + 'hidden_to_hidden',
                              **layer_kwargs)

        # Expose the child layers' parameters directly on this layer.
        self.W_in_to_hid = in_to_hid.W
        self.W_hid_to_hid = hid_to_hid.W
        self.b = in_to_hid.b

        # CustomRecurrentLayer does the actual recurrence.
        super(IndRNNLayer, self).__init__(
            incoming, in_to_hid, hid_to_hid, nonlinearity=nonlinearity,
            hid_init=hid_init, backwards=backwards, learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping, unroll_scan=unroll_scan,
            precompute_input=precompute_input, mask_input=mask_input,
            only_return_final=only_return_final, **kwargs)
class BatchNorm_step_timefirst_Layer(Layer):
    """Batch normalization with separately shaped statistics and affine parameters.

    Two sets of axes are tracked:

    * ``self.axes`` -- the axes that ``beta`` and ``gamma`` are broadcast
      over; those parameters have one entry per remaining axis position.
    * ``self.mean_axes`` -- the axes the mean/variance are actually taken
      over. When ``axes`` has more than one entry this is only
      ``axes[1]``, so the stored ``mean``/``inv_std`` keep every other
      axis (including ``axes[0]``).

    NOTE(review): the module docstring says the first input dimension is
    time steps and statistics are computed per step over the batch
    dimension; that would correspond to e.g. ``axes=(0, 1)`` on a
    (time, batch, features) input -- confirm against the callers.

    Parameters
    ----------
    incoming : Layer or shape tuple
        Input layer, or the expected input shape.
    axes : 'auto', int or tuple of int
        ``'auto'`` selects axis 0 plus every axis from 2 onwards.
    epsilon : float
        Added to the variance before the square root.
    alpha : float
        Weight of the current mini-batch statistics in the running averages.
    beta, gamma : initializer, array, shared variable or None
        Shift and scale parameters; ``None`` disables the respective term.
    mean, inv_std : initializer, array or shared variable
        Initial values of the stored (non-trainable) running statistics.
    """
    def __init__(self, incoming, axes='auto', epsilon=1e-4, alpha=0.1,
                 beta=init.Constant(0), gamma=init.Constant(1),
                 mean=init.Constant(0), inv_std=init.Constant(1), **kwargs):
        super(BatchNorm_step_timefirst_Layer, self).__init__(incoming, **kwargs)

        if axes == 'auto':
            # default: normalize over all but the second axis
            axes = (0,) + tuple(range(2, len(self.input_shape)))
        elif isinstance(axes, int):
            axes = (axes,)
        self.axes = axes
        # Statistics are reduced over a single axis only: the sole entry of
        # `axes`, or its second entry when several axes were given.
        if len(axes)==1:
          self.mean_axes=self.axes
        else:
          self.mean_axes=(axes[1],)

        self.epsilon = epsilon
        self.alpha = alpha

        # create parameters, ignoring all dimensions in axes
        shape = [size for axis, size in enumerate(self.input_shape)
                 if axis not in self.axes]
        # the stored statistics only drop the axes in mean_axes
        meanshape = [size for axis, size in enumerate(self.input_shape)
                 if axis not in self.mean_axes]
        # Only `shape` (beta/gamma) is validated here; `meanshape` is not.
        if any(size is None for size in shape):
            raise ValueError("BatchNormLayer needs specified input sizes for "
                             "all axes not normalized over.")
        if beta is None:
            self.beta = None
        else:
            self.beta = self.add_param(beta, shape, 'beta',
                                       trainable=True, regularizable=False)
        if gamma is None:
            self.gamma = None
        else:
            self.gamma = self.add_param(gamma, shape, 'gamma',
                                        trainable=True, regularizable=True)
        self.mean = self.add_param(mean, meanshape, 'mean',
                                   trainable=False, regularizable=False)
        self.inv_std = self.add_param(inv_std, meanshape, 'inv_std',
                                      trainable=False, regularizable=False)

    def get_output_for(self, input, deterministic=False,
                       batch_norm_use_averages=None,
                       batch_norm_update_averages=None, **kwargs):
        """Return ``(input - mean) * (gamma * inv_std) + beta``.

        ``batch_norm_use_averages`` chooses between the stored statistics
        and the current mini-batch statistics and defaults to
        ``deterministic``. ``batch_norm_update_averages`` controls whether
        the stored statistics receive a running-average update and defaults
        to ``not deterministic``.
        """
        # Mini-batch statistics, reduced over mean_axes only.
        input_mean = input.mean(self.mean_axes)
        input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) + self.epsilon))

        # Decide whether to use the stored averages or mini-batch statistics
        if batch_norm_use_averages is None:
            batch_norm_use_averages = deterministic
        use_averages = batch_norm_use_averages

        if use_averages:
            mean = self.mean
            inv_std = self.inv_std
        else:
            mean = input_mean
            inv_std = input_inv_std

        # Decide whether to update the stored averages
        if batch_norm_update_averages is None:
            batch_norm_update_averages = not deterministic
        update_averages = batch_norm_update_averages

        if update_averages:
            # Trick: To update the stored statistics, we create memory-aliased
            # clones of the stored statistics:
            running_mean = theano.clone(self.mean, share_inputs=False)
            running_inv_std = theano.clone(self.inv_std, share_inputs=False)
            # set a default update for them:
            # new = (1 - alpha) * old + alpha * mini-batch statistic
            running_mean.default_update = ((1 - self.alpha) * running_mean +
                                           self.alpha * input_mean)
            running_inv_std.default_update = ((1 - self.alpha) *
                                              running_inv_std +
                                              self.alpha * input_inv_std)
            # and make sure they end up in the graph without participating in
            # the computation (this way their default_update will be collected
            # and applied, but the computation will be optimized away):
            mean += 0 * running_mean
            inv_std += 0 * running_inv_std

        # prepare dimshuffle pattern inserting broadcastable axes as needed
        param_axes = iter(range(input.ndim - len(self.axes)))
        pattern = ['x' if input_axis in self.axes
                   else next(param_axes)
                   for input_axis in range(input.ndim)]

        # apply dimshuffle pattern to all parameters
        beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
        gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)

        # The statistics have their own pattern because they were reduced
        # over mean_axes rather than over axes.
        mean_param_axes = iter(range(input.ndim - len(self.mean_axes)))
        mean_pattern = ['x' if input_axis in self.mean_axes
                   else next(mean_param_axes)
                   for input_axis in range(input.ndim)]
        mean = mean.dimshuffle(mean_pattern)
        inv_std = inv_std.dimshuffle(mean_pattern)

        # normalize
        normalized = (input - mean) * (gamma * inv_std) + beta
        return normalized
class BatchNorm_step_timefirst_Layer(Layer):
    """Batch normalization with separately shaped statistics and affine parameters.

    Two sets of axes are tracked:

    * ``self.axes`` -- the axes that ``beta`` and ``gamma`` are broadcast
      over; those parameters have one entry per remaining axis position.
    * ``self.mean_axes`` -- the axes the mean/variance are actually taken
      over. When ``axes`` has more than one entry this is only
      ``axes[1]``, so the stored ``mean``/``inv_std`` keep every other
      axis (including ``axes[0]``).

    NOTE(review): the module docstring says the first input dimension is
    time steps and statistics are computed per step over the batch
    dimension; that would correspond to e.g. ``axes=(0, 1)`` on a
    (time, batch, features) input -- confirm against the callers.

    Parameters
    ----------
    incoming : Layer or shape tuple
        Input layer, or the expected input shape.
    axes : 'auto', int or tuple of int
        ``'auto'`` selects axis 0 plus every axis from 2 onwards.
    epsilon : float
        Added to the variance before the square root.
    alpha : float
        Weight of the current mini-batch statistics in the running averages.
    beta, gamma : initializer, array, shared variable or None
        Shift and scale parameters; ``None`` disables the respective term.
    mean, inv_std : initializer, array or shared variable
        Initial values of the stored (non-trainable) running statistics.
    """
    def __init__(self, incoming, axes='auto', epsilon=1e-4, alpha=0.1,
                 beta=init.Constant(0), gamma=init.Constant(1),
                 mean=init.Constant(0), inv_std=init.Constant(1), **kwargs):
        super(BatchNorm_step_timefirst_Layer, self).__init__(incoming, **kwargs)

        if axes == 'auto':
            # default: normalize over all but the second axis
            axes = (0,) + tuple(range(2, len(self.input_shape)))
        elif isinstance(axes, int):
            axes = (axes,)
        self.axes = axes
        # Statistics are reduced over a single axis only: the sole entry of
        # `axes`, or its second entry when several axes were given.
        if len(axes)==1:
          self.mean_axes=self.axes
        else:
          self.mean_axes=(axes[1],)

        self.epsilon = epsilon
        self.alpha = alpha

        # create parameters, ignoring all dimensions in axes
        shape = [size for axis, size in enumerate(self.input_shape)
                 if axis not in self.axes]
        # the stored statistics only drop the axes in mean_axes
        meanshape = [size for axis, size in enumerate(self.input_shape)
                 if axis not in self.mean_axes]
        # Only `shape` (beta/gamma) is validated here; `meanshape` is not.
        if any(size is None for size in shape):
            raise ValueError("BatchNormLayer needs specified input sizes for "
                             "all axes not normalized over.")
        if beta is None:
            self.beta = None
        else:
            self.beta = self.add_param(beta, shape, 'beta',
                                       trainable=True, regularizable=False)
        if gamma is None:
            self.gamma = None
        else:
            self.gamma = self.add_param(gamma, shape, 'gamma',
                                        trainable=True, regularizable=True)
        self.mean = self.add_param(mean, meanshape, 'mean',
                                   trainable=False, regularizable=False)
        self.inv_std = self.add_param(inv_std, meanshape, 'inv_std',
                                      trainable=False, regularizable=False)

    def get_output_for(self, input, deterministic=False,
                       batch_norm_use_averages=None,
                       batch_norm_update_averages=None, **kwargs):
        """Return ``(input - mean) * (gamma * inv_std) + beta``.

        ``batch_norm_use_averages`` chooses between the stored statistics
        and the current mini-batch statistics and defaults to
        ``deterministic``. ``batch_norm_update_averages`` controls whether
        the stored statistics receive a running-average update and defaults
        to ``not deterministic``.
        """
        # Mini-batch statistics, reduced over mean_axes only.
        input_mean = input.mean(self.mean_axes)
        input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) + self.epsilon))

        # Decide whether to use the stored averages or mini-batch statistics
        if batch_norm_use_averages is None:
            batch_norm_use_averages = deterministic
        use_averages = batch_norm_use_averages

        if use_averages:
            mean = self.mean
            inv_std = self.inv_std
        else:
            mean = input_mean
            inv_std = input_inv_std

        # Decide whether to update the stored averages
        if batch_norm_update_averages is None:
            batch_norm_update_averages = not deterministic
        update_averages = batch_norm_update_averages

        if update_averages:
            # Trick: To update the stored statistics, we create memory-aliased
            # clones of the stored statistics:
            running_mean = theano.clone(self.mean, share_inputs=False)
            running_inv_std = theano.clone(self.inv_std, share_inputs=False)
            # set a default update for them:
            # new = (1 - alpha) * old + alpha * mini-batch statistic
            running_mean.default_update = ((1 - self.alpha) * running_mean +
                                           self.alpha * input_mean)
            running_inv_std.default_update = ((1 - self.alpha) *
                                              running_inv_std +
                                              self.alpha * input_inv_std)
            # and make sure they end up in the graph without participating in
            # the computation (this way their default_update will be collected
            # and applied, but the computation will be optimized away):
            mean += 0 * running_mean
            inv_std += 0 * running_inv_std

        # prepare dimshuffle pattern inserting broadcastable axes as needed
        param_axes = iter(range(input.ndim - len(self.axes)))
        pattern = ['x' if input_axis in self.axes
                   else next(param_axes)
                   for input_axis in range(input.ndim)]

        # apply dimshuffle pattern to all parameters
        beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
        gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)

        # The statistics have their own pattern because they were reduced
        # over mean_axes rather than over axes.
        mean_param_axes = iter(range(input.ndim - len(self.mean_axes)))
        mean_pattern = ['x' if input_axis in self.mean_axes
                   else next(mean_param_axes)
                   for input_axis in range(input.ndim)]
        mean = mean.dimshuffle(mean_pattern)
        inv_std = inv_std.dimshuffle(mean_pattern)

        # normalize
        normalized = (input - mean) * (gamma * inv_std) + beta
        return normalized
# The training/testing script normally defines `test_dataset` before importing
# this module. Tolerate its absence so the module can also be imported (or run
# as a script) with the dataset prefix passed explicitly instead.
try:
    from __main__ import test_dataset
except ImportError:
    test_dataset = None
datasets = test_dataset


class batch_thread(object):
    """Callable that assembles one mini-batch of fixed-length clips.

    On construction it loads three NumPy files derived from the dataset
    prefix: ``<prefix>.npy`` (sequences), ``<prefix>_label.npy`` (labels)
    and ``<prefix>_len.npy`` (valid length of each sequence). Each call
    fills the shared ``result`` dict with the keys ``'data'`` (float32),
    ``'label'`` (int32) and ``'index'`` (int32).

    Args:
        result: dict that receives the batch; shared with the consumer.
        batch_size_: number of sequences per batch.
        seq_len: number of frames every returned clip has.
        datasets: file prefix of the dataset. When omitted, the
            module-level ``datasets`` value is used.

    Raises:
        ValueError: if no dataset prefix is available.
    """

    def __init__(self, result, batch_size_, seq_len, datasets=None):
        if datasets is None:
            # The parameter shadows the module global, so look it up explicitly.
            datasets = globals().get('datasets')
        if datasets is None:
            raise ValueError("no dataset given: pass `datasets` or define "
                             "`test_dataset` in the __main__ module")
        self.result = result
        self.batch_size_ = batch_size_
        self.datasets = datasets
        self.seq_len = seq_len
        self.idx = -1  # incremented before use, so the first sample is 0

        self.data_handle = np.load(datasets + '.npy')
        self.label_handle = np.load(datasets + '_label.npy')
        self.len_handle = np.load(datasets + '_len.npy')

        self.num_videos = len(self.data_handle)
        self.shufflevideolist = np.arange(self.num_videos)
        np.random.shuffle(self.shufflevideolist)

        print ('Dataset size', self.num_videos)

    def _sample_clip(self, dataset, len_data):
        """Return ``seq_len`` frames drawn from the first ``len_data`` frames."""
        seq_len = self.seq_len
        lenperseg = len_data // seq_len

        if lenperseg == 1 and len_data > seq_len:
            # Slightly longer than needed: take a random contiguous window.
            startid = np.random.randint(len_data - seq_len)
            return dataset[startid:startid + seq_len]

        sample = np.zeros((seq_len,) + dataset.shape[1:])
        if len_data <= seq_len:
            # Too short: place the sequence at a random offset in a
            # zero-padded clip. The bound is clamped to 1 because
            # randint(0) raises ValueError (e.g. seq_len < 4 and
            # len_data == seq_len).
            startid = np.random.randint(
                max(seq_len - len_data, int(0.25 * seq_len), 1))
            endid = min(seq_len, startid + len_data)
            datasid = 0
            dataeid = len_data
            if startid + len_data > seq_len:
                # The sequence would overrun the clip: keep a random part.
                datasid = np.random.randint(startid + len_data - seq_len)
                dataeid = datasid + seq_len - startid
            sample[startid:endid] = dataset[datasid:dataeid]
            return sample

        # Long sequence: one random frame from each of seq_len equal
        # segments; the last segment also absorbs the remainder.
        for framei in range(seq_len):
            if framei == seq_len - 1:
                span = len_data - lenperseg * (seq_len - 1)
            else:
                span = lenperseg
            sample[framei] = dataset[lenperseg * framei + np.random.randint(span)]
        return sample

    def __call__(self):
        """Build the next batch and store it in ``self.result``."""
        templabel = []
        batch_data = []
        tempindex = []
        for _ in range(self.batch_size_):
            self.idx += 1
            if self.idx == self.num_videos:
                # One full pass completed: restart with a new order.
                self.idx = 0
                np.random.shuffle(self.shufflevideolist)
            shufflevideoindex = self.shufflevideolist[self.idx]

            templabel.append(np.int32(self.label_handle[shufflevideoindex]))
            tempindex.append(np.int32(shufflevideoindex))
            # Each clip must be its own object; np.asarray below copies the
            # data, so views returned by _sample_clip are safe.
            batch_data.append(self._sample_clip(
                self.data_handle[shufflevideoindex],
                self.len_handle[shufflevideoindex]))

        self.result['data'] = np.asarray(batch_data, dtype=np.float32)
        self.result['label'] = np.asarray(templabel, dtype=np.int32)
        self.result['index'] = np.asarray(tempindex, dtype=np.int32)

    def GetDatasetSize(self):
        """Return the number of sequences in the dataset."""
        return self.num_videos


class DataHandler(object):
    """Prefetching front end: the next batch is built in a background thread.

    Args:
        batch_size: number of sequences per batch.
        seq_len: number of frames per returned clip.
        datasets: optional dataset file prefix, forwarded to ``batch_thread``.
    """

    def __init__(self, batch_size, seq_len, datasets=None):
        self.batch_size_ = batch_size
        # NOTE: seeds Python's `random`; the sampling above uses np.random.
        random.seed(10)

        self.thread_result = {}
        self.thread = None

        self.batch_advancer = batch_thread(
            self.thread_result, self.batch_size_, seq_len, datasets)

        self.datasetsize = self.batch_advancer.GetDatasetSize()

        # Produce the first batch synchronously so GetBatch always finds one.
        self.dispatch_worker()
        self.join_worker()

    def GetBatch(self):
        """Return ``(data, label, index)`` and start preparing the next batch."""
        if self.thread is not None:
            self.join_worker()

        self.batch_data_ = self.thread_result['data']
        self.batch_label_ = self.thread_result['label']
        self.batch_index_ = self.thread_result['index']

        self.dispatch_worker()
        return self.batch_data_, self.batch_label_, self.batch_index_

    def dispatch_worker(self):
        """Start a worker thread that builds one batch."""
        assert self.thread is None
        self.thread = Thread(target=self.batch_advancer)
        self.thread.start()

    def join_worker(self):
        """Wait for the running worker thread to finish."""
        assert self.thread is not None
        self.thread.join()
        self.thread = None

    def GetDatasetSize(self):
        """Return the number of sequences in the dataset."""
        return self.datasetsize


def main():
    """Smoke test: load the 'train_ntus' dataset and print its size."""
    dh = DataHandler(10, 30, 'train_ntus')
    print (dh.GetDatasetSize())


if __name__ == '__main__':
    main()
18 | # ============================================================================== 19 | #from pylearn2.scripts.tutorials.grbm_smd.make_dataset import train_pkl_path 20 | 21 | 22 | """Utilities for training on the Hutter Prize and PTB datasets.""" 23 | from __future__ import absolute_import 24 | from __future__ import division 25 | from __future__ import print_function 26 | 27 | import collections 28 | import os 29 | 30 | import numpy as np 31 | 32 | 33 | def _read_symbols(filename): 34 | with open(filename, "r") as f: 35 | return f.read() 36 | 37 | 38 | def _read_words(filename): 39 | with open(filename, "r") as f: 40 | return f.read().decode("utf-8").replace("\n", "").split() 41 | 42 | 43 | def _build_vocab(filename): 44 | data = _read_words(filename) 45 | 46 | counter = collections.Counter(data) 47 | count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 48 | 49 | words, _ = list(zip(*count_pairs)) 50 | word_to_id = dict(zip(words, range(len(words)))) 51 | 52 | return word_to_id 53 | 54 | 55 | def _file_to_word_ids(filename, word_to_id): 56 | data = _read_words(filename) 57 | return [word_to_id[word] for word in data if word in word_to_id] 58 | 59 | 60 | def hutter_raw_data(data_path=None, num_test_symbols=5000000): 61 | """Load raw data from data directory "data_path". 62 | 63 | The raw Hutter prize data is at: 64 | http://mattmahoney.net/dc/enwik8.zip 65 | 66 | Args: 67 | data_path: string path to the directory where simple-examples.tgz has 68 | been extracted. 69 | num_test_symbols: number of symbols at the end that make up the test set 70 | 71 | Returns: 72 | tuple (train_data, valid_data, test_data, unique) 73 | where each of the data objects can be passed to hutter_iterator. 
74 | """ 75 | 76 | data_path = os.path.join(data_path, "enwik8") 77 | 78 | raw_data = _read_symbols(data_path) 79 | raw_data = np.fromstring(raw_data, dtype=np.uint8) 80 | unique, data = np.unique(raw_data, return_inverse=True) 81 | train_data = data[: -2 * num_test_symbols] 82 | valid_data = data[-2 * num_test_symbols: -num_test_symbols] 83 | test_data = data[-num_test_symbols:] 84 | return train_data, valid_data, test_data, unique 85 | 86 | 87 | def ptb_raw_data(data_path=None,filename='ptb.'): 88 | """Load PTB raw data from data directory "data_path". 89 | 90 | Reads PTB text files, converts strings to integer ids, 91 | and performs mini-batching of the inputs. 92 | 93 | The PTB dataset comes from Tomas Mikolov's webpage: 94 | 95 | http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 96 | 97 | Args: 98 | data_path: string path to the directory where simple-examples.tgz has 99 | been extracted. 100 | 101 | Returns: 102 | tuple (train_data, valid_data, test_data, vocabulary) 103 | where each of the data objects can be passed to PTBIterator. 104 | """ 105 | 106 | train_path = os.path.join(data_path, filename+'train.txt') 107 | valid_path = os.path.join(data_path, filename+'valid.txt') 108 | test_path = os.path.join(data_path, filename+'test.txt') 109 | #print (train_path) 110 | 111 | word_to_id = _build_vocab(train_path) 112 | train_data = _file_to_word_ids(train_path, word_to_id) 113 | valid_data = _file_to_word_ids(valid_path, word_to_id) 114 | test_data = _file_to_word_ids(test_path, word_to_id) 115 | vocabulary = len(word_to_id) 116 | # save_name='ptb_char' 117 | print ('voc',vocabulary) 118 | # np.savez(save_name, train_data, valid_data, test_data, vocabulary) 119 | return train_data, valid_data, test_data, vocabulary 120 | 121 | 122 | def data_iterator(raw_data, batch_size, num_steps): 123 | """Iterate on the raw Hutter prize data or the raw PTB data. 
124 | 125 | This generates batch_size pointers into the given raw data, and allows 126 | minibatch iteration along these pointers. 127 | 128 | Args: 129 | raw_data: one of the raw data outputs from hutter_raw_data or ptb_raw_data. 130 | batch_size: int, the batch size. 131 | num_steps: int, the number of unrolls. 132 | 133 | Yields: 134 | Pairs of the batched data, each a matrix of shape [batch_size, num_steps]. 135 | The second element of the tuple is the same data time-shifted to the 136 | right by one. 137 | 138 | Raises: 139 | ValueError: if batch_size or num_steps are too high. 140 | """ 141 | raw_data = np.array(raw_data, dtype=np.int32) 142 | 143 | data_len = len(raw_data) 144 | batch_len = data_len // batch_size 145 | data = np.zeros([batch_size, batch_len], dtype=np.int32) 146 | for i in range(batch_size): 147 | data[i] = raw_data[batch_len * i:batch_len * (i + 1)] 148 | 149 | epoch_size = (batch_len - 1) // num_steps 150 | 151 | if epoch_size == 0: 152 | raise ValueError("epoch_size == 0, decrease batch_size or num_steps") 153 | 154 | for i in range(epoch_size): 155 | x = data[:, i*num_steps:(i+1)*num_steps] 156 | y = data[:, i*num_steps+1:(i+1)*num_steps+1] 157 | yield (x, y) 158 | 159 | #ptb_raw_data('data/') 160 | -------------------------------------------------------------------------------- /wordPTB/reader.py: -------------------------------------------------------------------------------- 1 | # This file is adapted from the tool provided with Tensorflow for 2 | # reading the Penn Treebank dataset. The original copyright notice is 3 | # provided below. 4 | # 5 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # ============================================================================== 19 | #from pylearn2.scripts.tutorials.grbm_smd.make_dataset import train_pkl_path 20 | 21 | 22 | """Utilities for training on the Hutter Prize and PTB datasets.""" 23 | from __future__ import absolute_import 24 | from __future__ import division 25 | from __future__ import print_function 26 | 27 | import collections 28 | import os 29 | 30 | import numpy as np 31 | 32 | 33 | def _read_symbols(filename): 34 | with open(filename, "r") as f: 35 | return f.read() 36 | 37 | 38 | def _read_words(filename): 39 | with open(filename, "r") as f: 40 | return f.read().decode("utf-8").replace("\n", "").split() 41 | 42 | 43 | def _build_vocab(filename): 44 | data = _read_words(filename) 45 | 46 | counter = collections.Counter(data) 47 | count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 48 | 49 | words, _ = list(zip(*count_pairs)) 50 | word_to_id = dict(zip(words, range(len(words)))) 51 | 52 | return word_to_id 53 | 54 | 55 | def _file_to_word_ids(filename, word_to_id): 56 | data = _read_words(filename) 57 | return [word_to_id[word] for word in data if word in word_to_id] 58 | 59 | 60 | def hutter_raw_data(data_path=None, num_test_symbols=5000000): 61 | """Load raw data from data directory "data_path". 62 | 63 | The raw Hutter prize data is at: 64 | http://mattmahoney.net/dc/enwik8.zip 65 | 66 | Args: 67 | data_path: string path to the directory where simple-examples.tgz has 68 | been extracted. 
def ptb_raw_data(data_path=None, filename='ptb.'):
    """Load the raw PTB train/valid/test splits as sequences of integer ids.

    The files ``<filename>train.txt``, ``<filename>valid.txt`` and
    ``<filename>test.txt`` are looked up inside ``data_path``.  The
    word-to-id mapping returned by ``_build_vocab`` for the training file is
    applied to all three files through ``_file_to_word_ids``.  No batching is
    done here; feed the returned sequences to ``data_iterator`` for that.

    The PTB dataset comes from Tomas Mikolov's webpage:

    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
      data_path: string path to the directory where simple-examples.tgz has
        been extracted.  Despite the ``None`` default it has to be supplied,
        because the value is passed straight to ``os.path.join``.
      filename: common prefix of the three split files (default ``'ptb.'``).

    Returns:
      tuple (train_data, valid_data, test_data, vocabulary)
      where each of the data objects can be passed to ``data_iterator`` and
      ``vocabulary`` is the number of entries in the word-to-id mapping.

    Side effects:
      Prints the vocabulary size.
    """
    train_path = os.path.join(data_path, filename + 'train.txt')
    valid_path = os.path.join(data_path, filename + 'valid.txt')
    test_path = os.path.join(data_path, filename + 'test.txt')

    # The mapping comes from the training split only and is reused as-is for
    # the validation and test splits.
    word_to_id = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    print ('voc',vocabulary)
    return train_data, valid_data, test_data, vocabulary


def data_iterator(raw_data, batch_size, num_steps):
    """Iterate on the raw Hutter prize data or the raw PTB data.

    The data is cut into ``batch_size`` contiguous rows of equal length
    (trailing symbols that do not fill a row are dropped), and minibatches
    are produced by walking along all rows in parallel, ``num_steps``
    columns at a time.

    Args:
      raw_data: one of the raw data outputs from hutter_raw_data or
        ptb_raw_data (a 1-D sequence of integer ids).
      batch_size: int, the batch size.
      num_steps: int, the number of unrolls.

    Yields:
      Pairs of the batched data, each a matrix of shape
      [batch_size, num_steps].  The second element of the tuple is the same
      data time-shifted to the right by one.

    Raises:
      ValueError: if batch_size or num_steps are too high for even a single
        minibatch.  As this is a generator, the error surfaces when the
        first item is requested.
    """
    raw_data = np.array(raw_data, dtype=np.int32)

    batch_len = len(raw_data) // batch_size
    # One column is reserved for the shifted targets, hence ``batch_len - 1``.
    epoch_size = (batch_len - 1) // num_steps

    # ``<=`` instead of ``==``: when batch_size exceeds the data length,
    # batch_len is 0 and the floor division above gives -1, which used to
    # slip past the check and produce an empty iterator without any error.
    if epoch_size <= 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    # Row i holds raw_data[batch_len * i : batch_len * (i + 1)].
    data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)

    for i in range(epoch_size):
        start = i * num_steps
        stop = start + num_steps
        x = data[:, start:stop]
        y = data[:, start + 1:stop + 1]
        yield (x, y)
self.mean_axes] 34 | if any(size is None for size in shape): 35 | raise ValueError("BatchNormLayer needs specified input sizes for " 36 | "all axes not normalized over.") 37 | if beta is None: 38 | self.beta = None 39 | else: 40 | self.beta = self.add_param(beta, shape, 'beta', 41 | trainable=True, regularizable=False) 42 | if gamma is None: 43 | self.gamma = None 44 | else: 45 | self.gamma = self.add_param(gamma, shape, 'gamma', 46 | trainable=True, regularizable=True) 47 | self.mean = self.add_param(mean, meanshape, 'mean', 48 | trainable=False, regularizable=False) 49 | self.inv_std = self.add_param(inv_std, meanshape, 'inv_std', 50 | trainable=False, regularizable=False) 51 | #print('here',len(self.input_shape)) 52 | self.sparsity=sparsity 53 | if len(self.input_shape)==3: 54 | self.dropout=DropoutLayer((self.input_shape[0],self.input_shape[1],self.input_shape[2]), p=droprate, shared_axes=(0,1), **kwargs) 55 | else: 56 | self.dropout=DropoutLayer((self.input_shape[0],self.input_shape[1]), p=droprate, shared_axes=(0,), **kwargs) 57 | 58 | def get_output_for(self, input, deterministic=False, 59 | batch_norm_use_averages=None, 60 | batch_norm_update_averages=None, **kwargs): 61 | if self.sparsity==1: 62 | input_mean = input.mean(self.mean_axes) 63 | input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) + self.epsilon)) 64 | else: 65 | input_mean = input.mean(self.mean_axes)*(1.0/self.sparsity) 66 | input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) *(1.0/self.sparsity) -(1-self.sparsity)*T.sqr(input_mean) + self.epsilon)) 67 | 68 | # Decide whether to use the stored averages or mini-batch statistics 69 | if batch_norm_use_averages is None: 70 | batch_norm_use_averages = deterministic 71 | use_averages = batch_norm_use_averages 72 | 73 | if use_averages: 74 | mean = self.mean 75 | inv_std = self.inv_std 76 | else: 77 | mean = input_mean 78 | inv_std = input_inv_std 79 | 80 | # Decide whether to update the stored averages 81 | if batch_norm_update_averages is 
None: 82 | batch_norm_update_averages = not deterministic 83 | update_averages = batch_norm_update_averages 84 | 85 | if update_averages: 86 | # Trick: To update the stored statistics, we create memory-aliased 87 | # clones of the stored statistics: 88 | running_mean = theano.clone(self.mean, share_inputs=False) 89 | running_inv_std = theano.clone(self.inv_std, share_inputs=False) 90 | # set a default update for them: 91 | running_mean.default_update = ((1 - self.alpha) * running_mean + 92 | self.alpha * input_mean) 93 | running_inv_std.default_update = ((1 - self.alpha) * 94 | running_inv_std + 95 | self.alpha * input_inv_std) 96 | # and make sure they end up in the graph without participating in 97 | # the computation (this way their default_update will be collected 98 | # and applied, but the computation will be optimized away): 99 | mean += 0 * running_mean 100 | inv_std += 0 * running_inv_std 101 | 102 | # prepare dimshuffle pattern inserting broadcastable axes as needed 103 | param_axes = iter(range(input.ndim - len(self.axes))) 104 | pattern = ['x' if input_axis in self.axes 105 | else next(param_axes) 106 | for input_axis in range(input.ndim)] 107 | 108 | # apply dimshuffle pattern to all parameters 109 | beta = 0 if self.beta is None else self.beta.dimshuffle(pattern) 110 | gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern) 111 | 112 | mean_param_axes = iter(range(input.ndim - len(self.mean_axes))) 113 | mean_pattern = ['x' if input_axis in self.mean_axes 114 | else next(mean_param_axes) 115 | for input_axis in range(input.ndim)] 116 | mean = mean.dimshuffle(mean_pattern) 117 | inv_std = inv_std.dimshuffle(mean_pattern) 118 | 119 | input=self.dropout.get_output_for(input,deterministic=deterministic) 120 | 121 | # normalize 122 | normalized = (input - mean) * (gamma * inv_std) + beta 123 | return normalized 124 | 125 | -------------------------------------------------------------------------------- 
import theano
import theano.tensor as T

from lasagne import init
from lasagne import nonlinearities

from lasagne.layers import Layer
from lasagne.layers import DropoutLayer
class BatchNormLayer(Layer):
    """Batch normalization layer with built-in dropout.

    Two sets of axes are kept:

    * ``axes`` decides the shape of the learnt ``beta``/``gamma`` parameters
      (one entry per input position on the axes *not* listed in ``axes``).
    * ``mean_axes`` decides over which axes the batch mean/variance are
      reduced and therefore the shape of the stored ``mean``/``inv_std``.
      It is ``axes`` itself when a single axis is given, otherwise only the
      second entry of ``axes``.

    In ``get_output_for`` the batch statistics are computed from the
    incoming tensor first, then dropout is applied to that tensor, and the
    dropped-out tensor is what gets normalized.
    """
    def __init__(self, incoming, axes='auto', droprate=0.2, epsilon=1e-4, alpha=0.1,sparsity=1.0,
                 beta=init.Constant(0), gamma=init.Constant(1),
                 mean=init.Constant(0), inv_std=init.Constant(1), **kwargs):
        """Create the parameters and the internal dropout layer.

        Args:
          incoming: the layer (or shape) feeding into this layer.
          axes: 'auto', an int, or a tuple of ints; see the class docstring.
          droprate: dropout probability ``p`` of the internal DropoutLayer.
          epsilon: constant added under the square root of the variance.
          alpha: weight of the current batch in the running averages.
          sparsity: when different from 1, the batch statistics are rescaled
            by ``1/sparsity`` (see ``get_output_for``).
          beta, gamma: initializers for the shift/scale parameters; ``None``
            disables the respective parameter.
          mean, inv_std: initializers for the stored statistics.
          **kwargs: forwarded to ``Layer.__init__`` and, unchanged, to the
            internal ``DropoutLayer`` as well.

        Raises:
          ValueError: if an input size needed for ``beta``/``gamma`` is None.
        """
        super(BatchNormLayer, self).__init__(incoming, **kwargs)

        if axes == 'auto':
            # default: normalize over all but the second axis
            axes = (0,) + tuple(range(2, len(self.input_shape)))
        elif isinstance(axes, int):
            axes = (axes,)
        self.axes = axes
        # Statistics are reduced over one axis only: the sole entry of
        # ``axes``, or its second entry when several axes are given.
        if len(axes)==1:
            self.mean_axes=self.axes
        else:
            self.mean_axes=(axes[1],)

        self.epsilon = epsilon
        self.alpha = alpha

        # create parameters, ignoring all dimensions in axes
        shape = [size for axis, size in enumerate(self.input_shape)
                 if axis not in self.axes]
        # shape of the stored statistics: every axis not reduced over
        meanshape = [size for axis, size in enumerate(self.input_shape)
                 if axis not in self.mean_axes]
        # NOTE(review): only ``shape`` is validated here; a None entry in
        # ``meanshape`` is not caught by this check.
        if any(size is None for size in shape):
            raise ValueError("BatchNormLayer needs specified input sizes for "
                             "all axes not normalized over.")
        if beta is None:
            self.beta = None
        else:
            self.beta = self.add_param(beta, shape, 'beta',
                                       trainable=True, regularizable=False)
        if gamma is None:
            self.gamma = None
        else:
            self.gamma = self.add_param(gamma, shape, 'gamma',
                                        trainable=True, regularizable=True)
        self.mean = self.add_param(mean, meanshape, 'mean',
                                   trainable=False, regularizable=False)
        self.inv_std = self.add_param(inv_std, meanshape, 'inv_std',
                                      trainable=False, regularizable=False)
        #print('here',len(self.input_shape))
        self.sparsity=sparsity
        # Internal dropout layer built from the input shape: for 3-D input
        # the mask is shared over axes 0 and 1, otherwise over axis 0 only.
        if len(self.input_shape)==3:
            self.dropout=DropoutLayer((self.input_shape[0],self.input_shape[1],self.input_shape[2]), p=droprate, shared_axes=(0,1), **kwargs)
        else:
            self.dropout=DropoutLayer((self.input_shape[0],self.input_shape[1]), p=droprate, shared_axes=(0,), **kwargs)

    def get_output_for(self, input, deterministic=False,
                       batch_norm_use_averages=None,
                       batch_norm_update_averages=None, **kwargs):
        """Build the normalized (and dropped-out) output expression.

        Args:
          input: symbolic input tensor.
          deterministic: forwarded to the internal dropout layer; also the
            default for the two switches below.
          batch_norm_use_averages: use the stored ``mean``/``inv_std``
            instead of the batch statistics.  ``None`` means "same as
            ``deterministic``".
          batch_norm_update_averages: attach running-average updates to the
            stored statistics.  ``None`` means "``not deterministic``".

        Returns:
          ``(dropout(input) - mean) * (gamma * inv_std) + beta``.
        """
        # Batch statistics are taken from the input *before* dropout.
        if self.sparsity==1:
            input_mean = input.mean(self.mean_axes)
            input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) + self.epsilon))
        else:
            # Mean is scaled by 1/sparsity; the variance term is scaled by
            # 1/sparsity and reduced by (1-sparsity) * (rescaled mean)^2.
            # NOTE(review): presumably compensates for inputs in which only
            # a ``sparsity`` fraction of the entries is active - confirm.
            input_mean = input.mean(self.mean_axes)*(1.0/self.sparsity)
            input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) *(1.0/self.sparsity) -(1-self.sparsity)*T.sqr(input_mean) + self.epsilon))

        # Decide whether to use the stored averages or mini-batch statistics
        if batch_norm_use_averages is None:
            batch_norm_use_averages = deterministic
        use_averages = batch_norm_use_averages

        if use_averages:
            mean = self.mean
            inv_std = self.inv_std
        else:
            mean = input_mean
            inv_std = input_inv_std

        # Decide whether to update the stored averages
        if batch_norm_update_averages is None:
            batch_norm_update_averages = not deterministic
        update_averages = batch_norm_update_averages

        if update_averages:
            # Trick: To update the stored statistics, we create memory-aliased
            # clones of the stored statistics:
            running_mean = theano.clone(self.mean, share_inputs=False)
            running_inv_std = theano.clone(self.inv_std, share_inputs=False)
            # set a default update for them:
            running_mean.default_update = ((1 - self.alpha) * running_mean +
                                           self.alpha * input_mean)
            running_inv_std.default_update = ((1 - self.alpha) *
                                              running_inv_std +
                                              self.alpha * input_inv_std)
            # and make sure they end up in the graph without participating in
            # the computation (this way their default_update will be collected
            # and applied, but the computation will be optimized away):
            mean += 0 * running_mean
            inv_std += 0 * running_inv_std

        # prepare dimshuffle pattern inserting broadcastable axes as needed
        param_axes = iter(range(input.ndim - len(self.axes)))
        pattern = ['x' if input_axis in self.axes
                   else next(param_axes)
                   for input_axis in range(input.ndim)]

        # apply dimshuffle pattern to all parameters
        beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
        gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)

        # the statistics have their own shape, so they need their own
        # pattern, built from ``mean_axes`` rather than ``axes``
        mean_param_axes = iter(range(input.ndim - len(self.mean_axes)))
        mean_pattern = ['x' if input_axis in self.mean_axes
                   else next(mean_param_axes)
                   for input_axis in range(input.ndim)]
        mean = mean.dimshuffle(mean_pattern)
        inv_std = inv_std.dimshuffle(mean_pattern)

        # dropout is applied after the statistics were computed and before
        # the normalization itself
        input=self.dropout.get_output_for(input,deterministic=deterministic)

        # normalize
        normalized = (input - mean) * (gamma * inv_std) + beta
        return normalized
build_indrnn_network as build_rnn_network 22 | import Indrnn_action_network 23 | 24 | sys.setrecursionlimit(10000) 25 | 26 | parser = argparse.ArgumentParser(description='lstm action') 27 | opts.train_opts(parser) 28 | args = parser.parse_args() 29 | print (args) 30 | 31 | 32 | batch_size = args.batch_size 33 | seq_len=args.seq_len 34 | outputclass=60 35 | indim=50#150 36 | lr=args.lr 37 | opti=lasagne.updates.adam 38 | U_bound=Indrnn_action_network.U_bound 39 | 40 | X_sym = T.tensor4('inputs')#,dtype=theano.config.floatX) 41 | y_sym = T.ivector('label')#,dtype=theano.config.floatX) 42 | learn_net=build_rnn_network(X_sym) 43 | prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=False) 44 | loss = lasagne.objectives.categorical_crossentropy(prediction, y_sym).mean() 45 | if args.use_weightdecay_nohiddenW: 46 | params = lasagne.layers.get_all_params(learn_net['out'], regularizable=True) 47 | for para in params: 48 | if para.name!='hidden_to_hidden.W': 49 | loss += args.decayrate *lasagne.regularization.apply_penalty(para, lasagne.regularization.l2)#*T.clip(T.abs_(para)-1,0,100)) 50 | acc=T.mean(lasagne.objectives.categorical_accuracy(prediction, y_sym, top_k=1),dtype=theano.config.floatX) 51 | 52 | params = lasagne.layers.get_all_params(learn_net['out'], trainable=True) 53 | learning_ratetrain = T.scalar(name='learning_ratetrain',dtype=theano.config.floatX) 54 | grads = theano.grad(loss, params) 55 | updates = opti( grads, params, learning_rate=learning_ratetrain) 56 | print('Compiling') 57 | train_fn = theano.function([X_sym, y_sym,learning_ratetrain], [loss,acc], updates=updates) 58 | 59 | test_prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=True,batch_norm_use_averages=False) 60 | test_acc=T.mean(lasagne.objectives.categorical_accuracy(test_prediction, y_sym, top_k=1),dtype=theano.config.floatX) 61 | test_fn = theano.function([X_sym, y_sym], [test_acc,test_prediction]) 62 | 63 | bn_test_prediction = 
lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=True)#,batch_norm_use_averages=True 64 | bn_test_acc=T.mean(lasagne.objectives.categorical_accuracy(bn_test_prediction, y_sym, top_k=1),dtype=theano.config.floatX) 65 | bn_test_fn = theano.function([X_sym, y_sym], [bn_test_acc,bn_test_prediction]) 66 | 67 | 68 | learning_rate=np.float32(lr) 69 | if args.test_CV: 70 | train_datasets='train_CV_ntus' 71 | test_dataset='test_CV_ntus' 72 | else: 73 | train_datasets='train_ntus' 74 | test_dataset='test_ntus' 75 | from data_reader_numpy_witheval import DataHandler_train,DataHandler_eval 76 | from data_reader_numpy_test import DataHandler as testDataHandler 77 | dh_train = DataHandler_train(batch_size,seq_len, args.rotation_aug) 78 | dh_eval = DataHandler_eval(batch_size,seq_len) 79 | dh_test= testDataHandler(batch_size,seq_len) 80 | num_train_batches=int(np.ceil(dh_train.GetDatasetSize()/(batch_size+0.0))) 81 | num_eval_batches=int(np.ceil(dh_eval.GetDatasetSize()/(batch_size+0.0))) 82 | num_test_batches=int(np.ceil(dh_test.GetDatasetSize()/(batch_size+0.0))) 83 | labelname='test_ntus_label.npy' 84 | if args.test_CV: 85 | labelname='test_CV_ntus_label.npy' 86 | testlabels=np.load(labelname) 87 | 88 | aveloss=0 89 | aveacc=0 90 | lastacc=0 91 | dispFreq=20 92 | testnos=20 93 | stepcount=0 94 | patience=0 95 | patienceThre=10 96 | while True: 97 | x, y = dh_train.GetBatch() 98 | loss,acc=train_fn(x, y,learning_rate) 99 | stepcount+=1 100 | aveloss+=loss 101 | aveacc+=acc 102 | 103 | if args.constrain_U: 104 | for para in params: 105 | if para.name=='hidden_to_hidden.W': 106 | para.set_value(np.clip(para.get_value(),-U_bound,U_bound)) 107 | 108 | if np.isnan(loss): 109 | print ('NaN detected in cost') 110 | assert(2==3) 111 | if np.isinf(loss): 112 | print ('INF detected in cost') 113 | assert(2==3) 114 | 115 | if np.mod(stepcount, dispFreq) == 0: 116 | aveloss=aveloss/dispFreq 117 | aveacc=aveacc/dispFreq 118 | 
print("lr",learning_rate,"trainingerror",aveloss,"aveacc",aveacc) 119 | aveloss=0 120 | aveacc=0 121 | 122 | if np.mod(stepcount, num_train_batches)==0: 123 | stepcount=0 124 | aveacc=0 125 | eval_batches=num_eval_batches*args.eval_fold 126 | for testi in range(eval_batches): 127 | x, y = dh_eval.GetBatch() 128 | test_acc_top1,_=test_fn(x, y) 129 | aveacc+=test_acc_top1 130 | bn_aveacc=0 131 | for testi in range(eval_batches): 132 | x, y = dh_eval.GetBatch() 133 | bn_test_acc_top1,_=bn_test_fn(x, y) 134 | bn_aveacc+=bn_test_acc_top1 135 | 136 | print ('evalacc,bn_evalacc', aveacc/eval_batches, bn_aveacc/eval_batches) 137 | epocacc=bn_aveacc/eval_batches 138 | aveacc=0 139 | 140 | if (epocacc >lastacc):# and itericount>=0.8*rateschedulecount 141 | best_para=lasagne.layers.get_all_param_values(learn_net['out']) 142 | lastacc=epocacc 143 | patience=0 144 | elif patience>patienceThre: 145 | #learning_rate=np.float32(learning_rate*0.2) 146 | print ('learning rate',learning_rate) 147 | lasagne.layers.set_all_param_values(learn_net['out'], best_para) 148 | patience=0 149 | learning_rate=np.float32(learning_rate*0.1) 150 | if learning_rate200): 175 | learning_rate=np.float32(learning_rate*0.1) 176 | print ('learning rate',learning_rate) 177 | count=0 178 | if learning_rate<1e-6: 179 | break 180 | 181 | tmse=0 182 | 183 | save_name=args.model+str(seq_len) 184 | np.savez(save_name, *lasagne.layers.get_all_param_values(learn_net['out'])) 185 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Independently Recurrent Neural Networks 2 | This code is to implement the [IndRNN](https://arxiv.org/abs/1803.04831). It is based on Theano and Lasagne. Please refer to [this one](https://github.com/Sunnydreamrain/IndRNN_pytorch) for pytorch. 3 | 4 | Please cite the following paper if you find it useful. 
5 | [Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.](https://arxiv.org/abs/1803.04831) 6 | 7 | @inproceedings{li2018independently, 8 | title={Independently recurrent neural network (indrnn): Building A longer and deeper RNN}, 9 | author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo}, 10 | booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, 11 | pages={5457--5466}, 12 | year={2018} 13 | } 14 | 15 | # Summary 16 | In IndRNNs, neurons in each layer are independent from each other, and the cross-channel information is obtained through stacking multiple layers. 17 | Advantages over the RNN and/or LSTM: 18 | - The gradient backpropagation through time can be regulated to effectively address the gradient vanishing and exploding problems. 19 | - Long-term memory can be kept with IndRNNs to process long sequences. Experiments have demonstrated that an IndRNN can process sequences of over 5000 steps well. 20 | - An IndRNN can work well with non-saturated functions such as relu as the activation function and be trained robustly. 21 | - Multiple layers of IndRNNs can be efficiently stacked, especially with residual connections over layers, to increase the depth of the network. An example of a 21-layer IndRNN is demonstrated in the experiments. 22 | - The behaviour of IndRNN neurons in each layer is easy to interpret due to the independence of neurons in each layer. 23 | 24 | Experiments have demonstrated that IndRNN performs much better than the traditional RNN and LSTM models on various tasks such as the adding problem, sequential MNIST classification, language modelling and action recognition. 25 | 26 | # Usage 27 | `IndRNN.py` provides the IndRNN function as described in the paper. 28 | `IndRNN_onlyrecurrent.py` provides only the recurrent+activation of the IndRNN function.
Therefore, processing of the input with a dense connection or convolution operation is needed. This is useful for adding batch normalization (BN) between the processing of the input and the activation function. 29 | 30 | ### Requirements 31 | [Theano](http://deeplearning.net/software/theano/install.html) and [Lasagne](https://lasagne.readthedocs.io/en/latest/user/installation.html) need to be installed first. 32 | ``` 33 | pip install --upgrade https://github.com/Theano/Theano/archive/master.zip 34 | pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip 35 | ``` 36 | 37 | ## For the adding example 38 | `python -u adding.py` 39 | Different options are available in adding.py. 40 | Example: `python -u adding.py --model indrnn --seq_len 100` 41 | Example of using GPU: `THEANO_FLAGS='floatX=float32,device=cuda0,mode=FAST_RUN' python -u adding.py --model indrnn --seq_len 100` 42 | 43 | ## For the Sequential MNIST example 44 | `python -u pixelmnist.py --use_bn_afterrnn` 45 | or with options: 46 | `python -u pixelmnist.py --model indrnn --num_layers 6 --hidden_units 128 --use_bn_afterrnn` 47 | Example of using GPU: `THEANO_FLAGS='floatX=float32,device=cuda0,mode=FAST_RUN' python -u pixelmnist.py --model indrnn --num_layers 6 --hidden_units 128 --use_bn_afterrnn` 48 | 49 | For this task, the batch normalization (BN) is used. It can be used before the activation function (relu) or after it. In our experiments, it converges faster by putting BN after the activation function. 50 | 51 | ## For the language modeling example using character-level Penn Treebank (PTB-c) 52 | `python -u penntree_charlevel_rernn.py --data_aug --hidden_units 2000 --use_dropout --num_layers 6 --droprate 0.25 --seq_len 150 --use_weightdecay_nohiddenW` 53 | `data_aug` here only provides a different start for each training epoch to provide stable statistics for BN.
54 | or using the residual model: 55 | `python -u penntree_charlevel_rernn.py --data_aug --hidden_units 2000 --use_residual --num_layers 11 --use_dropout --droprate 0.3 --seq_len 150 --use_weightdecay_nohiddenW` 56 | The example code provides the very basic implementation of residual IndRNN where the number of units in all the IndRNN layers is the same and the left branch is fixed to be 1 without further using weight processing. Other network architectures can be explored which may provide better results. 57 | 58 | For this task, output is provided at each time step and can only use the information before the current time step. Therefore, the statistics (mean and variance) of the batch normalization (BN) are obtained for each time step. It is used before the activation which is more robust than putting it after the activation. The main reason is that the outputs of all the IndRNN layers at the last time step are further used as the initialization of the next batch. By putting BN before the activation (which is also before the recurrent accumulation), the statistics of BN are more stable than putting BN after the activation. 59 | 60 | ## For the language modeling example using word-level Penn Treebank 61 | Please find details in the directory [wordPTB](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/tree/master/wordPTB). 62 | 63 | ## For the skeleton-based Action Recognition example 64 | Please find details in the directory [action recognition](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/tree/master/action%20recognition). 65 | 66 | # Considerations in implementation 67 | ### 1, Initialization of the recurrent weights 68 | For relu, `Uniform(0,1)` is used to make different neurons keep different kinds of memory.
But for problems that only use the output of the last time step such as the adding problem, MNIST classification problem, and action recognition problem, the recurrent weights for the last IndRNN layer (caution: only the last one not all) can be initialized to be all `1` or a proper range `(1-epsilon, 1+epsilon)` where `epsilon` is a small number, since only long-term memory is needed for the output of this layer. Examples are shown in [adding.py](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/blob/master/adding/adding.py#L49). 69 | 70 | ### 2, Constraint of the recurrent weights 71 | For relu, generally it can be set to `[-U_bound, U_bound]` where `U_bound=pow(args.MAG, 1.0 / seq_len)` and `MAG` can be 2 or 10 or others. If the sequence is very long, it can be `[-1, 1]` since `U_bound` is very close to 1 and the precision of GPU is limited. If the sequence is short such as 20, no constraint is needed. Example of the constraint is shown at [adding.py](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/blob/master/adding/adding.py#L150). By the way, this constraint can also be implemented as a weight decay of ||max(0,|U|-U_bound)||. 72 | For simplicity, the constraint can always be set to `[-1, 1]` as it can keep long-term memory already and the difference in performance is small. 73 | 74 | ### 3, Usage of batch normalization (BN) 75 | Generally, over 3 layers, BN can help accelerate the training. BN can be used before the activation function or after it. In our experiments, we find it converges faster by putting BN after the activation function. However, for tasks such as PTB_c where the output of one batch is further used as the initialization of the next batch, it is better to put BN before activation as mentioned in the example above. 76 | 77 | ### 4, Learning rate 78 | In our experiments, ADAM with a learning rate of 2e-4 works well. 79 | 80 | ### 5, Weight decay 81 | If weight decay is used, there is no need to apply it to the recurrent weights.
82 | 83 | ### 6, Usage of dropout 84 | Dropout (if used) is applied with the same mask over time. 85 | 86 | ### Note 87 | The above considerations are just suggestions. I did not explore lots of training techniques such as training methods, initialization techniques. So better results may be achieved with other options. 88 | 89 | # Other implementations 90 | Tensorflow: 91 | [https://github.com/batzner/indrnn](https://github.com/batzner/indrnn) 92 | Keras: 93 | [https://github.com/titu1994/Keras-IndRNN](https://github.com/titu1994/Keras-IndRNN) 94 | Pytorch: 95 | [https://github.com/Sunnydreamrain/IndRNN_pytorch](https://github.com/Sunnydreamrain/IndRNN_pytorch) 96 | [https://github.com/StefOe/indrnn-pytorch](https://github.com/StefOe/indrnn-pytorch) 97 | [https://github.com/theSage21/IndRNN](https://github.com/theSage21/IndRNN) 98 | [https://github.com/zhangxu0307/Ind-RNN](https://github.com/zhangxu0307/Ind-RNN) 99 | CNTK: 100 | [https://github.com/delzac/cntkx](https://github.com/delzac/cntkx) 101 | Chainer: 102 | [https://github.com/0shimax/chainer-IndRNN](https://github.com/0shimax/chainer-IndRNN) 103 | -------------------------------------------------------------------------------- /mnist/pixelmnist.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import argparse 4 | import os 5 | 6 | import time 7 | from collections import OrderedDict 8 | import lasagne 9 | import theano 10 | import numpy as np 11 | import theano.tensor as T 12 | 13 | from lasagne.layers import InputLayer,ReshapeLayer,DimshuffleLayer,Gate,DenseLayer,ElemwiseSumLayer 14 | from lasagne.layers import ConcatLayer,NonlinearityLayer,DropoutLayer,BatchNormLayer 15 | from lasagne.nonlinearities import softmax, rectify,tanh,leaky_rectify 16 | from lasagne.init import Uniform, Normal,HeNormal 17 | 18 | from lasagne.layers import RecurrentLayer,LSTMLayer 19 | from IndRNN_onlyrecurrent import 
IndRNNLayer_onlyrecurrent as indrnn_onlyrecurrent 20 | 21 | parser = argparse.ArgumentParser(description='IndRNN solving the pixel MNIST problem') 22 | parser.add_argument('--model', type=str, default='indrnn', help='models') 23 | parser.add_argument('--num_layers', type=int, default=6) 24 | parser.add_argument('--hidden_units', type=int, default=128) 25 | parser.add_argument('--batch_size', type=int, default=32) 26 | parser.add_argument('--lr', type=np.float32, default=2e-4, help='lr') 27 | parser.add_argument('--ini', type=np.float32, default=0.001, help='ini') 28 | parser.add_argument('--gradclipvalue', type=np.float32, default=10) 29 | parser.add_argument('--use_permute', action='store_true', default=False) 30 | parser.add_argument('--use_weightdecay_nohiddenW', action='store_true', default=False) 31 | parser.add_argument('--decayrate', type=np.float32, default=5e-4,help='lr') 32 | parser.add_argument('--ini_b', type=np.float32, default=0.0) 33 | parser.add_argument('--MAG', type=int, default=2) 34 | 35 | 36 | parser.add_argument('--use_bn_afterrnn', action='store_true', default=False) 37 | args = parser.parse_args() 38 | print (args) 39 | 40 | 41 | batch_size = args.batch_size 42 | hidden_units=args.hidden_units 43 | outputclass=10 44 | 45 | from Data_gen import DataHandler,testDataHandler 46 | if args.use_permute: 47 | from Data_gen_permute import DataHandler,testDataHandler 48 | dh=DataHandler(batch_size) 49 | dh_test=testDataHandler(batch_size) 50 | x,y=dh.get_batch() 51 | seq_len=x.shape[1] 52 | feature_size=x.shape[2] 53 | 54 | U_bound=pow(args.MAG, 1.0 / seq_len) 55 | U_lowbound=pow(1.0/args.MAG, 1.0 / seq_len) 56 | act=rectify 57 | lr=args.lr 58 | num_layers=args.num_layers 59 | opti=lasagne.updates.adam 60 | 61 | 62 | def build_lstm_network(rnnmodel): 63 | net = {} 64 | net['input'] = InputLayer((batch_size, seq_len, feature_size)) 65 | 
net['rnn']=rnnmodel(net['input'],hidden_units,forgetgate=lasagne.layers.Gate(b=lasagne.init.Constant(1.)),peepholes=False, only_return_final=True,grad_clipping=args.gradclipvalue) 66 | net['out']=DenseLayer(net['rnn'],outputclass,nonlinearity=softmax) 67 | return net 68 | def build_rnn_network(rnnmodel): 69 | net = {} 70 | net['input'] = InputLayer((batch_size, seq_len, feature_size)) 71 | net['rnn']=rnnmodel(net['input'],hidden_units,nonlinearity=act,W_in_to_hid=Normal(args.ini),W_hid_to_hid=lambda shape: np.identity(hidden_units,dtype=np.float32),only_return_final=True ,grad_clipping=args.gradclipvalue) 72 | net['out']=DenseLayer(net['rnn'],outputclass,nonlinearity=softmax) 73 | return net 74 | 75 | 76 | ini_W=HeNormal(gain=np.sqrt(2)/2.0) 77 | if args.use_bn_afterrnn: 78 | ini_W=Uniform(args.ini) 79 | 80 | 81 | def build_res_rnn_network(rnnmodel): 82 | net = {} 83 | net['input'] = InputLayer((batch_size, seq_len, feature_size)) 84 | net['rnn0']=DimshuffleLayer(net['input'],(1,0,2)) 85 | for l in range(1, num_layers+1): 86 | hidini=0 87 | if l==num_layers: 88 | hidini=U_lowbound 89 | 90 | net['rnn%d'%(l-1)]=ReshapeLayer(net['rnn%d'%(l-1)], (batch_size* seq_len, -1)) 91 | net['rnn%d'%(l-1)]=DenseLayer(net['rnn%d'%(l-1)],hidden_units,W=ini_W,b=Uniform(range=(0,args.ini_b)),nonlinearity=None) #W=Uniform(ini_rernn_in_to_hid), # 92 | net['rnn%d'%(l-1)]=ReshapeLayer(net['rnn%d'%(l-1)], (seq_len, batch_size, -1)) 93 | 94 | net['rnn%d'%l]=net['rnn%d'%(l-1)] 95 | if not args.use_bn_afterrnn: 96 | net['rnn%d'%l]=BatchNormLayer(net['rnn%d'%l],axes= (0,1),beta=Uniform(range=(0,args.ini_b))) 97 | 98 | net['rnn%d'%l]=rnnmodel(net['rnn%d'%l],hidden_units,W_hid_to_hid=Uniform(range=(hidini,U_bound)),nonlinearity=act,only_return_final=False, grad_clipping=args.gradclipvalue) 99 | if args.use_bn_afterrnn: 100 | net['rnn%d'%l]=BatchNormLayer(net['rnn%d'%l],axes= (0,1)) 101 | if l==num_layers: 102 | 
net['rnn%d'%num_layers]=lasagne.layers.SliceLayer(net['rnn%d'%num_layers],indices=-1, axis=0) 103 | 104 | net['out']=DenseLayer(net['rnn%d'%num_layers],outputclass,nonlinearity=softmax) 105 | return net 106 | 107 | 108 | if args.model=='rnn': 109 | learn_net=build_rnn_network(RecurrentLayer) 110 | elif args.model=='lstm': 111 | learn_net=build_lstm_network(LSTMLayer) 112 | elif args.model=='indrnn': 113 | learn_net=build_res_rnn_network(indrnn_onlyrecurrent) 114 | 115 | 116 | X_sym = T.tensor3('inputs',dtype=theano.config.floatX) 117 | y_sym = T.ivector()#T.vector('label',dtype=theano.config.floatX) 118 | 119 | prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=False)#,batch_norm_use_averages=True 120 | loss = T.mean(lasagne.objectives.categorical_crossentropy(prediction, y_sym)) 121 | acc=T.mean(lasagne.objectives.categorical_accuracy(prediction, y_sym, top_k=1),dtype=theano.config.floatX) 122 | if args.use_weightdecay_nohiddenW: 123 | params = lasagne.layers.get_all_params(learn_net['out'], regularizable=True) 124 | for para in params: 125 | if para.name!='hidden_to_hidden.W': 126 | loss += args.decayrate *lasagne.regularization.apply_penalty(para, lasagne.regularization.l2)#*T.clip(T.abs_(para)-1,0,100)) 127 | 128 | 129 | params = lasagne.layers.get_all_params(learn_net['out'], trainable=True) 130 | 131 | learning_ratetrain = T.scalar(name='learning_ratetrain',dtype=theano.config.floatX) 132 | 133 | grads = theano.grad(loss, params) 134 | 135 | updates = opti( grads, params, learning_rate=learning_ratetrain)#nesterov_momentum( loss, params, learning_rate=learning_ratetrain)# 136 | 137 | print('Compiling') 138 | train_fn = theano.function([X_sym, y_sym,learning_ratetrain], [loss,acc], updates=updates) 139 | 140 | test_prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=True,batch_norm_use_averages=False) 141 | test_loss = T.mean(lasagne.objectives.categorical_crossentropy(test_prediction, y_sym)) 142 | 
test_acc=T.mean(lasagne.objectives.categorical_accuracy(test_prediction, y_sym, top_k=1),dtype=theano.config.floatX)
test_fn = theano.function([X_sym, y_sym], [test_loss,test_acc])

# Second evaluation function: same network, deterministic, but without the
# batch_norm_use_averages=False override that test_prediction is built with.
bn_test_prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=True)
bn_test_loss = T.mean(lasagne.objectives.categorical_crossentropy(bn_test_prediction, y_sym))
bn_test_acc=T.mean(lasagne.objectives.categorical_accuracy(bn_test_prediction, y_sym, top_k=1),dtype=theano.config.floatX)
bn_test_fn = theano.function([X_sym, y_sym], [bn_test_loss,bn_test_acc])


def _mean_test_accuracy(eval_fn):
    """Return the mean accuracy of ``eval_fn`` over one pass of ``dh_test``.

    ``eval_fn`` takes (inputs, targets) and returns (loss, accuracy).
    Raises ValueError when the handler reports no full batch, a case in
    which the previous ``while(1)`` loops could never reach their exit test.
    """
    num_batches = dh_test.GetDatasetSize()
    if num_batches < 1:
        raise ValueError('test set holds fewer samples than one batch')
    total_acc = 0
    for _ in range(num_batches):
        inputs, targets = dh_test.get_batch()
        _, batch_acc = eval_fn(inputs, targets)
        total_acc += batch_acc
    return total_acc / num_batches


learning_rate=np.float32(lr)
print ('learning rate',learning_rate)
tacc=0
count=0
for batchi in range(1,10000000):
    x,y=dh.get_batch()

    if args.model=='indrnn':
        # Clamp the recurrent weights to [-U_bound, U_bound] before each step.
        for para in params:
            if para.name=='hidden_to_hidden.W':
                para.set_value(np.clip(para.get_value(),-1*U_bound,U_bound))

    mse,acc=train_fn(x, y,learning_rate)
    tacc+=acc
    count+=1

    if batchi%1000==0:
        # Report the running training accuracy, then evaluate with both
        # compiled test functions.
        print ('train acc',tacc/count)
        count=0
        tacc=0

        print ("accuracy: ", _mean_test_accuracy(test_fn))
        print ("bn_accuracy: ", _mean_test_accuracy(bn_test_fn))

    if batchi%(100*6000)==0:
        # Divide the learning rate by 10 every 600000 iterations and stop
        # once it has become negligible.
        learning_rate=np.float32(learning_rate*0.1)
        print ('learning rate',learning_rate)
        if learning_rate<1e-8:
            break

save_name='MNIST_'+args.model
np.savez(save_name, *lasagne.layers.get_all_param_values(learn_net['out']))
class DataHandler(object):
    """Hand out training batches while the next one is built in the background.

    ``batch_thread`` fills ``self.thread_result`` with the keys 'data' and
    'label'.  Each ``get_batch`` call waits for the pending worker, takes
    its result and immediately starts a worker for the following batch.
    """

    def __init__(self, batch_size):
        self.batch_size_ = batch_size  # batch size

        # Placeholders shaped like the batches batch_thread produces; they
        # are overwritten by the first get_batch() call.
        self.batch_data_ = np.zeros((self.batch_size_, X_train.shape[1], X_train.shape[2]), dtype=np.float32)
        self.batch_label_ = np.zeros((self.batch_size_), dtype=np.int32)

        self.thread_result = {}
        self.thread = None
        self.batch_advancer = batch_thread(self.thread_result, self.batch_size_)

        # Build the first batch synchronously so thread_result is populated
        # before anyone calls get_batch().
        self.dispatch_worker()
        self.join_worker()

    def get_batch(self):
        """Return the prefetched (data, label) pair and start the next prefetch."""
        if self.thread is not None:
            self.join_worker()

        self.batch_data_ = self.thread_result['data']
        self.batch_label_ = self.thread_result['label']

        self.dispatch_worker()
        return self.batch_data_, self.batch_label_

    def dispatch_worker(self):
        """Start a thread that runs the batch producer once."""
        assert self.thread is None
        self.thread = Thread(target=self.batch_advancer)
        self.thread.start()

    def join_worker(self):
        """Wait for the running producer thread and clear the handle."""
        assert self.thread is not None
        self.thread.join()
        self.thread = None

    def GetDatasetSize(self):
        """Return the number of full batches in one pass over the training set.

        The previous expression read ``Aug_Y_train``, a name this module
        never defines; this now mirrors ``testDataHandler.GetDatasetSize``.
        """
        return len(y_train) // self.batch_size_
class testDataHandler(object):
    """Prefetching front end for ``testbatch_thread``.

    One worker thread at a time fills ``self.thread_result``; ``get_batch``
    collects that result and launches the worker for the next batch.
    """

    def __init__(self, batch_size):
        self.batch_size_ = batch_size

        # Zero placeholders, replaced on the first get_batch() call.
        data_shape = (self.batch_size_, X_test.shape[1], X_test.shape[2])
        self.batch_data_ = np.zeros(data_shape, dtype=np.float32)
        self.batch_label_ = np.zeros((self.batch_size_), dtype=np.int32)

        self.thread_result = {}
        self.thread = None
        self.batch_advancer = testbatch_thread(self.thread_result, self.batch_size_)

        # Produce the first batch before returning.
        self.dispatch_worker()
        self.join_worker()

    def get_batch(self):
        """Return the pending (data, label) batch and prefetch the next one."""
        if self.thread is not None:
            self.join_worker()

        produced = self.thread_result
        self.batch_data_ = produced['data']
        self.batch_label_ = produced['label']

        self.dispatch_worker()
        return self.batch_data_, self.batch_label_

    def dispatch_worker(self):
        """Run the batch producer once on a new thread."""
        assert self.thread is None
        worker = Thread(target=self.batch_advancer)
        self.thread = worker
        worker.start()

    def join_worker(self):
        """Block until the current producer thread has finished."""
        assert self.thread is not None
        worker = self.thread
        worker.join()
        self.thread = None

    def GetDatasetSize(self):
        """Return how many full batches the test labels provide."""
        num_samples = len(y_test)
        return num_samples // self.batch_size_
class finaltestDataHandler(object):
    """Prefetching front end for ``finaltestbatch_thread``.

    NOTE(review): ``finaltestbatch_thread`` reads ``pre_X_test``, which is
    not defined anywhere in this module, and both classes allocate
    ``(2 * batch_size, 3, 32, 32)`` buffers.  This looks like a leftover
    from another dataset; confirm whether the handler is still used.
    """

    def __init__(self, batch_size):
        self.batch_size_ = batch_size

        # Placeholders with twice the batch size, matching the worker's output.
        doubled = self.batch_size_ * 2
        self.batch_data_ = np.zeros((doubled, 3, 32, 32), dtype=np.float32)
        self.batch_label_ = np.zeros((doubled), dtype=np.int32)

        self.thread_result = {}
        self.thread = None
        self.batch_advancer = finaltestbatch_thread(self.thread_result, self.batch_size_)

        # Produce the first batch before returning.
        self.dispatch_worker()
        self.join_worker()

    def get_batch(self):
        """Return the pending (data, label) batch and prefetch the next one."""
        if self.thread is not None:
            self.join_worker()

        produced = self.thread_result
        self.batch_data_ = produced['data']
        self.batch_label_ = produced['label']

        self.dispatch_worker()
        return self.batch_data_, self.batch_label_

    def dispatch_worker(self):
        """Run the batch producer once on a new thread."""
        assert self.thread is None
        worker = Thread(target=self.batch_advancer)
        self.thread = worker
        worker.start()

    def join_worker(self):
        """Block until the current producer thread has finished."""
        assert self.thread is not None
        worker = self.thread
        worker.join()
        self.thread = None

    def GetDatasetSize(self):
        """Return how many full batches the test labels provide."""
        num_samples = len(y_test)
        return num_samples // self.batch_size_
sys 4 | import os 5 | 6 | 7 | def load_dataset(): 8 | # We first define a download function, supporting both Python 2 and 3. 9 | if sys.version_info[0] == 2: 10 | from urllib import urlretrieve 11 | else: 12 | from urllib.request import urlretrieve 13 | 14 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 15 | print("Downloading %s" % filename) 16 | urlretrieve(source + filename, filename) 17 | 18 | # We then define functions for loading MNIST images and labels. 19 | # For convenience, they also download the requested files if needed. 20 | import gzip 21 | 22 | def load_mnist_images(filename): 23 | if not os.path.exists(filename): 24 | download(filename) 25 | # Read the inputs in Yann LeCun's binary format. 26 | with gzip.open(filename, 'rb') as f: 27 | data = np.frombuffer(f.read(), np.uint8, offset=16) 28 | # The inputs are vectors now, we reshape them to monochrome 2D images, 29 | # following the shape convention: (examples, channels, rows, columns) 30 | data = data.reshape(-1, 1, 28, 28) 31 | # The inputs come as bytes, we convert them to float32 in range [0,1]. 32 | # (Actually to range [0, 255/256], for compatibility to the version 33 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.) 34 | return data / np.float32(256) 35 | 36 | def load_mnist_labels(filename): 37 | if not os.path.exists(filename): 38 | download(filename) 39 | # Read the labels in Yann LeCun's binary format. 40 | with gzip.open(filename, 'rb') as f: 41 | data = np.frombuffer(f.read(), np.uint8, offset=8) 42 | # The labels are vectors of integers now, that's exactly what we want. 43 | return data 44 | 45 | # We can now download and read the training and test set images and labels. 
class DataHandler(object):
    """Hand out training batches while the next one is built in the background.

    ``batch_thread`` fills ``self.thread_result`` with the keys 'data' and
    'label'.  Each ``get_batch`` call waits for the pending worker, takes
    its result and immediately starts a worker for the following batch.
    """

    def __init__(self, batch_size):
        self.batch_size_ = batch_size  # batch size

        # Placeholders shaped like the batches batch_thread produces; they
        # are overwritten by the first get_batch() call.
        self.batch_data_ = np.zeros((self.batch_size_, X_train.shape[1], X_train.shape[2]), dtype=np.float32)
        self.batch_label_ = np.zeros((self.batch_size_), dtype=np.int32)

        self.thread_result = {}
        self.thread = None
        self.batch_advancer = batch_thread(self.thread_result, self.batch_size_)

        # Build the first batch synchronously so thread_result is populated
        # before anyone calls get_batch().
        self.dispatch_worker()
        self.join_worker()

    def get_batch(self):
        """Return the prefetched (data, label) pair and start the next prefetch."""
        if self.thread is not None:
            self.join_worker()

        self.batch_data_ = self.thread_result['data']
        self.batch_label_ = self.thread_result['label']

        self.dispatch_worker()
        return self.batch_data_, self.batch_label_

    def dispatch_worker(self):
        """Start a thread that runs the batch producer once."""
        assert self.thread is None
        self.thread = Thread(target=self.batch_advancer)
        self.thread.start()

    def join_worker(self):
        """Wait for the running producer thread and clear the handle."""
        assert self.thread is not None
        self.thread.join()
        self.thread = None

    def GetDatasetSize(self):
        """Return the number of full batches in one pass over the training set.

        The previous expression read ``Aug_Y_train``, a name this module
        never defines; this now mirrors ``testDataHandler.GetDatasetSize``.
        """
        return len(y_train) // self.batch_size_
class testDataHandler(object):
    """Prefetching front end for ``testbatch_thread``.

    One worker thread at a time fills ``self.thread_result``; ``get_batch``
    collects that result and launches the worker for the next batch.
    """

    def __init__(self, batch_size):
        self.batch_size_ = batch_size

        # Zero placeholders, replaced on the first get_batch() call.
        data_shape = (self.batch_size_, X_test.shape[1], X_test.shape[2])
        self.batch_data_ = np.zeros(data_shape, dtype=np.float32)
        self.batch_label_ = np.zeros((self.batch_size_), dtype=np.int32)

        self.thread_result = {}
        self.thread = None
        self.batch_advancer = testbatch_thread(self.thread_result, self.batch_size_)

        # Produce the first batch before returning.
        self.dispatch_worker()
        self.join_worker()

    def get_batch(self):
        """Return the pending (data, label) batch and prefetch the next one."""
        if self.thread is not None:
            self.join_worker()

        produced = self.thread_result
        self.batch_data_ = produced['data']
        self.batch_label_ = produced['label']

        self.dispatch_worker()
        return self.batch_data_, self.batch_label_

    def dispatch_worker(self):
        """Run the batch producer once on a new thread."""
        assert self.thread is None
        worker = Thread(target=self.batch_advancer)
        self.thread = worker
        worker.start()

    def join_worker(self):
        """Block until the current producer thread has finished."""
        assert self.thread is not None
        worker = self.thread
        worker.join()
        self.thread = None

    def GetDatasetSize(self):
        """Return how many full batches the test labels provide."""
        num_samples = len(y_test)
        return num_samples // self.batch_size_
class finaltestDataHandler(object):
    """Prefetching front end for ``finaltestbatch_thread``.

    NOTE(review): ``finaltestbatch_thread`` reads ``pre_X_test``, which is
    not defined anywhere in this module, and both classes allocate
    ``(2 * batch_size, 3, 32, 32)`` buffers.  This looks like a leftover
    from another dataset; confirm whether the handler is still used.
    """

    def __init__(self, batch_size):
        self.batch_size_ = batch_size

        # Placeholders with twice the batch size, matching the worker's output.
        doubled = self.batch_size_ * 2
        self.batch_data_ = np.zeros((doubled, 3, 32, 32), dtype=np.float32)
        self.batch_label_ = np.zeros((doubled), dtype=np.int32)

        self.thread_result = {}
        self.thread = None
        self.batch_advancer = finaltestbatch_thread(self.thread_result, self.batch_size_)

        # Produce the first batch before returning.
        self.dispatch_worker()
        self.join_worker()

    def get_batch(self):
        """Return the pending (data, label) batch and prefetch the next one."""
        if self.thread is not None:
            self.join_worker()

        produced = self.thread_result
        self.batch_data_ = produced['data']
        self.batch_label_ = produced['label']

        self.dispatch_worker()
        return self.batch_data_, self.batch_label_

    def dispatch_worker(self):
        """Run the batch producer once on a new thread."""
        assert self.thread is None
        worker = Thread(target=self.batch_advancer)
        self.thread = worker
        worker.start()

    def join_worker(self):
        """Block until the current producer thread has finished."""
        assert self.thread is not None
        worker = self.thread
        worker.join()
        self.thread = None

    def GetDatasetSize(self):
        """Return how many full batches the test labels provide."""
        num_samples = len(y_test)
        return num_samples // self.batch_size_
def rotate(input, s, b):
    """Transform (x, y, z) coordinates by the angles ``s`` and ``b``.

    Args:
        input: array or array-like whose total size is a multiple of 3; it
            is processed as rows of (x, y, z) and may have any shape.
        s: first angle, in degrees.
        b: second angle, in degrees.

    Returns:
        A new array with the same shape as ``input``.

    The expressions are kept term-for-term as written.  Algebraically they
    reduce to::

        RX = X*cos(b) + Y*sin(b)*sin(s)
        RY = Y*cos(s)
        RZ = Z - X*sin(b) + Y*cos(b)*sin(s)
    """
    # Accept nested lists/tuples as well as ndarrays.
    points = np.asarray(input)
    shape = points.shape
    points = points.reshape((-1, 3))
    XT = points[:, 0]
    YT = points[:, 1]
    ZT = points[:, 2]

    # Degrees to radians, then evaluate each trigonometric term once.
    s = s / 180.0 * np.pi
    b = b / 180.0 * np.pi
    sin_s = np.sin(s)
    cos_s = np.cos(s)
    sin_b = np.sin(b)
    cos_b = np.cos(b)

    RX = XT*cos_b - ZT*sin_b + ZT*sin_b*cos_s + YT*sin_b*sin_s - ZT*sin_b*(cos_s - 1)
    RY = YT*cos_s
    RZ = ZT*cos_b*cos_s - ZT*(cos_b - 1) - XT*sin_b + YT*cos_b*sin_s - ZT*cos_b*(cos_s - 1)

    # Reassemble the rows and restore the caller's original shape.
    output = np.stack([RX, RY, RZ], axis=1)
    return output.reshape(shape)
67 | templabel=[] 68 | batch_data=[] 69 | for j in range(self.batch_size_): 70 | self.idx +=1 71 | if self.idx == train_no: 72 | self.idx =0 73 | np.random.shuffle(shufflevideolist_train) 74 | shufflevideoindex=shufflevideolist_train[self.idx] 75 | 76 | 77 | label=label_handle[shufflevideoindex] 78 | templabel.append(np.int32(label)) 79 | dataset=data_handle[shufflevideoindex] 80 | len_data=len_handle[shufflevideoindex] 81 | 82 | sample=np.zeros(tuple((self.seq_len,)+data_handle[shufflevideoindex].shape[1:])) 83 | lenperseg=len_data//self.seq_len 84 | if lenperseg==1 and len_data>self.seq_len: 85 | startid=np.random.randint(len_data-self.seq_len) 86 | sample=dataset[startid:startid+self.seq_len] 87 | #print('wrong data length first') 88 | elif len_data<=self.seq_len: 89 | startid=np.random.randint(max(self.seq_len-len_data,int(0.25*self.seq_len))) 90 | endid=min(self.seq_len,startid+len_data) 91 | datasid=0 92 | dataeid=len_data 93 | if startid+len_data>self.seq_len: 94 | datasid=np.random.randint(startid+len_data-self.seq_len) 95 | dataeid=datasid+self.seq_len-startid 96 | sample[startid:endid]=dataset[datasid:dataeid] 97 | else: 98 | for framei in range(self.seq_len): 99 | if framei==self.seq_len-1: 100 | index=lenperseg*framei + np.random.randint(len_data-lenperseg*(self.seq_len-1)) 101 | else: 102 | index=lenperseg*framei + np.random.randint(lenperseg) 103 | sample[framei]=dataset[index] 104 | 105 | #print(sample) 106 | if self.use_rotation: 107 | if np.random.randint(2): 108 | s=np.random.randint(2)*45#random(1)*45 109 | b=np.random.randint(2)*45#random(1)*45 110 | #print(sample.shape) 111 | sample=rotate(sample,s,b) 112 | #print (index,lenperseg) 113 | # rframei=np.random.randint(len_data) 114 | # tmean=(dataset[rframei,0,:]+dataset[rframei,12,:]+dataset[rframei,16,:])/3 115 | # sample=sample-tmean 116 | batch_data.append(sample) ###Be careful. It has to be different. Otherwise, the appended data will change as well. 
class DataHandler_train(object):
    """Prefetching front end for ``batch_thread_train``.

    A single worker thread at a time fills ``self.thread_result`` with the
    keys 'data' and 'label'; ``GetBatch`` collects that result and starts
    the worker for the next batch.
    """

    def __init__(self, batch_size, seq_len, use_rotation=False):
        self.batch_size_ = batch_size
        # NOTE(review): this seeds Python's `random`, while the batch threads
        # visible in this file sample with `np.random` -- confirm intent.
        random.seed(10)

        self.thread_result = {}
        self.thread = None

        self.batch_advancer = batch_thread_train(
            self.thread_result, self.batch_size_, seq_len, use_rotation)

        # Produce the first batch before returning.
        self.dispatch_worker()
        self.join_worker()

    def GetBatch(self):
        """Return the pending (data, label) batch and prefetch the next one."""
        if self.thread is not None:
            self.join_worker()

        produced = self.thread_result
        self.batch_data_ = produced['data']
        self.batch_label_ = produced['label']

        self.dispatch_worker()
        return self.batch_data_, self.batch_label_

    def dispatch_worker(self):
        """Run the batch producer once on a new thread."""
        assert self.thread is None
        worker = Thread(target=self.batch_advancer)
        self.thread = worker
        worker.start()

    def join_worker(self):
        """Block until the current producer thread has finished."""
        assert self.thread is not None
        worker = self.thread
        worker.join()
        self.thread = None

    def GetDatasetSize(self):
        """Return the module-level ``train_no`` (number of training videos)."""
        return train_no
178 | templabel=[] 179 | batch_data=[] 180 | for j in range(self.batch_size_): 181 | self.idx +=1 182 | if self.idx == test_no: 183 | self.idx =0 184 | np.random.shuffle(shufflevideolist_test) 185 | shufflevideoindex=shufflevideolist_test[self.idx] 186 | 187 | 188 | label=label_handle[shufflevideoindex] 189 | templabel.append(np.int32(label)) 190 | dataset=data_handle[shufflevideoindex] 191 | len_data=len_handle[shufflevideoindex] 192 | 193 | sample=np.zeros(tuple((self.seq_len,)+data_handle[shufflevideoindex].shape[1:])) 194 | lenperseg=len_data//self.seq_len 195 | if lenperseg==1 and len_data>self.seq_len: 196 | startid=np.random.randint(len_data-self.seq_len) 197 | sample=dataset[startid:startid+self.seq_len] 198 | elif len_data<=self.seq_len: 199 | startid=np.random.randint(max(self.seq_len-len_data,int(0.25*self.seq_len))) 200 | endid=min(self.seq_len,startid+len_data) 201 | datasid=0 202 | dataeid=len_data 203 | if startid+len_data>self.seq_len: 204 | datasid=np.random.randint(startid+len_data-self.seq_len) 205 | dataeid=datasid+self.seq_len-startid 206 | sample[startid:endid]=dataset[datasid:dataeid] 207 | else: 208 | for framei in range(self.seq_len): 209 | if framei==self.seq_len-1: 210 | index=lenperseg*framei + np.random.randint(len_data-lenperseg*(self.seq_len-1)) 211 | else: 212 | index=lenperseg*framei + np.random.randint(lenperseg) 213 | sample[framei]=dataset[index] 214 | #print (index,lenperseg) 215 | 216 | batch_data.append(sample) ###Be careful. It has to be different. Otherwise, the appended data will change as well. 
class DataHandler_eval(object):
    """Prefetching front end for ``batch_thread_eval``.

    A single worker thread at a time fills ``self.thread_result`` with the
    keys 'data' and 'label'; ``GetBatch`` collects that result and starts
    the worker for the next batch.
    """

    def __init__(self, batch_size, seq_len):
        self.batch_size_ = batch_size
        # NOTE(review): this seeds Python's `random`, while the batch threads
        # visible in this file sample with `np.random` -- confirm intent.
        random.seed(10)

        self.thread_result = {}
        self.thread = None

        self.batch_advancer = batch_thread_eval(
            self.thread_result, self.batch_size_, seq_len)

        # Produce the first batch before returning.
        self.dispatch_worker()
        self.join_worker()

    def GetBatch(self):
        """Return the pending (data, label) batch and prefetch the next one."""
        if self.thread is not None:
            self.join_worker()

        produced = self.thread_result
        self.batch_data_ = produced['data']
        self.batch_label_ = produced['label']

        self.dispatch_worker()
        return self.batch_data_, self.batch_label_

    def dispatch_worker(self):
        """Run the batch producer once on a new thread."""
        assert self.thread is None
        worker = Thread(target=self.batch_advancer)
        self.thread = worker
        worker.start()

    def join_worker(self):
        """Block until the current producer thread has finished."""
        assert self.thread is not None
        worker = self.thread
        worker.join()
        self.thread = None

    def GetDatasetSize(self):
        """Return the module-level ``test_no`` (number of held-out videos)."""
        return test_no
main() 286 | 287 | -------------------------------------------------------------------------------- /cPTB/penntree_charlevel_rernn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import argparse 4 | import os 5 | 6 | import time 7 | 8 | import lasagne 9 | import theano 10 | import numpy as np 11 | import theano.tensor as T 12 | 13 | from lasagne.layers import InputLayer,ReshapeLayer,DimshuffleLayer,Gate, DenseLayer 14 | from lasagne.layers import ConcatLayer,NonlinearityLayer,DropoutLayer, SliceLayer,ElemwiseSumLayer 15 | from lasagne.nonlinearities import softmax, rectify,tanh,very_leaky_rectify 16 | from lasagne.init import Uniform,Normal,HeNormal 17 | 18 | from IndRNN_onlyrecurrent import IndRNNLayer_onlyrecurrent as indrnn_onlyrecurrent 19 | from BatchNorm_step_timefirst import BatchNorm_step_timefirst_Layer 20 | 21 | 22 | np.set_printoptions(threshold=3000,edgeitems=50) 23 | sys.setrecursionlimit(50000) 24 | parser = argparse.ArgumentParser(description='IndRNN for the char level PennTreeBank Language Model') 25 | parser.add_argument('--hidden_units', type=int, default=2000) 26 | parser.add_argument('--batch_size', type=int, default=128,help='batch_size') 27 | parser.add_argument('--seq_len', type=int, default=50,help='seq_len') 28 | parser.add_argument('--num_layers', type=int, default=6,help='num_layers') 29 | parser.add_argument('--lr', type=np.float32, default=2e-4, help='lr') 30 | parser.add_argument('--act', type=str, default='relu', help='act') 31 | parser.add_argument('--data_aug', action='store_true', default=False) 32 | parser.add_argument('--gradclipvalue', type=np.float32, default=10, help='gradclipvalue') 33 | parser.add_argument('--MAG', type=int, default=2) 34 | parser.add_argument('--fix_bound', action='store_true', default=False) 35 | 36 | #bn 37 | parser.add_argument('--use_bn_afterrnn', action='store_true', default=False) 38 | 39 | #drop 40 | 
parser.add_argument('--use_dropout', action='store_true', default=False) 41 | parser.add_argument('--droprate', type=np.float32, default=0.3, help='lr') 42 | parser.add_argument('--droplayers', type=int, default=1,help='droplayers') 43 | 44 | #residual 45 | parser.add_argument('--use_residual', action='store_true', default=False) 46 | parser.add_argument('--residual_layers', type=int, default=2) 47 | parser.add_argument('--residual_block', type=int, default=3) 48 | parser.add_argument('--unit_factor', type=np.float32, default=1, help='lr') 49 | 50 | #weight decay 51 | parser.add_argument('--use_weightdecay_nohiddenW', action='store_true', default=False) 52 | parser.add_argument('--decayfactor', type=np.float32, default=1e-4, help='decayfactor') 53 | 54 | #initialization 55 | parser.add_argument('--pThre', type=int, default=20) 56 | parser.add_argument('--ini_in2hid', type=np.float32, default=0.005, help='ini_in2hid') 57 | parser.add_argument('--ini_b', type=np.float32, default=0.0, help='ini_in2hid') 58 | 59 | args = parser.parse_args() 60 | print (args) 61 | 62 | 63 | num_layers=args.num_layers 64 | droplayers=args.droplayers 65 | outputclass=50 66 | batch_size = args.batch_size 67 | seq_len=args.seq_len 68 | hidden_units=args.hidden_units 69 | use_dropout=args.use_dropout 70 | lr=np.float32(args.lr) 71 | droprate=np.float32(args.droprate) 72 | opti=lasagne.updates.adam 73 | 74 | rnnmodel=indrnn_onlyrecurrent 75 | act=rectify 76 | if args.act=='tanh': 77 | act=tanh 78 | 79 | 80 | 81 | from reader import data_iterator, ptb_raw_data 82 | name_dataset='ptb.char.' 
def get_raw_data(dataset='ptb', data_path='data/'):
    """Load the raw splits through ``ptb_raw_data``.

    ``dataset`` is accepted for interface compatibility but is not used; the
    file prefix comes from the module-level ``name_dataset``.
    """
    raw_data = ptb_raw_data(data_path, filename=name_dataset)
    return raw_data


train_data, valid_data, test_data, _ = get_raw_data('ptb')
epoch_size = ((len(train_data) // batch_size) - 1) // seq_len


# Bound on the recurrent weights.  Because the last state of one batch is used
# as the initial state of the next batch, the total length of the training
# data is used here.  This bound can simply be set to 1 (--fix_bound):
# (1) the sequence is so long that the bound is already very close to 1;
# (2) due to GPU precision, a value rounded upwards may explode.
seq_len1 = len(train_data)
if args.fix_bound:
    U_bound = 1.0
elif args.act == 'tanh':
    # Same value as (MAG / 0.9**(n/10)) ** (1/n), evaluated in log space:
    # 0.9**(n/10) underflows to 0.0 for a long corpus and the direct formula
    # then fails with a division by zero.
    U_bound = float(np.exp(np.log(args.MAG) / seq_len1 - np.log(0.9) / 10.0))
else:
    U_bound = pow(args.MAG, 1.0 / seq_len1)

taxdrop = (0,)  # dropout mask is shared along axis 0 (time)

ini_W = HeNormal(gain=np.sqrt(2)/2.0)
if args.use_bn_afterrnn:
    ini_W = Normal(args.ini_in2hid)

# units[l] is the width of layer l; acc_units holds the cumulative offsets used
# to slice the concatenated hidden-state matrix per layer.
units = []
acc_units = [0]
sum_units = 0
# '--start_residual' is not declared by the parser; the error message below
# implies the expected remainder is 1, which is used as the fallback.
start_residual = getattr(args, 'start_residual', 1)
if args.unit_factor != 1 and num_layers % (args.residual_block*args.residual_layers) != start_residual:
    raise ValueError('layers should be layers = args.residual_block*args.residual_layers +1')
for l in range(num_layers):
    units_inc_factor = 1
    if l >= 1:
        units_inc_factor = np.power(args.unit_factor, (l-1)//(args.residual_block*args.residual_layers))
    layer_units = int(hidden_units*units_inc_factor)  # np.int was removed from NumPy
    units.append(layer_units)
    sum_units += layer_units
    acc_units.append(sum_units)


def build_rnn_network(rnnmodel, X_sym, hid_init_sym):
    """Build the stacked IndRNN language model.

    Args:
        rnnmodel: recurrent layer class used for every layer.
        X_sym: symbolic integer matrix of shape (batch_size, seq_len).
        hid_init_sym: symbolic matrix holding the initial hidden states of all
            layers side by side; layer ``l`` reads the columns
            ``acc_units[l-1]:acc_units[l]``.

    Returns:
        dict of layers; ``'out'`` is the softmax output over all time steps and
        ``'hid_out'`` the concatenated last hidden states of every layer.
    """
    net = {}

    net['input0'] = InputLayer((batch_size, seq_len), X_sym)
    net['input'] = lasagne.layers.EmbeddingLayer(net['input0'], outputclass, units[0])
    # change to (time, batch_size, hidden_units)
    net['rnn0'] = DimshuffleLayer(net['input'], (1, 0, 2))

    for l in range(1, num_layers+1):
        net['hiddeninput%d'%l] = InputLayer((batch_size, units[l-1]), hid_init_sym[:, acc_units[l-1]:acc_units[l]])

        # Input projection applied to all time steps at once.
        net['rnn%d'%(l-1)] = ReshapeLayer(net['rnn%d'%(l-1)], (batch_size*seq_len, -1))
        net['rnn%d'%(l-1)] = DenseLayer(net['rnn%d'%(l-1)], units[l-1], W=ini_W, b=lasagne.init.Constant(args.ini_b), nonlinearity=None)
        net['rnn%d'%(l-1)] = ReshapeLayer(net['rnn%d'%(l-1)], (seq_len, batch_size, -1))

        if args.use_residual and l > args.residual_layers and (l-1) % args.residual_layers == 0:
            if units[l-1] != units[l-1-args.residual_layers]:
                # Widths differ: project the shortcut branch to the new width.
                net['leftbranch%d'%(l-1)] = ReshapeLayer(net['sum%d'%(l-args.residual_layers)], (batch_size*seq_len, -1))
                net['leftbranch%d'%(l-1)] = DenseLayer(net['leftbranch%d'%(l-1)], units[l-1], W=ini_W, nonlinearity=None)
                net['leftbranch%d'%(l-1)] = ReshapeLayer(net['leftbranch%d'%(l-1)], (seq_len, batch_size, -1))
                net['leftbranch%d'%(l-1)] = BatchNorm_step_timefirst_Layer(net['leftbranch%d'%(l-1)], axes=(0, 1))
                print('left branch')
            else:
                net['leftbranch%d'%(l-1)] = net['sum%d'%(l-args.residual_layers)]
            net['sum%d'%l] = ElemwiseSumLayer((net['rnn%d'%(l-1)], net['leftbranch%d'%(l-1)]))
        else:
            net['sum%d'%l] = net['rnn%d'%(l-1)]

        net['rnn%d'%l] = net['sum%d'%l]
        if not args.use_bn_afterrnn:
            net['rnn%d'%l] = BatchNorm_step_timefirst_Layer(net['rnn%d'%l], axes=(0, 1), beta=lasagne.init.Constant(args.ini_b))

        # relu keeps the recurrent weights in [0, U_bound]; tanh uses the
        # symmetric range [-U_bound, U_bound].
        ini_hid_start = 0
        if act == tanh:
            ini_hid_start = -1*U_bound
        net['rnn%d'%l] = rnnmodel(net['rnn%d'%l], units[l-1], hid_init=net['hiddeninput%d'%l], W_hid_to_hid=Uniform(range=(ini_hid_start, U_bound)), nonlinearity=act, only_return_final=False, grad_clipping=args.gradclipvalue)

        # The last time step becomes the initial state of the next batch.
        net['last_state%d'%l] = SliceLayer(net['rnn%d'%l], -1, axis=0)
        if l == 1:
            net['hid_out'] = net['last_state%d'%l]
        else:
            net['hid_out'] = ConcatLayer([net['hid_out'], net['last_state%d'%l]], axis=1)

        if use_dropout and l % droplayers == 0:
            net['rnn%d'%l] = lasagne.layers.DropoutLayer(net['rnn%d'%l], p=droprate, shared_axes=taxdrop)

        if args.use_bn_afterrnn:
            net['rnn%d'%l] = BatchNorm_step_timefirst_Layer(net['rnn%d'%l], axes=(0, 1))

    # Back to (batch_size, time, units) and flatten for the per-step softmax.
    net['rnn%d'%num_layers] = DimshuffleLayer(net['rnn%d'%num_layers], (1, 0, 2))
    net['reshape_rnn'] = ReshapeLayer(net['rnn%d'%num_layers], (-1, units[num_layers-1]))
    net['out'] = DenseLayer(net['reshape_rnn'], outputclass, nonlinearity=softmax)
    return net


X_sym = T.imatrix('inputs')
y_sym = T.imatrix('label')
hid_init_sym = T.matrix()

learn_net = build_rnn_network(rnnmodel, X_sym, hid_init_sym)
print(lasagne.layers.count_params(learn_net['out'], trainable=True))
y_sym0 = y_sym.reshape((-1,))
prediction, hid_rec_init = lasagne.layers.get_output([learn_net['out'], learn_net['hid_out']], deterministic=False)
loss = lasagne.objectives.categorical_crossentropy(prediction, y_sym0).mean()
perp = T.exp(loss)
bpc = (loss/np.log(2.0))

cost = loss

if args.use_weightdecay_nohiddenW:
    # L2 penalty on every regularizable parameter except the recurrent weights.
    params = lasagne.layers.get_all_params(learn_net['out'], regularizable=True)
    for para in params:
        if para.name != 'hidden_to_hidden.W':
            cost += args.decayfactor*lasagne.regularization.apply_penalty(para, lasagne.regularization.l2)

params = lasagne.layers.get_all_params(learn_net['out'], trainable=True)

learning_ratetrain = T.scalar(name='learning_ratetrain', dtype=theano.config.floatX)

grads = theano.grad(cost, params)
updates = opti(grads, params, learning_rate=learning_ratetrain)
print('Compiling')
train_fn = theano.function([X_sym, y_sym, hid_init_sym, learning_ratetrain],
                           [perp, bpc, hid_rec_init], updates=updates)

# Evaluation graph built with batch_norm_use_averages=False.
# NOTE(review): presumably this makes the batch-norm layers normalise with the
# statistics of the current batch -- confirm in BatchNorm_step_timefirst_Layer.
test_prediction, test_hid_rec_init = lasagne.layers.get_output(
    [learn_net['out'], learn_net['hid_out']],
    deterministic=True, batch_norm_use_averages=False)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, y_sym0).mean()
test_perp = T.exp(test_loss)
test_bpc = (test_loss/np.log(2.0))
test_fn = theano.function([X_sym, y_sym, hid_init_sym],
                          [test_perp, test_bpc, test_hid_rec_init])

# Evaluation graph with the default deterministic batch-norm behaviour.
bn_test_prediction, bn_test_hid_rec_init = lasagne.layers.get_output(
    [learn_net['out'], learn_net['hid_out']], deterministic=True)
bn_test_loss = lasagne.objectives.categorical_crossentropy(bn_test_prediction, y_sym0).mean()
bn_test_perp = T.exp(bn_test_loss)
bn_test_bpc = (bn_test_loss/np.log(2.0))
bn_test_fn = theano.function([X_sym, y_sym, hid_init_sym],
                             [bn_test_perp, bn_test_bpc, bn_test_hid_rec_init])


def _fresh_state():
    # Zero initial hidden state for all layers, concatenated along axis 1.
    return np.zeros((batch_size, sum_units), dtype='float32')


def _evaluate(eval_fn, data):
    # Mean perplexity and bits-per-character of eval_fn over data; the hidden
    # state returned for one batch is fed into the next one.
    state = _fresh_state()
    total_perp = 0
    total_bpc = 0
    batches = 0
    for x, y in data_iterator(data, batch_size, seq_len):
        batch_perp, batch_bpc, state = eval_fn(x, y, state)
        total_perp += batch_perp
        total_bpc += batch_bpc
        batches += 1
    return total_perp/batches, total_bpc/batches


learning_rate = np.float32(lr)

lastbpc = 100
patience = 0
patienceThre = args.pThre

for epoci in range(1, 10000):
    hid_init = _fresh_state()
    dropindex = 0
    if args.data_aug:
        # Random start offset so batch boundaries differ between epochs.
        dropindex = np.random.randint(seq_len*5)

    t_prep = 0
    t_bpc = 0
    count = 0
    for batchi, (x, y) in enumerate(data_iterator(train_data[dropindex:], batch_size, seq_len)):
        if rnnmodel == indrnn_onlyrecurrent:
            # Keep the recurrent weights inside [-U_bound, U_bound].
            for para in params:
                if para.name == 'hidden_to_hidden.W':
                    para.set_value(np.clip(para.get_value(), -1*U_bound, U_bound))
        batch_perp, batch_bpc, hid_init = train_fn(x, y, hid_init, learning_rate)

        # Abort explicitly; an assert would be stripped under ``python -O``.
        if np.isnan(batch_perp):
            raise RuntimeError('NaN detected in cost')
        if np.isinf(batch_perp):
            raise RuntimeError('INF detected in cost')
        t_prep += batch_perp
        t_bpc += batch_bpc
        count += 1

    trainbpc = t_bpc/count
    train_acc = t_prep/count
    print('prep', 'bpc', train_acc, trainbpc)

    valid_perp, validbpc = _evaluate(bn_test_fn, valid_data)
    print('bn_validprep', 'bn_validbpc', valid_perp, validbpc)

    test_acc, test_bpc_mean = _evaluate(test_fn, test_data)
    print('testprep', 'testbpc', test_acc, test_bpc_mean)

    bn_test_perp_mean, bn_test_bpc_mean = _evaluate(bn_test_fn, test_data)
    print('bn_testprep', 'bn_testbpc', bn_test_perp_mean, bn_test_bpc_mean)

    # Early stopping with learning-rate decay.  The first branch was lost in
    # the extracted source and is reconstructed from the surviving branches:
    # remember the best parameters whenever validation bpc improves.
    if validbpc < lastbpc:
        best_para = lasagne.layers.get_all_param_values(learn_net['out'])
        lastbpc = validbpc
        patience = 0
    elif patience > patienceThre:
        learning_rate = np.float32(learning_rate*0.2)
        print('learning rate', learning_rate)
        # Restart from the best parameters seen so far.
        lasagne.layers.set_all_param_values(learn_net['out'], best_para)
        patience = 0
        if learning_rate < 1e-6:
            break
    else:
        patience += 1

# NOTE(review): indentation is not recoverable from the source; saving once
# after the loop is assumed -- confirm against the original file.
save_name = 'indrnn_cPTB'+str(seq_len)
np.savez(save_name, *lasagne.layers.get_all_param_values(learn_net['out']))
# -*- coding: utf-8 -*-
"""
This code is to implement the IndRNN (only the recurrent part). The code is based on the Lasagne implementation of RecurrentLayer.

Since this only contains the recurrent part of IndRNN, fully connected layers or convolutional layers are needed before it.

Please cite the following paper if you find it useful.

Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
@article{li2018independently,
  title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
  author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
  booktitle={CVPR2018},
  year={2018}
}
"""

import numpy as np
import theano
import theano.tensor as T
import lasagne.nonlinearities as nonlinearities
import lasagne.init as init
from lasagne.utils import unroll_scan

from lasagne.layers import MergeLayer, Layer
from lasagne.layers import InputLayer
from lasagne.layers import DenseLayer
from lasagne.layers import helper
import lasagne

__all__ = [
    "onlyRecurrentLayer",
    "MulLayer",
    "IndRNNLayer_onlyrecurrent"
]


class MulLayer(lasagne.layers.Layer):
    """Scale the input element-wise by a learned weight vector ``W``.

    ``W`` has one entry per unit along axis 1 of the incoming shape, so each
    unit is multiplied by its own scalar weight.
    """

    def __init__(self, incoming, W=lasagne.init.Normal(0.01), **kwargs):
        super(MulLayer, self).__init__(incoming, **kwargs)
        # One weight per feature on axis 1 of the input.
        weight_shape = (self.input_shape[1],)
        self.W = self.add_param(W, weight_shape, name='W')

    def get_output_for(self, input, **kwargs):
        scaled = input * self.W
        return scaled

    def get_output_shape_for(self, input_shape):
        # Element-wise scaling leaves the shape untouched.
        return input_shape
class onlyRecurrentLayer(MergeLayer):
    """Recurrent layer that computes only the hidden-to-hidden recurrence.

    This is slightly different from the CustomRecurrentLayer of Lasagne: the
    computation of the input is removed, so the incoming activations are added
    to the recurrent term as they are.  The layer is time-first: the input is
    indexed as (n_time_steps, n_batch, ...) and the output keeps that layout.

    Parameters
    ----------
    incoming : Layer or shape tuple feeding the layer (time-first).
    input_to_hidden : Layer network with exactly one InputLayer.  It is
        validated and its parameters are reported by ``get_params``; it is
        not applied to the input.
    hidden_to_hidden : Layer network with exactly one InputLayer, applied to
        the previous hidden state at every step.
    nonlinearity : callable applied to the pre-activation; ``None`` means
        identity.
    hid_init : initializer, array, or Layer providing the initial hidden state.
    backwards : if True, iterate over the sequence in reverse.
    learn_init : whether ``hid_init`` is trainable (ignored when it is a Layer).
    gradient_steps : truncation length passed to ``theano.scan``; -1 disables
        truncation.
    grad_clipping : if non-zero, gradients through the pre-activation are
        clipped to ``[-grad_clipping, grad_clipping]``.
    unroll_scan : if True, unroll the recurrence instead of using scan; the
        sequence length must then be known.
    precompute_input : stored for interface compatibility; unused.
    mask_input : optional Layer providing a mask.  It is dimshuffled with
        ``(1, 0, 'x')``, i.e. it is expected as (n_batch, n_time_steps).
    only_return_final : if True, return only the last hidden state.

    Raises
    ------
    ValueError
        If either sub-network does not have exactly one InputLayer, or if
        ``unroll_scan`` is combined with gradient truncation or an unknown
        sequence length.
    """
    def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # This layer inherits from a MergeLayer, because it can have three
        # inputs - the layer input, the mask and the initial hidden state.  We
        # will just provide the layer input as incomings, unless a mask input
        # or initial hidden state was provided.
        incomings = [incoming]
        self.mask_incoming_index = -1
        self.hid_init_incoming_index = -1
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings)-1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings)-1

        super(onlyRecurrentLayer, self).__init__(incomings, **kwargs)

        input_to_hidden_in_layers = \
            [layer for layer in helper.get_all_layers(input_to_hidden)
             if isinstance(layer, InputLayer)]
        if len(input_to_hidden_in_layers) != 1:
            raise ValueError(
                '`input_to_hidden` must have exactly one InputLayer, but it '
                'has {}'.format(len(input_to_hidden_in_layers)))

        hidden_to_hidden_in_lyrs = \
            [layer for layer in helper.get_all_layers(hidden_to_hidden)
             if isinstance(layer, InputLayer)]
        if len(hidden_to_hidden_in_lyrs) != 1:
            raise ValueError(
                '`hidden_to_hidden` must have exactly one InputLayer, but it '
                'has {}'.format(len(hidden_to_hidden_in_lyrs)))

        self.input_to_hidden = input_to_hidden
        self.hidden_to_hidden = hidden_to_hidden
        self.learn_init = learn_init
        self.backwards = backwards
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        # Unrolling needs a concrete number of steps; time is axis 0 here.
        if unroll_scan and self.input_shapes[0][0] is None:
            raise ValueError(
                "Input sequence length cannot be specified as None when "
                "unroll_scan is True")

        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        # Initialize hidden state
        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(
                hid_init, (1,) + hidden_to_hidden.output_shape[1:],
                name="hid_init", trainable=learn_init, regularizable=False)

    def get_params(self, **tags):
        """Return this layer's parameters plus those of both sub-networks."""
        # Get all parameters from this layer, the master layer
        params = super(onlyRecurrentLayer, self).get_params(**tags)
        # Combine with all parameters from the child layers
        params += helper.get_all_params(self.input_to_hidden, **tags)
        params += helper.get_all_params(self.hidden_to_hidden, **tags)
        return params

    def get_output_shape_for(self, input_shapes):
        """Return the output shape for time-first input shapes.

        The first element of ``input_shapes`` is the layer input, whether or
        not a mask or initial state is supplied.
        """
        input_shape = input_shapes[0]
        hidden_dims = self.hidden_to_hidden.output_shape[1:]
        if self.only_return_final:
            # Only the last step is kept: (n_batch, trailing_dims...).  The
            # batch size is axis 1 of the time-first input.
            return (input_shape[1],) + hidden_dims
        # Otherwise: (n_time_steps, n_batch, trailing_dims...)
        return (input_shape[0], input_shape[1]) + hidden_dims

    def get_output_for(self, inputs, **kwargs):
        """Run the recurrence over the time-first input.

        Returns the hidden states for every step, shaped
        (n_time_steps, n_batch, ...), or only the last state when
        ``only_return_final`` is set.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask and initial state when they are supplied
        mask = None
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # The input is already (n_time_steps, n_batch, n_features), which is
        # the layout scan iterates over, so no dimshuffle is needed.
        num_batch = input.shape[1]

        # We will always pass the hidden-to-hidden layer params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_pre = helper.get_output(
                self.hidden_to_hidden, hid_previous, **kwargs)

            # The input contribution is added as is (no input-to-hidden map)
            hid_pre += input_n

            # Clip gradients
            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(
                    hid_pre, -self.grad_clipping, self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = step(input_n, hid_previous, *args)
            hid_out = T.switch(mask_n, hid, hid_previous)
            return [hid_out]

        if mask is not None:
            # (n_batch, n_time_steps) -> (n_time_steps, n_batch, 1)
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # The code below simply repeats self.hid_init num_batch times in
            # its first dimension.  Turns out using a dot product and a
            # dimshuffle is faster than T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan; the
            # number of steps is axis 0 of the time-first input.
            hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[0])[0]
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        elif self.backwards:
            # if scan is backward reverse the output along the time axis
            hid_out = hid_out[::-1]

        return hid_out
class IndRNNLayer_onlyrecurrent(onlyRecurrentLayer):
    """IndRNN recurrence: each unit is connected only to its own previous state.

    The hidden-to-hidden map is a ``MulLayer`` (element-wise product with the
    weight vector ``W_hid_to_hid``).  No input-to-hidden weights are created
    here; the incoming activations are used as they are.  All remaining
    arguments are forwarded to ``onlyRecurrentLayer``.
    """

    def __init__(self, incoming, num_units,
                 W_hid_to_hid=init.Uniform(),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # ``incoming`` may be a Layer or a bare shape tuple.
        input_shape = incoming if isinstance(incoming, tuple) else incoming.output_shape

        # Child layers are named after this layer (when a name was supplied)
        # and must not receive the 'name' keyword a second time.
        if 'name' in kwargs:
            basename = kwargs['name'] + '.'
            layer_kwargs = {key: arg for key, arg in kwargs.items()
                            if key != 'name'}
        else:
            basename = ''
            layer_kwargs = kwargs

        # Placeholder input-to-hidden network: a single InputLayer and no
        # parameters, since the input is not transformed by this layer.
        in_to_hid = InputLayer(input_shape)

        # The hidden-to-hidden layer expects its inputs to have num_units
        # features because it recycles the previous hidden state.
        recurrent_input = InputLayer((None, num_units))
        hid_to_hid = MulLayer(recurrent_input,
                              W=W_hid_to_hid,
                              name=basename + 'hidden_to_hidden',
                              **layer_kwargs)

        # Make child layer parameters intuitively accessible
        self.W_hid_to_hid = hid_to_hid.W

        super(IndRNNLayer_onlyrecurrent, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
# -*- coding: utf-8 -*-
"""
This code is to implement the IndRNN (only the recurrent part). The code is based on the Lasagne implementation of RecurrentLayer.

Since this only contains the recurrent part of IndRNN, fully connected layers or convolutional layers are needed before it.

Please cite the following paper if you find it useful.

Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
@article{li2018independently,
  title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
  author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
  booktitle={CVPR2018},
  year={2018}
}
"""

import numpy as np
import theano
import theano.tensor as T
import lasagne.nonlinearities as nonlinearities
import lasagne.init as init
from lasagne.utils import unroll_scan

from lasagne.layers import MergeLayer, Layer
from lasagne.layers import InputLayer
from lasagne.layers import DenseLayer
from lasagne.layers import helper
import lasagne

__all__ = [
    "onlyRecurrentLayer",
    "MulLayer",
    "IndRNNLayer_onlyrecurrent"
]


class MulLayer(lasagne.layers.Layer):
    """Multiply the input element-wise by a learned weight vector ``W``.

    ``W`` has one entry per unit along axis 1 of the incoming shape.
    """

    def __init__(self, incoming, W=lasagne.init.Normal(0.01), **kwargs):
        super(MulLayer, self).__init__(incoming, **kwargs)
        num_inputs = self.input_shape[1]
        self.W = self.add_param(W, (num_inputs, ), name='W')

    def get_output_for(self, input, **kwargs):
        return input * self.W

    def get_output_shape_for(self, input_shape):
        # Element-wise scaling does not change the shape.
        return input_shape


class onlyRecurrentLayer(MergeLayer):
    """Recurrent layer that computes only the hidden-to-hidden recurrence.

    This is slightly different from the CustomRecurrentLayer of Lasagne: the
    computation of the input is removed, so the incoming activations are added
    to the recurrent term as they are.  The layer is time-first: the input is
    indexed as (n_time_steps, n_batch, ...) and the output keeps that layout.

    Parameters
    ----------
    incoming : Layer or shape tuple feeding the layer (time-first).
    input_to_hidden : Layer network with exactly one InputLayer.  It is
        validated and its parameters are reported by ``get_params``; it is
        not applied to the input.
    hidden_to_hidden : Layer network with exactly one InputLayer, applied to
        the previous hidden state at every step.
    nonlinearity : callable applied to the pre-activation; ``None`` means
        identity.
    hid_init : initializer, array, or Layer providing the initial hidden state.
    backwards : if True, iterate over the sequence in reverse.
    learn_init : whether ``hid_init`` is trainable (ignored when it is a Layer).
    gradient_steps : truncation length passed to ``theano.scan``; -1 disables
        truncation.
    grad_clipping : if non-zero, gradients through the pre-activation are
        clipped to ``[-grad_clipping, grad_clipping]``.
    unroll_scan : if True, unroll the recurrence instead of using scan; the
        sequence length must then be known.
    precompute_input : stored for interface compatibility; unused.
    mask_input : optional Layer providing a mask.  It is dimshuffled with
        ``(1, 0, 'x')``, i.e. it is expected as (n_batch, n_time_steps).
    only_return_final : if True, return only the last hidden state.

    Raises
    ------
    ValueError
        If either sub-network does not have exactly one InputLayer, or if
        ``unroll_scan`` is combined with gradient truncation or an unknown
        sequence length.
    """
    def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # This layer inherits from a MergeLayer, because it can have three
        # inputs - the layer input, the mask and the initial hidden state.  We
        # will just provide the layer input as incomings, unless a mask input
        # or initial hidden state was provided.
        incomings = [incoming]
        self.mask_incoming_index = -1
        self.hid_init_incoming_index = -1
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings)-1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings)-1

        super(onlyRecurrentLayer, self).__init__(incomings, **kwargs)

        input_to_hidden_in_layers = \
            [layer for layer in helper.get_all_layers(input_to_hidden)
             if isinstance(layer, InputLayer)]
        if len(input_to_hidden_in_layers) != 1:
            raise ValueError(
                '`input_to_hidden` must have exactly one InputLayer, but it '
                'has {}'.format(len(input_to_hidden_in_layers)))

        hidden_to_hidden_in_lyrs = \
            [layer for layer in helper.get_all_layers(hidden_to_hidden)
             if isinstance(layer, InputLayer)]
        if len(hidden_to_hidden_in_lyrs) != 1:
            raise ValueError(
                '`hidden_to_hidden` must have exactly one InputLayer, but it '
                'has {}'.format(len(hidden_to_hidden_in_lyrs)))

        self.input_to_hidden = input_to_hidden
        self.hidden_to_hidden = hidden_to_hidden
        self.learn_init = learn_init
        self.backwards = backwards
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        # Unrolling needs a concrete number of steps; time is axis 0 here.
        if unroll_scan and self.input_shapes[0][0] is None:
            raise ValueError(
                "Input sequence length cannot be specified as None when "
                "unroll_scan is True")

        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        # Initialize hidden state
        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(
                hid_init, (1,) + hidden_to_hidden.output_shape[1:],
                name="hid_init", trainable=learn_init, regularizable=False)

    def get_params(self, **tags):
        """Return this layer's parameters plus those of both sub-networks."""
        # Get all parameters from this layer, the master layer
        params = super(onlyRecurrentLayer, self).get_params(**tags)
        # Combine with all parameters from the child layers
        params += helper.get_all_params(self.input_to_hidden, **tags)
        params += helper.get_all_params(self.hidden_to_hidden, **tags)
        return params

    def get_output_shape_for(self, input_shapes):
        """Return the output shape for time-first input shapes.

        The first element of ``input_shapes`` is the layer input, whether or
        not a mask or initial state is supplied.
        """
        input_shape = input_shapes[0]
        hidden_dims = self.hidden_to_hidden.output_shape[1:]
        if self.only_return_final:
            # Only the last step is kept: (n_batch, trailing_dims...).  The
            # batch size is axis 1 of the time-first input.
            return (input_shape[1],) + hidden_dims
        # Otherwise: (n_time_steps, n_batch, trailing_dims...)
        return (input_shape[0], input_shape[1]) + hidden_dims

    def get_output_for(self, inputs, **kwargs):
        """Run the recurrence over the time-first input.

        Returns the hidden states for every step, shaped
        (n_time_steps, n_batch, ...), or only the last state when
        ``only_return_final`` is set.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask and initial state when they are supplied
        mask = None
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # The input is already (n_time_steps, n_batch, n_features), which is
        # the layout scan iterates over, so no dimshuffle is needed.
        num_batch = input.shape[1]

        # We will always pass the hidden-to-hidden layer params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_pre = helper.get_output(
                self.hidden_to_hidden, hid_previous, **kwargs)

            # The input contribution is added as is (no input-to-hidden map)
            hid_pre += input_n

            # Clip gradients
            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(
                    hid_pre, -self.grad_clipping, self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = step(input_n, hid_previous, *args)
            hid_out = T.switch(mask_n, hid, hid_previous)
            return [hid_out]

        if mask is not None:
            # (n_batch, n_time_steps) -> (n_time_steps, n_batch, 1)
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # The code below simply repeats self.hid_init num_batch times in
            # its first dimension.  Turns out using a dot product and a
            # dimshuffle is faster than T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan; the
            # number of steps is axis 0 of the time-first input.
            hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[0])[0]
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        elif self.backwards:
            # if scan is backward reverse the output along the time axis
            hid_out = hid_out[::-1]

        return hid_out
class IndRNNLayer_onlyrecurrent(onlyRecurrentLayer):
    """Recurrent part of an IndRNN layer.

    The previous hidden state goes through a ``MulLayer`` (element-wise
    multiplication with the weight vector ``W_hid_to_hid``) and is combined
    with the incoming activation by ``onlyRecurrentLayer``. The input is not
    transformed here: ``incoming`` is only wrapped in a bare ``InputLayer``.

    Parameters
    ----------
    incoming : Layer or shape tuple
        Source of the input sequence.
    num_units : int
        Number of hidden units; size of the recurrent weight vector.
    W_hid_to_hid : initializer
        Initializer for the recurrent weight vector.

    All remaining arguments are passed unchanged to
    ``onlyRecurrentLayer.__init__``.
    """

    def __init__(self, incoming, num_units,
                 W_hid_to_hid=init.Uniform(),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # Child layers are named after this layer, but must not be handed
        # the 'name' keyword itself.
        name_prefix = (kwargs['name'] + '.') if 'name' in kwargs else ''
        child_kwargs = dict(
            (key, arg) for key, arg in kwargs.items() if key != 'name')

        # The input is used as it is, so a bare InputLayer stands in for
        # the input-to-hidden connection.
        pass_through = InputLayer(input_shape)

        # The recurrent connection works on the previous hidden state,
        # which has num_units features.
        recurrent = MulLayer(InputLayer((None, num_units)),
                             W=W_hid_to_hid,
                             name=name_prefix + 'hidden_to_hidden',
                             **child_kwargs)

        # Expose the recurrent weights directly on this layer.
        self.W_hid_to_hid = recurrent.W

        super(IndRNNLayer_onlyrecurrent, self).__init__(
            incoming, pass_through, recurrent,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
import numpy as np
import theano
import theano.tensor as T
import lasagne.nonlinearities as nonlinearities
import lasagne.init as init
from lasagne.utils import unroll_scan

from lasagne.layers import MergeLayer, Layer
from lasagne.layers import InputLayer
from lasagne.layers import DenseLayer
from lasagne.layers import helper
import lasagne

__all__ = [
    "onlyRecurrentLayer",
    "MulLayer",
    "IndRNNLayer_onlyrecurrent"
]


class MulLayer(lasagne.layers.Layer):
    """Scale every feature of a 2D input by its own learnable weight.

    ``W`` has one entry per feature (``input_shape[1]``) and is multiplied
    element-wise with the input, so the output shape equals the input shape.
    """

    def __init__(self, incoming, W=lasagne.init.Normal(0.01), **kwargs):
        super(MulLayer, self).__init__(incoming, **kwargs)
        num_inputs = self.input_shape[1]
        self.W = self.add_param(W, (num_inputs, ), name='W')

    def get_output_for(self, input, **kwargs):
        return input * self.W

    def get_output_shape_for(self, input_shape):
        return input_shape


class onlyRecurrentLayer(MergeLayer):
    """
    Recurrent layer that adds its input directly to the recurrent term.

    This is slightly different from the CustomRecurrentLayer of Lasagne by
    removing the computation of input: at every step the output of
    ``hidden_to_hidden`` for the previous hidden state is added to the
    current input slice and passed through ``nonlinearity``.

    The input is consumed time-major: it is iterated over its first axis and
    its second axis is taken as the batch size.

    Parameters
    ----------
    incoming : Layer or shape tuple
        Source of the input sequence.
    input_to_hidden : Layer
        Must contain exactly one ``InputLayer``. It is validated and its
        parameters are reported by ``get_params``; it is not applied to the
        input in ``get_output_for``.
    hidden_to_hidden : Layer
        Network applied to the previous hidden state; must contain exactly
        one ``InputLayer``.
    nonlinearity : callable or None
        Applied to the pre-activation; ``None`` selects the identity.
    hid_init : Layer or initializer
        Initial hidden state. A ``Layer`` becomes an extra incoming layer,
        anything else is registered as a ``(1, ...)`` parameter.
    backwards : bool
        Passed to scan as ``go_backwards``; the output is flipped back to
        time order afterwards.
    learn_init : bool
        Whether the ``hid_init`` parameter is trainable.
    gradient_steps : int
        Passed to ``theano.scan`` as ``truncate_gradient``.
    grad_clipping : float
        If non-zero, the pre-activation gradient is clipped to
        ``[-grad_clipping, grad_clipping]``.
    unroll_scan : bool
        Use ``lasagne.utils.unroll_scan`` instead of ``theano.scan``.
    precompute_input : bool
        Stored on the instance; not used by this layer's computation.
    mask_input : Layer or None
        Optional mask layer.
    only_return_final : bool
        Return only the last hidden state.

    Raises
    ------
    ValueError
        If either sub-network does not have exactly one ``InputLayer``, or
        if ``unroll_scan`` is combined with ``gradient_steps != -1``.
    """
    def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # This is a MergeLayer because it can have up to three inputs: the
        # layer input, the mask and the initial hidden state. Only the
        # layer input is always present; the indices of the other two stay
        # at -1 unless they were supplied.
        incomings = [incoming]
        self.mask_incoming_index = -1
        self.hid_init_incoming_index = -1
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings)-1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings)-1

        super(onlyRecurrentLayer, self).__init__(incomings, **kwargs)

        input_to_hidden_in_layers = \
            [layer for layer in helper.get_all_layers(input_to_hidden)
             if isinstance(layer, InputLayer)]
        if len(input_to_hidden_in_layers) != 1:
            raise ValueError(
                '`input_to_hidden` must have exactly one InputLayer, but it '
                'has {}'.format(len(input_to_hidden_in_layers)))

        hidden_to_hidden_in_lyrs = \
            [layer for layer in helper.get_all_layers(hidden_to_hidden)
             if isinstance(layer, InputLayer)]
        if len(hidden_to_hidden_in_lyrs) != 1:
            raise ValueError(
                '`hidden_to_hidden` must have exactly one InputLayer, but it '
                'has {}'.format(len(hidden_to_hidden_in_lyrs)))

        self.input_to_hidden = input_to_hidden
        self.hidden_to_hidden = hidden_to_hidden
        self.learn_init = learn_init
        self.backwards = backwards
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        # Initialize hidden state
        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(
                hid_init, (1,) + hidden_to_hidden.output_shape[1:],
                name="hid_init", trainable=learn_init, regularizable=False)

    def get_params(self, **tags):
        """Return this layer's parameters followed by the child layers'."""
        params = super(onlyRecurrentLayer, self).get_params(**tags)
        params += helper.get_all_params(self.input_to_hidden, **tags)
        params += helper.get_all_params(self.hidden_to_hidden, **tags)
        return params

    def get_output_shape_for(self, input_shapes):
        """Return the output shape for a time-major input.

        ``input_shapes[0]`` is the shape of the layer input (whether or not
        a mask is used), read as ``(n_steps, n_batch, ...)``. The trailing
        dimensions are those of the hidden-to-hidden output.
        """
        n_steps, n_batch = input_shapes[0][0], input_shapes[0][1]
        hidden_dims = self.hidden_to_hidden.output_shape[1:]
        if self.only_return_final:
            # get_output_for returns hid_out[-1], which drops the leading
            # time axis and keeps the batch axis (the previous code
            # reported the time length here).
            return (n_batch,) + hidden_dims
        return (n_steps, n_batch) + hidden_dims

    def get_output_for(self, inputs, **kwargs):
        """Run the recurrence and return the hidden states.

        ``inputs[0]`` is the layer input, consumed as
        ``(n_time_steps, n_batch, ...)``; mask and initial hidden state are
        read from ``inputs`` at their stored indices when present. Extra
        keyword arguments are forwarded to ``helper.get_output``.

        Returns ``hid_out[-1]`` when ``only_return_final`` is set, otherwise
        all hidden states stacked along the first axis.

        Raises ``ValueError`` if ``unroll_scan`` is set but the sequence
        length (first input dimension) is ``None``.
        """
        layer_input = inputs[0]
        mask = None
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # The input is used time-major as given (no dimshuffle), so the
        # batch size is found on the second axis.
        num_batch = layer_input.shape[1]

        # The hidden-to-hidden parameters are always handed to the step
        # function as non-sequences (scan is called with strict=True).
        non_seqs = helper.get_all_params(self.hidden_to_hidden)

        def step(input_n, hid_previous, *args):
            # Recurrent term plus the current input slice.
            hid_pre = helper.get_output(
                self.hidden_to_hidden, hid_previous, **kwargs)
            hid_pre += input_n

            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(
                    hid_pre, -self.grad_clipping, self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Where the mask is 0 the previous hidden state is carried over.
            hid = step(input_n, hid_previous, *args)
            return [T.switch(mask_n, hid, hid_previous)]

        if mask is not None:
            # NOTE(review): the mask is still transposed from
            # (n_batch, n_time_steps) although the input no longer is --
            # confirm that callers pass a batch-major mask.
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [layer_input, mask]
            step_fun = step_masked
        else:
            sequences = layer_input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # Repeat the (1, ...) initial state num_batch times along the
            # first axis; a dot product with a column of ones is used
            # instead of T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # The sequences are indexed along their first axis, so that is
            # the length to unroll. The previous code used input_shape[1],
            # which is the batch size for a time-major input.
            n_steps = self.input_shapes[0][0]
            if n_steps is None:
                raise ValueError("Input sequence length cannot be specified "
                                 "as None when unroll_scan is True")
            hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=n_steps)[0]
        else:
            hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True)[0]

        # The final step has to be sliced out right after scan, before any
        # re-ordering of the time axis.
        if self.only_return_final:
            return hid_out[-1]

        if self.backwards:
            # A backward scan emits steps in reverse; restore time order.
            hid_out = hid_out[::-1, :]

        return hid_out
class IndRNNLayer_onlyrecurrent(onlyRecurrentLayer):
    """Recurrent part of an IndRNN layer.

    The previous hidden state goes through a ``MulLayer`` (element-wise
    multiplication with the weight vector ``W_hid_to_hid``) and is combined
    with the incoming activation by ``onlyRecurrentLayer``. The input is not
    transformed here: ``incoming`` is only wrapped in a bare ``InputLayer``.

    Parameters
    ----------
    incoming : Layer or shape tuple
        Source of the input sequence.
    num_units : int
        Number of hidden units; size of the recurrent weight vector.
    W_hid_to_hid : initializer
        Initializer for the recurrent weight vector.

    All remaining arguments are passed unchanged to
    ``onlyRecurrentLayer.__init__``.
    """

    def __init__(self, incoming, num_units,
                 W_hid_to_hid=init.Uniform(),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # Child layers are named after this layer, but must not be handed
        # the 'name' keyword itself.
        name_prefix = (kwargs['name'] + '.') if 'name' in kwargs else ''
        child_kwargs = dict(
            (key, arg) for key, arg in kwargs.items() if key != 'name')

        # The input is used as it is, so a bare InputLayer stands in for
        # the input-to-hidden connection.
        pass_through = InputLayer(input_shape)

        # The recurrent connection works on the previous hidden state,
        # which has num_units features.
        recurrent = MulLayer(InputLayer((None, num_units)),
                             W=W_hid_to_hid,
                             name=name_prefix + 'hidden_to_hidden',
                             **child_kwargs)

        # Expose the recurrent weights directly on this layer.
        self.W_hid_to_hid = recurrent.W

        super(IndRNNLayer_onlyrecurrent, self).__init__(
            incoming, pass_through, recurrent,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
import numpy as np
import theano
import theano.tensor as T
import lasagne.nonlinearities as nonlinearities
import lasagne.init as init
from lasagne.utils import unroll_scan

from lasagne.layers import MergeLayer, Layer
from lasagne.layers import InputLayer
from lasagne.layers import DenseLayer
from lasagne.layers import helper
import lasagne

__all__ = [
    "onlyRecurrentLayer",
    "MulLayer",
    "IndRNNLayer_onlyrecurrent"
]


class MulLayer(lasagne.layers.Layer):
    """Scale every feature of a 2D input by its own learnable weight.

    ``W`` has one entry per feature (``input_shape[1]``) and is multiplied
    element-wise with the input, so the output shape equals the input shape.
    """

    def __init__(self, incoming, W=lasagne.init.Normal(0.01), **kwargs):
        super(MulLayer, self).__init__(incoming, **kwargs)
        num_inputs = self.input_shape[1]
        self.W = self.add_param(W, (num_inputs, ), name='W')

    def get_output_for(self, input, **kwargs):
        return input * self.W

    def get_output_shape_for(self, input_shape):
        return input_shape


class onlyRecurrentLayer(MergeLayer):
    """
    Recurrent layer that adds its input directly to the recurrent term.

    This is slightly different from the CustomRecurrentLayer of Lasagne by
    removing the computation of input: at every step the output of
    ``hidden_to_hidden`` for the previous hidden state is added to the
    current input slice and passed through ``nonlinearity``.

    The input is consumed time-major: it is iterated over its first axis and
    its second axis is taken as the batch size.

    Parameters
    ----------
    incoming : Layer or shape tuple
        Source of the input sequence.
    input_to_hidden : Layer
        Must contain exactly one ``InputLayer``. It is validated and its
        parameters are reported by ``get_params``; it is not applied to the
        input in ``get_output_for``.
    hidden_to_hidden : Layer
        Network applied to the previous hidden state; must contain exactly
        one ``InputLayer``.
    nonlinearity : callable or None
        Applied to the pre-activation; ``None`` selects the identity.
    hid_init : Layer or initializer
        Initial hidden state. A ``Layer`` becomes an extra incoming layer,
        anything else is registered as a ``(1, ...)`` parameter.
    backwards : bool
        Passed to scan as ``go_backwards``; the output is flipped back to
        time order afterwards.
    learn_init : bool
        Whether the ``hid_init`` parameter is trainable.
    gradient_steps : int
        Passed to ``theano.scan`` as ``truncate_gradient``.
    grad_clipping : float
        If non-zero, the pre-activation gradient is clipped to
        ``[-grad_clipping, grad_clipping]``.
    unroll_scan : bool
        Use ``lasagne.utils.unroll_scan`` instead of ``theano.scan``.
    precompute_input : bool
        Stored on the instance; not used by this layer's computation.
    mask_input : Layer or None
        Optional mask layer.
    only_return_final : bool
        Return only the last hidden state.

    Raises
    ------
    ValueError
        If either sub-network does not have exactly one ``InputLayer``, or
        if ``unroll_scan`` is combined with ``gradient_steps != -1``.
    """
    def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # This is a MergeLayer because it can have up to three inputs: the
        # layer input, the mask and the initial hidden state. Only the
        # layer input is always present; the indices of the other two stay
        # at -1 unless they were supplied.
        incomings = [incoming]
        self.mask_incoming_index = -1
        self.hid_init_incoming_index = -1
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings)-1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings)-1

        super(onlyRecurrentLayer, self).__init__(incomings, **kwargs)

        input_to_hidden_in_layers = \
            [layer for layer in helper.get_all_layers(input_to_hidden)
             if isinstance(layer, InputLayer)]
        if len(input_to_hidden_in_layers) != 1:
            raise ValueError(
                '`input_to_hidden` must have exactly one InputLayer, but it '
                'has {}'.format(len(input_to_hidden_in_layers)))

        hidden_to_hidden_in_lyrs = \
            [layer for layer in helper.get_all_layers(hidden_to_hidden)
             if isinstance(layer, InputLayer)]
        if len(hidden_to_hidden_in_lyrs) != 1:
            raise ValueError(
                '`hidden_to_hidden` must have exactly one InputLayer, but it '
                'has {}'.format(len(hidden_to_hidden_in_lyrs)))

        self.input_to_hidden = input_to_hidden
        self.hidden_to_hidden = hidden_to_hidden
        self.learn_init = learn_init
        self.backwards = backwards
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        # Initialize hidden state
        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(
                hid_init, (1,) + hidden_to_hidden.output_shape[1:],
                name="hid_init", trainable=learn_init, regularizable=False)

    def get_params(self, **tags):
        """Return this layer's parameters followed by the child layers'."""
        params = super(onlyRecurrentLayer, self).get_params(**tags)
        params += helper.get_all_params(self.input_to_hidden, **tags)
        params += helper.get_all_params(self.hidden_to_hidden, **tags)
        return params

    def get_output_shape_for(self, input_shapes):
        """Return the output shape for a time-major input.

        ``input_shapes[0]`` is the shape of the layer input (whether or not
        a mask is used), read as ``(n_steps, n_batch, ...)``. The trailing
        dimensions are those of the hidden-to-hidden output.
        """
        n_steps, n_batch = input_shapes[0][0], input_shapes[0][1]
        hidden_dims = self.hidden_to_hidden.output_shape[1:]
        if self.only_return_final:
            # get_output_for returns hid_out[-1], which drops the leading
            # time axis and keeps the batch axis (the previous code
            # reported the time length here).
            return (n_batch,) + hidden_dims
        return (n_steps, n_batch) + hidden_dims

    def get_output_for(self, inputs, **kwargs):
        """Run the recurrence and return the hidden states.

        ``inputs[0]`` is the layer input, consumed as
        ``(n_time_steps, n_batch, ...)``; mask and initial hidden state are
        read from ``inputs`` at their stored indices when present. Extra
        keyword arguments are forwarded to ``helper.get_output``.

        Returns ``hid_out[-1]`` when ``only_return_final`` is set, otherwise
        all hidden states stacked along the first axis.

        Raises ``ValueError`` if ``unroll_scan`` is set but the sequence
        length (first input dimension) is ``None``.
        """
        layer_input = inputs[0]
        mask = None
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # The input is used time-major as given (no dimshuffle), so the
        # batch size is found on the second axis.
        num_batch = layer_input.shape[1]

        # The hidden-to-hidden parameters are always handed to the step
        # function as non-sequences (scan is called with strict=True).
        non_seqs = helper.get_all_params(self.hidden_to_hidden)

        def step(input_n, hid_previous, *args):
            # Recurrent term plus the current input slice.
            hid_pre = helper.get_output(
                self.hidden_to_hidden, hid_previous, **kwargs)
            hid_pre += input_n

            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(
                    hid_pre, -self.grad_clipping, self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Where the mask is 0 the previous hidden state is carried over.
            hid = step(input_n, hid_previous, *args)
            return [T.switch(mask_n, hid, hid_previous)]

        if mask is not None:
            # NOTE(review): the mask is still transposed from
            # (n_batch, n_time_steps) although the input no longer is --
            # confirm that callers pass a batch-major mask.
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [layer_input, mask]
            step_fun = step_masked
        else:
            sequences = layer_input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # Repeat the (1, ...) initial state num_batch times along the
            # first axis; a dot product with a column of ones is used
            # instead of T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # The sequences are indexed along their first axis, so that is
            # the length to unroll. The previous code used input_shape[1],
            # which is the batch size for a time-major input.
            n_steps = self.input_shapes[0][0]
            if n_steps is None:
                raise ValueError("Input sequence length cannot be specified "
                                 "as None when unroll_scan is True")
            hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=n_steps)[0]
        else:
            hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True)[0]

        # The final step has to be sliced out right after scan, before any
        # re-ordering of the time axis.
        if self.only_return_final:
            return hid_out[-1]

        if self.backwards:
            # A backward scan emits steps in reverse; restore time order.
            hid_out = hid_out[::-1, :]

        return hid_out
class IndRNNLayer_onlyrecurrent(onlyRecurrentLayer):
    """Recurrent part of an IndRNN layer.

    The previous hidden state goes through a ``MulLayer`` (element-wise
    multiplication with the weight vector ``W_hid_to_hid``) and is combined
    with the incoming activation by ``onlyRecurrentLayer``. The input is not
    transformed here: ``incoming`` is only wrapped in a bare ``InputLayer``.

    Parameters
    ----------
    incoming : Layer or shape tuple
        Source of the input sequence.
    num_units : int
        Number of hidden units; size of the recurrent weight vector.
    W_hid_to_hid : initializer
        Initializer for the recurrent weight vector.

    All remaining arguments are passed unchanged to
    ``onlyRecurrentLayer.__init__``.
    """

    def __init__(self, incoming, num_units,
                 W_hid_to_hid=init.Uniform(),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # Child layers are named after this layer, but must not be handed
        # the 'name' keyword itself.
        name_prefix = (kwargs['name'] + '.') if 'name' in kwargs else ''
        child_kwargs = dict(
            (key, arg) for key, arg in kwargs.items() if key != 'name')

        # The input is used as it is, so a bare InputLayer stands in for
        # the input-to-hidden connection.
        pass_through = InputLayer(input_shape)

        # The recurrent connection works on the previous hidden state,
        # which has num_units features.
        recurrent = MulLayer(InputLayer((None, num_units)),
                             W=W_hid_to_hid,
                             name=name_prefix + 'hidden_to_hidden',
                             **child_kwargs)

        # Expose the recurrent weights directly on this layer.
        self.W_hid_to_hid = recurrent.W

        super(IndRNNLayer_onlyrecurrent, self).__init__(
            incoming, pass_through, recurrent,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
import numpy as np
import theano
import theano.tensor as T
import lasagne.nonlinearities as nonlinearities
import lasagne.init as init
from lasagne.utils import unroll_scan

from lasagne.layers import MergeLayer, Layer
from lasagne.layers import InputLayer
from lasagne.layers import DenseLayer
from lasagne.layers import helper
import lasagne

__all__ = [
    "onlyRecurrentLayer",
    "MulLayer",
    "IndRNNLayer_onlyrecurrent"
]


class MulLayer(lasagne.layers.Layer):
    """Scale every feature of a 2D input by its own learnable weight.

    ``W`` has one entry per feature (``input_shape[1]``) and is multiplied
    element-wise with the input, so the output shape equals the input shape.
    """

    def __init__(self, incoming, W=lasagne.init.Normal(0.01), **kwargs):
        super(MulLayer, self).__init__(incoming, **kwargs)
        num_inputs = self.input_shape[1]
        self.W = self.add_param(W, (num_inputs, ), name='W')

    def get_output_for(self, input, **kwargs):
        return input * self.W

    def get_output_shape_for(self, input_shape):
        return input_shape


class onlyRecurrentLayer(MergeLayer):
    """
    Recurrent layer that adds its input directly to the recurrent term.

    This is slightly different from the CustomRecurrentLayer of Lasagne by
    removing the computation of input: at every step the output of
    ``hidden_to_hidden`` for the previous hidden state is added to the
    current input slice and passed through ``nonlinearity``.

    The input is consumed time-major: it is iterated over its first axis and
    its second axis is taken as the batch size.

    Parameters
    ----------
    incoming : Layer or shape tuple
        Source of the input sequence.
    input_to_hidden : Layer
        Must contain exactly one ``InputLayer``. It is validated and its
        parameters are reported by ``get_params``; it is not applied to the
        input in ``get_output_for``.
    hidden_to_hidden : Layer
        Network applied to the previous hidden state; must contain exactly
        one ``InputLayer``.
    nonlinearity : callable or None
        Applied to the pre-activation; ``None`` selects the identity.
    hid_init : Layer or initializer
        Initial hidden state. A ``Layer`` becomes an extra incoming layer,
        anything else is registered as a ``(1, ...)`` parameter.
    backwards : bool
        Passed to scan as ``go_backwards``; the output is flipped back to
        time order afterwards.
    learn_init : bool
        Whether the ``hid_init`` parameter is trainable.
    gradient_steps : int
        Passed to ``theano.scan`` as ``truncate_gradient``.
    grad_clipping : float
        If non-zero, the pre-activation gradient is clipped to
        ``[-grad_clipping, grad_clipping]``.
    unroll_scan : bool
        Use ``lasagne.utils.unroll_scan`` instead of ``theano.scan``.
    precompute_input : bool
        Stored on the instance; not used by this layer's computation.
    mask_input : Layer or None
        Optional mask layer.
    only_return_final : bool
        Return only the last hidden state.

    Raises
    ------
    ValueError
        If either sub-network does not have exactly one ``InputLayer``, or
        if ``unroll_scan`` is combined with ``gradient_steps != -1``.
    """
    def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # This is a MergeLayer because it can have up to three inputs: the
        # layer input, the mask and the initial hidden state. Only the
        # layer input is always present; the indices of the other two stay
        # at -1 unless they were supplied.
        incomings = [incoming]
        self.mask_incoming_index = -1
        self.hid_init_incoming_index = -1
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings)-1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings)-1

        super(onlyRecurrentLayer, self).__init__(incomings, **kwargs)

        input_to_hidden_in_layers = \
            [layer for layer in helper.get_all_layers(input_to_hidden)
             if isinstance(layer, InputLayer)]
        if len(input_to_hidden_in_layers) != 1:
            raise ValueError(
                '`input_to_hidden` must have exactly one InputLayer, but it '
                'has {}'.format(len(input_to_hidden_in_layers)))

        hidden_to_hidden_in_lyrs = \
            [layer for layer in helper.get_all_layers(hidden_to_hidden)
             if isinstance(layer, InputLayer)]
        if len(hidden_to_hidden_in_lyrs) != 1:
            raise ValueError(
                '`hidden_to_hidden` must have exactly one InputLayer, but it '
                'has {}'.format(len(hidden_to_hidden_in_lyrs)))

        self.input_to_hidden = input_to_hidden
        self.hidden_to_hidden = hidden_to_hidden
        self.learn_init = learn_init
        self.backwards = backwards
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        # Initialize hidden state
        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(
                hid_init, (1,) + hidden_to_hidden.output_shape[1:],
                name="hid_init", trainable=learn_init, regularizable=False)

    def get_params(self, **tags):
        """Return this layer's parameters followed by the child layers'."""
        params = super(onlyRecurrentLayer, self).get_params(**tags)
        params += helper.get_all_params(self.input_to_hidden, **tags)
        params += helper.get_all_params(self.hidden_to_hidden, **tags)
        return params

    def get_output_shape_for(self, input_shapes):
        """Return the output shape for a time-major input.

        ``input_shapes[0]`` is the shape of the layer input (whether or not
        a mask is used), read as ``(n_steps, n_batch, ...)``. The trailing
        dimensions are those of the hidden-to-hidden output.
        """
        n_steps, n_batch = input_shapes[0][0], input_shapes[0][1]
        hidden_dims = self.hidden_to_hidden.output_shape[1:]
        if self.only_return_final:
            # get_output_for returns hid_out[-1], which drops the leading
            # time axis and keeps the batch axis (the previous code
            # reported the time length here).
            return (n_batch,) + hidden_dims
        return (n_steps, n_batch) + hidden_dims

    def get_output_for(self, inputs, **kwargs):
        """Run the recurrence and return the hidden states.

        ``inputs[0]`` is the layer input, consumed as
        ``(n_time_steps, n_batch, ...)``; mask and initial hidden state are
        read from ``inputs`` at their stored indices when present. Extra
        keyword arguments are forwarded to ``helper.get_output``.

        Returns ``hid_out[-1]`` when ``only_return_final`` is set, otherwise
        all hidden states stacked along the first axis.

        Raises ``ValueError`` if ``unroll_scan`` is set but the sequence
        length (first input dimension) is ``None``.
        """
        layer_input = inputs[0]
        mask = None
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # The input is used time-major as given (no dimshuffle), so the
        # batch size is found on the second axis.
        num_batch = layer_input.shape[1]

        # The hidden-to-hidden parameters are always handed to the step
        # function as non-sequences (scan is called with strict=True).
        non_seqs = helper.get_all_params(self.hidden_to_hidden)

        def step(input_n, hid_previous, *args):
            # Recurrent term plus the current input slice.
            hid_pre = helper.get_output(
                self.hidden_to_hidden, hid_previous, **kwargs)
            hid_pre += input_n

            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(
                    hid_pre, -self.grad_clipping, self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Where the mask is 0 the previous hidden state is carried over.
            hid = step(input_n, hid_previous, *args)
            return [T.switch(mask_n, hid, hid_previous)]

        if mask is not None:
            # NOTE(review): the mask is still transposed from
            # (n_batch, n_time_steps) although the input no longer is --
            # confirm that callers pass a batch-major mask.
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [layer_input, mask]
            step_fun = step_masked
        else:
            sequences = layer_input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # Repeat the (1, ...) initial state num_batch times along the
            # first axis; a dot product with a column of ones is used
            # instead of T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # The sequences are indexed along their first axis, so that is
            # the length to unroll. The previous code used input_shape[1],
            # which is the batch size for a time-major input.
            n_steps = self.input_shapes[0][0]
            if n_steps is None:
                raise ValueError("Input sequence length cannot be specified "
                                 "as None when unroll_scan is True")
            hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=n_steps)[0]
        else:
            hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True)[0]

        # The final step has to be sliced out right after scan, before any
        # re-ordering of the time axis.
        if self.only_return_final:
            return hid_out[-1]

        if self.backwards:
            # A backward scan emits steps in reverse; restore time order.
            hid_out = hid_out[::-1, :]

        return hid_out
IndRNNLayer_onlyrecurrent(onlyRecurrentLayer): 254 | 255 | def __init__(self, incoming, num_units, 256 | #W_in_to_hid=init.Uniform(), 257 | W_hid_to_hid=init.Uniform(), 258 | #b=init.Constant(0.), 259 | nonlinearity=nonlinearities.rectify, 260 | hid_init=init.Constant(0.), 261 | backwards=False, 262 | learn_init=False, 263 | gradient_steps=-1, 264 | grad_clipping=0, 265 | unroll_scan=False, 266 | precompute_input=True, 267 | mask_input=None, 268 | only_return_final=False, 269 | **kwargs): 270 | 271 | if isinstance(incoming, tuple): 272 | input_shape = incoming 273 | else: 274 | input_shape = incoming.output_shape 275 | # Retrieve the supplied name, if it exists; otherwise use '' 276 | if 'name' in kwargs: 277 | basename = kwargs['name'] + '.' 278 | # Create a separate version of kwargs for the contained layers 279 | # which does not include 'name' 280 | layer_kwargs = dict((key, arg) for key, arg in kwargs.items() 281 | if key != 'name') 282 | else: 283 | basename = '' 284 | layer_kwargs = kwargs 285 | # We will be passing the input at each time step to the dense layer, 286 | # so we need to remove the second dimension (the time dimension) 287 | in_to_hid = InputLayer(input_shape) 288 | 289 | # in_to_hid = DenseLayer(InputLayer((None,) + input_shape[2:]), 290 | # num_units, W=W_in_to_hid, b=b, 291 | # nonlinearity=None, 292 | # name=basename + 'input_to_hidden', 293 | # **layer_kwargs) 294 | # The hidden-to-hidden layer expects its inputs to have num_units 295 | # features because it recycles the previous hidden state 296 | 297 | hid_to_hid = MulLayer(InputLayer((None, num_units)), 298 | W=W_hid_to_hid, 299 | name=basename + 'hidden_to_hidden', 300 | **layer_kwargs) 301 | # hid_to_hid = DenseLayer(InputLayer((None, num_units)), 302 | # num_units, W=W_hid_to_hid, b=None, 303 | # nonlinearity=None, 304 | # name=basename + 'hidden_to_hidden', 305 | # **layer_kwargs) 306 | 307 | # Make child layer parameters intuitively accessible 308 | #self.W_in_to_hid = in_to_hid.W 
309 | self.W_hid_to_hid = hid_to_hid.W 310 | #self.b = in_to_hid.b 311 | 312 | # Just use the CustomRecurrentLayer with the DenseLayers we created 313 | super(IndRNNLayer_onlyrecurrent, self).__init__( 314 | incoming, in_to_hid, hid_to_hid, nonlinearity=nonlinearity, 315 | hid_init=hid_init, backwards=backwards, learn_init=learn_init, 316 | gradient_steps=gradient_steps, 317 | grad_clipping=grad_clipping, unroll_scan=unroll_scan, 318 | precompute_input=precompute_input, mask_input=mask_input, 319 | only_return_final=only_return_final, **kwargs) 320 | --------------------------------------------------------------------------------