├── wordPTB
    ├── data
    │   ├── .gitgnore~
    │   └── .gitgnore
    ├── results
    │   └── wPTB.PNG
    ├── README.md
    ├── IndRNN.py
    ├── BatchNorm_step_timefirst.py
    ├── reader.py
    ├── bn_eachstep_withdrop_timefirst.py
    └── IndRNN_onlyrecurrent.py
├── action recognition
    ├── __init__.py
    ├── README.md
    ├── opts.py
    ├── Indrnn_action_network.py
    ├── IndRNN.py
    ├── data_reader_numpy_test.py
    ├── Indrnn_action_train.py
    ├── data_reader_numpy_witheval.py
    └── IndRNN_onlyrecurrent.py
├── .gitignore
├── cPTB
    ├── data
    │   └── .gitignore
    ├── results
    │   └── cPTB.PNG
    ├── README.md
    ├── IndRNN.py
    ├── BatchNorm_step_timefirst.py
    ├── reader.py
    ├── bn_eachstep_withdrop_timefirst.py
    ├── penntree_charlevel_rernn.py
    └── IndRNN_onlyrecurrent.py
├── Independently Recurrent Neural Network (IndRNN) Building A Longer and Deeper RNN.pdf
├── IndRNN.py
├── adding
    ├── IndRNN.py
    └── adding.py
├── mnist
    ├── IndRNN.py
    ├── pixelmnist.py
    ├── Data_gen.py
    ├── Data_gen_permute.py
    └── IndRNN_onlyrecurrent.py
├── README.md
└── IndRNN_onlyrecurrent.py


/wordPTB/data/.gitgnore~:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/action recognition/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.py[cod]
3 | *$py.class
4 | 


--------------------------------------------------------------------------------
/cPTB/data/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | 


--------------------------------------------------------------------------------
/wordPTB/data/.gitgnore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | 


--------------------------------------------------------------------------------
/cPTB/results/cPTB.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sunnydreamrain/IndRNN_Theano_Lasagne/HEAD/cPTB/results/cPTB.PNG


--------------------------------------------------------------------------------
/wordPTB/results/wPTB.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sunnydreamrain/IndRNN_Theano_Lasagne/HEAD/wordPTB/results/wPTB.PNG


--------------------------------------------------------------------------------
/Independently Recurrent Neural Network (IndRNN) Building A Longer and Deeper RNN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sunnydreamrain/IndRNN_Theano_Lasagne/HEAD/Independently Recurrent Neural Network (IndRNN) Building A Longer and Deeper RNN.pdf


--------------------------------------------------------------------------------
/action recognition/README.md:
--------------------------------------------------------------------------------
 1 | ## The skeleton-based Action Recognition example  
 2 | ### Usage  
 3 | 1, First, prepare the data. There are two ways:  
 4 |   (1) Use your own data reader. Change the code at [Indrnn_action_train.py](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/blob/master/action%20recognition/Indrnn_action_train.py#L69)   
 5 |   (2) Use the provided data reader. Generate the data ndarray. Download the NTU RGB+D dataset, change the skeleton into a ndarray, and keep the length and label of each data entry.  
 6 | 2, Run the code. Add the Theano flags if using GPU. `THEANO_FLAGS='floatX=float32,device=cuda0,mode=FAST_RUN' `
 7 |    `python -u Indrnn_action_train.py --use_bn_afterrnn --use_dropout --droprate 0.25 --use_weightdecay_nohiddenW` 
 8 |    To use the CV test setting, add `--test_CV`. For example:  
 9 |    `python -u Indrnn_action_train.py --test_CV --use_bn_afterrnn --use_dropout --droprate 0.1 --use_weightdecay_nohiddenW` 
10 |    
11 | ### Considerations
12 | 1, A sequence length of 20 is usually used for this dataset. Since this is short, there is no need to impose the constraint on the recurrent weight (similar results are obtained with it).  
13 | 


--------------------------------------------------------------------------------
/action recognition/opts.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import numpy as np
 3 | 
 4 | 
 5 | def train_opts(parser):
 6 |   parser.add_argument('--lr', type=np.float32, default=2e-4,help='lr')
 7 |   parser.add_argument('--batch_size', type=int, default=128, help='batch_size')
 8 |   parser.add_argument('--seq_len', type=int, default=20)
 9 |   parser.add_argument('--num_layers', type=int, default=6,help='num_layers')
10 |   parser.add_argument('--hidden_units', type=int, default=512)
11 |   parser.add_argument('--test_CV', action='store_true', default=False,help='use the CS test setting. If True, then use CV test setting.')
12 |   parser.add_argument('--use_weightdecay_nohiddenW', action='store_true', default=False)
13 |   parser.add_argument('--decayrate', type=np.float32, default=1e-4,help='lr')
14 | 
15 | 
16 |   parser.add_argument('--use_bn_afterrnn', action='store_true', default=False)
17 | 
18 | 
19 | 
20 |   parser.add_argument('--ini_in2hid', type=np.float32, default=0.002)
21 | 
22 |   parser.add_argument('--constrain_U', action='store_true', default=False)
23 |   parser.add_argument('--MAG', type=np.float32, default=5.0)
24 | 
25 |   parser.add_argument('--rotation_aug', action='store_true', default=False)
26 |   parser.add_argument('--eval_fold', type=int, default=5)
27 |   parser.add_argument('--ini_b', type=np.float32, default=0.0)
28 |   parser.add_argument('--end_rate', type=np.float32, default=1e-6)
29 | 
30 |   
31 |   
32 |   
33 |   parser.add_argument('--use_dropout', action='store_true', default=False)
34 |   parser.add_argument('--droprate', type=np.float32, default=0.1,help='lr')
35 |   parser.add_argument('--rec_drop', action='store_true', default=False)
36 |   parser.add_argument('--drop_layers', type=int, default=1)
37 |   parser.add_argument('--conv_drop', action='store_true', default=False)
38 |   
39 |   
40 |   
41 |   
42 |   
43 | 


--------------------------------------------------------------------------------
/cPTB/README.md:
--------------------------------------------------------------------------------
 1 | ## The language modeling example using character-level Penn Treebank (PTB-c)  
 2 | ### Usage
 3 | 1, First, download the data and add it to the `data` folder.  
 4 | >> The PTB dataset used comes from Tomas Mikolov's webpage:  
 5 | >> http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz  
 6 | 
 7 | 2, Run the code using the generally stacked IndRNN or the residual IndRNN.  
 8 | 
 9 | >> Stacked IndRNN: `python -u penntree_charlevel_rernn.py --data_aug --hidden_units 2000 --use_dropout --num_layers 6 --droprate 0.25 --seq_len 150 --use_weightdecay_nohiddenW`  
10 |  
11 | >> Residual IndRNN: `python -u penntree_charlevel_rernn.py --data_aug --hidden_units 2000 --use_residual --num_layers 11 --use_dropout --droprate 0.3 --seq_len 150 --use_weightdecay_nohiddenW`  
12 | >> The example code provides the very basic implementation of residual IndRNN where the number of units in all the IndRNN layers are the same and the left branch is fixed to be 1 without further using weight processing. Other network architectures can be explored which may provide better results.
13 | 
14 | >> For this task, output is provided at each time step and can only use the information before the current time step. Therefore, the statistics (mean and variance) of the batch normalization (BN) are obtained for each time step. BN is applied before the activation, which is more robust than applying it after the activation. The main reason is that the outputs of all the IndRNN layers at the last time step are further used as the initialization of the next batch. By putting BN before the activation (which is also before the recurrent accumulation), the statistics of BN are more stable than when BN is put after the activation.  
15 | 
16 | >> `data_aug` here only provides different start for each training epoch to provide stable statistics for BN.  
17 | 
18 | 3, Longer sequences perform better in our experiments, showing that longer dependencies help.   
19 | 
20 | ![alt text](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/blob/master/cPTB/results/cPTB.PNG)
21 | 


--------------------------------------------------------------------------------
/wordPTB/README.md:
--------------------------------------------------------------------------------
 1 | ## The language modeling example using word-level Penn Treebank (PTB-w)  
 2 | ### Usage
 3 | 1, First, download the data and add it to the `data` folder.  
 4 | >> The PTB dataset used comes from Tomas Mikolov's webpage:  
 5 | >> http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz  
 6 | 
 7 | 2, Run the code using the generally stacked IndRNN or the residual IndRNN.  
 8 | 
 9 | >> Residual IndRNN: `python -u penntree_wordlevel_rernn_WT.py --word_level --w_tying --data_aug --lr 5e-4 --seq_len 50 --use_residual --num_layers 11 --hidden_units 2000 --batch_size 128 --use_dropout --drop_embedding --droprate_last 0.8 --droprate 0.35 --ini_normal --U_std 0.2 --U_mean 0.4 --ini_last 0.03`  
10 | >> The example code provides the very basic implementation of residual IndRNN where the number of units in all the IndRNN layers are the same and the left branch is fixed to be 1 without further using weight processing. Other network architectures can be explored which may provide better results.
11 | 
12 | >> For this task, output is provided at each time step and can only use the information before the current time step. Therefore, the statistics (mean and variance) of the batch normalization (BN) are obtained for each time step. BN is applied before the activation, which is more robust than applying it after the activation. The main reason is that the outputs of all the IndRNN layers at the last time step are further used as the initialization of the next batch. By putting BN before the activation (which is also before the recurrent accumulation), the statistics of BN are more stable than when BN is put after the activation.  
13 | 
14 | >> `data_aug` here only provides different start for each training epoch to provide stable statistics for BN.  
15 | 
16 | 3, This is a rather small dataset for word-level language modelling, so initialization matters. The example configuration may not be the best, but it shows better performance than the existing models (except neural architecture search, which constructs new models while learning).   
17 | 
18 | ![alt text](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/blob/master/wordPTB/results/wPTB.PNG)
19 | 


--------------------------------------------------------------------------------
/action recognition/Indrnn_action_network.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import sys
 3 | import argparse
 4 | import os
 5 | import time
 6 | import theano
 7 | import numpy as np
 8 | import theano.tensor as T
 9 | import lasagne
10 | from lasagne.layers import InputLayer,ReshapeLayer,DimshuffleLayer,Gate,BatchNormLayer
11 | from lasagne.layers import DenseLayer,ElemwiseSumLayer,SliceLayer
12 | from lasagne.layers import ConcatLayer,NonlinearityLayer,DropoutLayer
13 | from lasagne.nonlinearities import softmax, rectify,tanh
14 | from lasagne.init import Uniform,Normal,HeNormal
15 | 
16 | from IndRNN_onlyrecurrent import IndRNNLayer_onlyrecurrent as indrnn_onlyrecurrent
17 | 
18 | import opts
19 | 
20 | sys.setrecursionlimit(10000)
21 | parser = argparse.ArgumentParser(description='network')
22 | 
23 | #parser.set_defaults(use_weightdecay_nohiddenW=True, use_bn=True, use_birnn=True)
24 | 
25 | opts.train_opts(parser)
26 | args = parser.parse_args()
27 | 
28 | batch_size = args.batch_size
29 | seq_len=args.seq_len
30 | num_layers=args.num_layers
31 | hidden_units=args.hidden_units
32 | outputclass=60
33 | indim=50#150
34 | droprate=args.droprate
35 | gradclipvalue=10
36 | act=rectify
37 | U_bound=pow(args.MAG, 1.0 / seq_len)
38 | U_lowbound=pow(1.0/args.MAG, 1.0 / seq_len)
39 |   
40 | rnnmodel=indrnn_onlyrecurrent
41 | 
42 | 
43 | ini_W=HeNormal(gain=np.sqrt(2)/np.sqrt(args.seq_len))
44 | if args.use_bn_afterrnn:
45 |   ini_W=Uniform(args.ini_in2hid)
46 |   
47 | def build_indrnn_network(X_sym):
48 |     net = {}        
49 |     net['input0'] = InputLayer((batch_size, seq_len, indim, 3),X_sym)
50 |     net['input']=ReshapeLayer(net['input0'], (batch_size, seq_len, indim*3))    
51 |     net['rnn0']=DimshuffleLayer(net['input'],(1,0,2))
52 |     for l in range(1, num_layers+1):
53 |       hidini=0
54 |       if l==num_layers:
55 |         hidini=U_lowbound
56 |       net['rnn%d'%(l-1)]=ReshapeLayer(net['rnn%d'%(l-1)], (batch_size* seq_len, -1))                
57 |       net['rnn%d'%(l-1)]=DenseLayer(net['rnn%d'%(l-1)],hidden_units,W=ini_W,b=lasagne.init.Constant(args.ini_b),nonlinearity=None)         #
58 |       net['rnn%d'%(l-1)]=ReshapeLayer(net['rnn%d'%(l-1)], (seq_len, batch_size,  -1))  
59 |       if args.conv_drop:
60 |         net['rnn%d'%(l-1)]=DropoutLayer(net['rnn%d'%(l-1)], p=droprate, shared_axes=(0,))    
61 |       net['rnn%d'%l]=net['rnn%d'%(l-1)]
62 |       if not args.use_bn_afterrnn:
63 |         net['rnn%d'%l]=BatchNormLayer(net['rnn%d'%l],beta=lasagne.init.Constant(args.ini_b),axes= (0,1))    
64 |                
65 |       net['rnn%d'%l]=rnnmodel(net['rnn%d'%l],hidden_units,W_hid_to_hid=Uniform(range=(hidini,U_bound)),nonlinearity=act,only_return_final=False, grad_clipping=gradclipvalue)
66 |                          
67 |       if args.use_bn_afterrnn:
68 |         net['rnn%d'%l]=BatchNormLayer(net['rnn%d'%l],axes= (0,1))
69 |       if args.use_dropout and l%args.drop_layers==0:
70 |         net['rnn%d'%l]=DropoutLayer(net['rnn%d'%l], p=droprate, shared_axes=(0,))        
71 |         
72 |     net['rnn%d'%num_layers]=lasagne.layers.SliceLayer(net['rnn%d'%num_layers],indices=-1, axis=0)      
73 |     net['out']=DenseLayer(net['rnn%d'%num_layers],outputclass,nonlinearity=softmax)
74 |     return net
75 | 


--------------------------------------------------------------------------------
/IndRNN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN. The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Please cite the following paper if you find it useful.
  6 | 
  7 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
  8 | @article{li2018independently,
  9 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 10 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 11 |   booktitle={CVPR2018},
 12 |   year={2018}
 13 | }
 14 | """
 15 | import numpy as np
 16 | import theano
 17 | import theano.tensor as T
 18 | import lasagne.nonlinearities as nonlinearities
 19 | import lasagne.init as init
 20 | from lasagne.utils import unroll_scan
 21 | 
 22 | from lasagne.layers import MergeLayer, Layer
 23 | from lasagne.layers import InputLayer
 24 | from lasagne.layers import DenseLayer
 25 | from lasagne.layers import CustomRecurrentLayer
 26 | import lasagne
 27 | 
 28 | __all__ = [
 29 |     "MulLayer",
 30 |     "IndRNNLayer"
 31 | ]
 32 | 
 33 | 
 34 | 
 35 | class MulLayer(lasagne.layers.Layer):
 36 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 37 |         super(MulLayer, self).__init__(incoming, **kwargs)
 38 |         num_inputs = self.input_shape[1]
 39 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 40 | 
 41 |     def get_output_for(self, input, **kwargs):
 42 |         return input * self.W
 43 | 
 44 |     def get_output_shape_for(self, input_shape):
 45 |         return input_shape#(input_shape[0], self.num_units)
 46 | 
 47 | 
 48 | class IndRNNLayer(CustomRecurrentLayer):
 49 | 
 50 |     def __init__(self, incoming, num_units,
 51 |                  W_in_to_hid=init.Uniform(),
 52 |                  W_hid_to_hid=init.Uniform(),
 53 |                  b=init.Constant(0.),
 54 |                  nonlinearity=nonlinearities.rectify,
 55 |                  hid_init=init.Constant(0.),
 56 |                  backwards=False,
 57 |                  learn_init=False,
 58 |                  gradient_steps=-1,
 59 |                  grad_clipping=0,
 60 |                  unroll_scan=False,
 61 |                  precompute_input=True,
 62 |                  mask_input=None,
 63 |                  only_return_final=False,
 64 |                  **kwargs):
 65 | 
 66 |         if isinstance(incoming, tuple):
 67 |             input_shape = incoming
 68 |         else:
 69 |             input_shape = incoming.output_shape
 70 |         # Retrieve the supplied name, if it exists; otherwise use ''
 71 |         if 'name' in kwargs:
 72 |             basename = kwargs['name'] + '.'
 73 |             # Create a separate version of kwargs for the contained layers
 74 |             # which does not include 'name'
 75 |             layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
 76 |                                 if key != 'name')
 77 |         else:
 78 |             basename = ''
 79 |             layer_kwargs = kwargs
 80 |         # We will be passing the input at each time step to the dense layer,
 81 |         # so we need to remove the second dimension (the time dimension)
 82 |         in_to_hid = DenseLayer(InputLayer((None,) + input_shape[2:]),
 83 |                                num_units, W=W_in_to_hid, b=b,
 84 |                                nonlinearity=None,
 85 |                                name=basename + 'input_to_hidden',
 86 |                                **layer_kwargs)
 87 |         # The hidden-to-hidden layer expects its inputs to have num_units
 88 |         # features because it recycles the previous hidden state
 89 |         
 90 |         hid_to_hid = MulLayer(InputLayer((None, num_units)),
 91 |                                  W=W_hid_to_hid, 
 92 |                                 name=basename + 'hidden_to_hidden',
 93 |                                 **layer_kwargs)
 94 | #         hid_to_hid = DenseLayer(InputLayer((None, num_units)),
 95 | #                                 num_units, W=W_hid_to_hid, b=None,
 96 | #                                 nonlinearity=None,
 97 | #                                 name=basename + 'hidden_to_hidden',
 98 | #                                 **layer_kwargs)
 99 | 
100 |         # Make child layer parameters intuitively accessible
101 |         self.W_in_to_hid = in_to_hid.W
102 |         self.W_hid_to_hid = hid_to_hid.W
103 |         self.b = in_to_hid.b
104 | 
105 |         # Just use the CustomRecurrentLayer with the DenseLayers we created
106 |         super(IndRNNLayer, self).__init__(
107 |             incoming, in_to_hid, hid_to_hid, nonlinearity=nonlinearity,
108 |             hid_init=hid_init, backwards=backwards, learn_init=learn_init,
109 |             gradient_steps=gradient_steps,
110 |             grad_clipping=grad_clipping, unroll_scan=unroll_scan,
111 |             precompute_input=precompute_input, mask_input=mask_input,
112 |             only_return_final=only_return_final, **kwargs)
113 | 
114 | 
115 | 


--------------------------------------------------------------------------------
/adding/IndRNN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN. The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Please cite the following paper if you find it useful.
  6 | 
  7 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
  8 | @article{li2018independently,
  9 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 10 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 11 |   booktitle={CVPR2018},
 12 |   year={2018}
 13 | }
 14 | """
 15 | import numpy as np
 16 | import theano
 17 | import theano.tensor as T
 18 | import lasagne.nonlinearities as nonlinearities
 19 | import lasagne.init as init
 20 | from lasagne.utils import unroll_scan
 21 | 
 22 | from lasagne.layers import MergeLayer, Layer
 23 | from lasagne.layers import InputLayer
 24 | from lasagne.layers import DenseLayer
 25 | from lasagne.layers import CustomRecurrentLayer
 26 | import lasagne
 27 | 
 28 | __all__ = [
 29 |     "MulLayer",
 30 |     "IndRNNLayer"
 31 | ]
 32 | 
 33 | 
 34 | 
 35 | class MulLayer(lasagne.layers.Layer):
 36 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 37 |         super(MulLayer, self).__init__(incoming, **kwargs)
 38 |         num_inputs = self.input_shape[1]
 39 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 40 | 
 41 |     def get_output_for(self, input, **kwargs):
 42 |         return input * self.W
 43 | 
 44 |     def get_output_shape_for(self, input_shape):
 45 |         return input_shape#(input_shape[0], self.num_units)
 46 | 
 47 | 
 48 | class IndRNNLayer(CustomRecurrentLayer):
 49 | 
 50 |     def __init__(self, incoming, num_units,
 51 |                  W_in_to_hid=init.Uniform(),
 52 |                  W_hid_to_hid=init.Uniform(),
 53 |                  b=init.Constant(0.),
 54 |                  nonlinearity=nonlinearities.rectify,
 55 |                  hid_init=init.Constant(0.),
 56 |                  backwards=False,
 57 |                  learn_init=False,
 58 |                  gradient_steps=-1,
 59 |                  grad_clipping=0,
 60 |                  unroll_scan=False,
 61 |                  precompute_input=True,
 62 |                  mask_input=None,
 63 |                  only_return_final=False,
 64 |                  **kwargs):
 65 | 
 66 |         if isinstance(incoming, tuple):
 67 |             input_shape = incoming
 68 |         else:
 69 |             input_shape = incoming.output_shape
 70 |         # Retrieve the supplied name, if it exists; otherwise use ''
 71 |         if 'name' in kwargs:
 72 |             basename = kwargs['name'] + '.'
 73 |             # Create a separate version of kwargs for the contained layers
 74 |             # which does not include 'name'
 75 |             layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
 76 |                                 if key != 'name')
 77 |         else:
 78 |             basename = ''
 79 |             layer_kwargs = kwargs
 80 |         # We will be passing the input at each time step to the dense layer,
 81 |         # so we need to remove the second dimension (the time dimension)
 82 |         in_to_hid = DenseLayer(InputLayer((None,) + input_shape[2:]),
 83 |                                num_units, W=W_in_to_hid, b=b,
 84 |                                nonlinearity=None,
 85 |                                name=basename + 'input_to_hidden',
 86 |                                **layer_kwargs)
 87 |         # The hidden-to-hidden layer expects its inputs to have num_units
 88 |         # features because it recycles the previous hidden state
 89 |         
 90 |         hid_to_hid = MulLayer(InputLayer((None, num_units)),
 91 |                                  W=W_hid_to_hid, 
 92 |                                 name=basename + 'hidden_to_hidden',
 93 |                                 **layer_kwargs)
 94 | #         hid_to_hid = DenseLayer(InputLayer((None, num_units)),
 95 | #                                 num_units, W=W_hid_to_hid, b=None,
 96 | #                                 nonlinearity=None,
 97 | #                                 name=basename + 'hidden_to_hidden',
 98 | #                                 **layer_kwargs)
 99 | 
100 |         # Make child layer parameters intuitively accessible
101 |         self.W_in_to_hid = in_to_hid.W
102 |         self.W_hid_to_hid = hid_to_hid.W
103 |         self.b = in_to_hid.b
104 | 
105 |         # Just use the CustomRecurrentLayer with the DenseLayers we created
106 |         super(IndRNNLayer, self).__init__(
107 |             incoming, in_to_hid, hid_to_hid, nonlinearity=nonlinearity,
108 |             hid_init=hid_init, backwards=backwards, learn_init=learn_init,
109 |             gradient_steps=gradient_steps,
110 |             grad_clipping=grad_clipping, unroll_scan=unroll_scan,
111 |             precompute_input=precompute_input, mask_input=mask_input,
112 |             only_return_final=only_return_final, **kwargs)
113 | 
114 | 
115 | 


--------------------------------------------------------------------------------
/cPTB/IndRNN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN. The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Please cite the following paper if you find it useful.
  6 | 
  7 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
  8 | @article{li2018independently,
  9 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 10 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 11 |   booktitle={CVPR2018},
 12 |   year={2018}
 13 | }
 14 | """
 15 | import numpy as np
 16 | import theano
 17 | import theano.tensor as T
 18 | import lasagne.nonlinearities as nonlinearities
 19 | import lasagne.init as init
 20 | from lasagne.utils import unroll_scan
 21 | 
 22 | from lasagne.layers import MergeLayer, Layer
 23 | from lasagne.layers import InputLayer
 24 | from lasagne.layers import DenseLayer
 25 | from lasagne.layers import CustomRecurrentLayer
 26 | import lasagne
 27 | 
 28 | __all__ = [
 29 |     "MulLayer",
 30 |     "IndRNNLayer"
 31 | ]
 32 | 
 33 | 
 34 | 
 35 | class MulLayer(lasagne.layers.Layer):
 36 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 37 |         super(MulLayer, self).__init__(incoming, **kwargs)
 38 |         num_inputs = self.input_shape[1]
 39 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 40 | 
 41 |     def get_output_for(self, input, **kwargs):
 42 |         return input * self.W
 43 | 
 44 |     def get_output_shape_for(self, input_shape):
 45 |         return input_shape#(input_shape[0], self.num_units)
 46 | 
 47 | 
 48 | class IndRNNLayer(CustomRecurrentLayer):
 49 | 
 50 |     def __init__(self, incoming, num_units,
 51 |                  W_in_to_hid=init.Uniform(),
 52 |                  W_hid_to_hid=init.Uniform(),
 53 |                  b=init.Constant(0.),
 54 |                  nonlinearity=nonlinearities.rectify,
 55 |                  hid_init=init.Constant(0.),
 56 |                  backwards=False,
 57 |                  learn_init=False,
 58 |                  gradient_steps=-1,
 59 |                  grad_clipping=0,
 60 |                  unroll_scan=False,
 61 |                  precompute_input=True,
 62 |                  mask_input=None,
 63 |                  only_return_final=False,
 64 |                  **kwargs):
 65 | 
 66 |         if isinstance(incoming, tuple):
 67 |             input_shape = incoming
 68 |         else:
 69 |             input_shape = incoming.output_shape
 70 |         # Retrieve the supplied name, if it exists; otherwise use ''
 71 |         if 'name' in kwargs:
 72 |             basename = kwargs['name'] + '.'
 73 |             # Create a separate version of kwargs for the contained layers
 74 |             # which does not include 'name'
 75 |             layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
 76 |                                 if key != 'name')
 77 |         else:
 78 |             basename = ''
 79 |             layer_kwargs = kwargs
 80 |         # We will be passing the input at each time step to the dense layer,
 81 |         # so we need to remove the second dimension (the time dimension)
 82 |         in_to_hid = DenseLayer(InputLayer((None,) + input_shape[2:]),
 83 |                                num_units, W=W_in_to_hid, b=b,
 84 |                                nonlinearity=None,
 85 |                                name=basename + 'input_to_hidden',
 86 |                                **layer_kwargs)
 87 |         # The hidden-to-hidden layer expects its inputs to have num_units
 88 |         # features because it recycles the previous hidden state
 89 |         
 90 |         hid_to_hid = MulLayer(InputLayer((None, num_units)),
 91 |                                  W=W_hid_to_hid, 
 92 |                                 name=basename + 'hidden_to_hidden',
 93 |                                 **layer_kwargs)
 94 | #         hid_to_hid = DenseLayer(InputLayer((None, num_units)),
 95 | #                                 num_units, W=W_hid_to_hid, b=None,
 96 | #                                 nonlinearity=None,
 97 | #                                 name=basename + 'hidden_to_hidden',
 98 | #                                 **layer_kwargs)
 99 | 
100 |         # Make child layer parameters intuitively accessible
101 |         self.W_in_to_hid = in_to_hid.W
102 |         self.W_hid_to_hid = hid_to_hid.W
103 |         self.b = in_to_hid.b
104 | 
105 |         # Just use the CustomRecurrentLayer with the DenseLayers we created
106 |         super(IndRNNLayer, self).__init__(
107 |             incoming, in_to_hid, hid_to_hid, nonlinearity=nonlinearity,
108 |             hid_init=hid_init, backwards=backwards, learn_init=learn_init,
109 |             gradient_steps=gradient_steps,
110 |             grad_clipping=grad_clipping, unroll_scan=unroll_scan,
111 |             precompute_input=precompute_input, mask_input=mask_input,
112 |             only_return_final=only_return_final, **kwargs)
113 | 
114 | 
115 | 


--------------------------------------------------------------------------------
/mnist/IndRNN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN. The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Please cite the following paper if you find it useful.
  6 | 
  7 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
  8 | @article{li2018independently,
  9 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 10 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 11 |   booktitle={CVPR2018},
 12 |   year={2018}
 13 | }
 14 | """
 15 | import numpy as np
 16 | import theano
 17 | import theano.tensor as T
 18 | import lasagne.nonlinearities as nonlinearities
 19 | import lasagne.init as init
 20 | from lasagne.utils import unroll_scan
 21 | 
 22 | from lasagne.layers import MergeLayer, Layer
 23 | from lasagne.layers import InputLayer
 24 | from lasagne.layers import DenseLayer
 25 | from lasagne.layers import CustomRecurrentLayer
 26 | import lasagne
 27 | 
 28 | __all__ = [
 29 |     "MulLayer",
 30 |     "IndRNNLayer"
 31 | ]
 32 | 
 33 | 
 34 | 
 35 | class MulLayer(lasagne.layers.Layer):
 36 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 37 |         super(MulLayer, self).__init__(incoming, **kwargs)
 38 |         num_inputs = self.input_shape[1]
 39 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 40 | 
 41 |     def get_output_for(self, input, **kwargs):
 42 |         return input * self.W
 43 | 
 44 |     def get_output_shape_for(self, input_shape):
 45 |         return input_shape#(input_shape[0], self.num_units)
 46 | 
 47 | 
 48 | class IndRNNLayer(CustomRecurrentLayer):
 49 | 
 50 |     def __init__(self, incoming, num_units,
 51 |                  W_in_to_hid=init.Uniform(),
 52 |                  W_hid_to_hid=init.Uniform(),
 53 |                  b=init.Constant(0.),
 54 |                  nonlinearity=nonlinearities.rectify,
 55 |                  hid_init=init.Constant(0.),
 56 |                  backwards=False,
 57 |                  learn_init=False,
 58 |                  gradient_steps=-1,
 59 |                  grad_clipping=0,
 60 |                  unroll_scan=False,
 61 |                  precompute_input=True,
 62 |                  mask_input=None,
 63 |                  only_return_final=False,
 64 |                  **kwargs):
 65 | 
 66 |         if isinstance(incoming, tuple):
 67 |             input_shape = incoming
 68 |         else:
 69 |             input_shape = incoming.output_shape
 70 |         # Retrieve the supplied name, if it exists; otherwise use ''
 71 |         if 'name' in kwargs:
 72 |             basename = kwargs['name'] + '.'
 73 |             # Create a separate version of kwargs for the contained layers
 74 |             # which does not include 'name'
 75 |             layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
 76 |                                 if key != 'name')
 77 |         else:
 78 |             basename = ''
 79 |             layer_kwargs = kwargs
 80 |         # We will be passing the input at each time step to the dense layer,
 81 |         # so we need to remove the second dimension (the time dimension)
 82 |         in_to_hid = DenseLayer(InputLayer((None,) + input_shape[2:]),
 83 |                                num_units, W=W_in_to_hid, b=b,
 84 |                                nonlinearity=None,
 85 |                                name=basename + 'input_to_hidden',
 86 |                                **layer_kwargs)
 87 |         # The hidden-to-hidden layer expects its inputs to have num_units
 88 |         # features because it recycles the previous hidden state
 89 |         
 90 |         hid_to_hid = MulLayer(InputLayer((None, num_units)),
 91 |                                  W=W_hid_to_hid, 
 92 |                                 name=basename + 'hidden_to_hidden',
 93 |                                 **layer_kwargs)
 94 | #         hid_to_hid = DenseLayer(InputLayer((None, num_units)),
 95 | #                                 num_units, W=W_hid_to_hid, b=None,
 96 | #                                 nonlinearity=None,
 97 | #                                 name=basename + 'hidden_to_hidden',
 98 | #                                 **layer_kwargs)
 99 | 
100 |         # Make child layer parameters intuitively accessible
101 |         self.W_in_to_hid = in_to_hid.W
102 |         self.W_hid_to_hid = hid_to_hid.W
103 |         self.b = in_to_hid.b
104 | 
105 |         # Just use the CustomRecurrentLayer with the DenseLayers we created
106 |         super(IndRNNLayer, self).__init__(
107 |             incoming, in_to_hid, hid_to_hid, nonlinearity=nonlinearity,
108 |             hid_init=hid_init, backwards=backwards, learn_init=learn_init,
109 |             gradient_steps=gradient_steps,
110 |             grad_clipping=grad_clipping, unroll_scan=unroll_scan,
111 |             precompute_input=precompute_input, mask_input=mask_input,
112 |             only_return_final=only_return_final, **kwargs)
113 | 
114 | 
115 | 


--------------------------------------------------------------------------------
/wordPTB/IndRNN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN. The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Please cite the following paper if you find it useful.
  6 | 
  7 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
  8 | @article{li2018independently,
  9 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 10 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 11 |   booktitle={CVPR2018},
 12 |   year={2018}
 13 | }
 14 | """
 15 | import numpy as np
 16 | import theano
 17 | import theano.tensor as T
 18 | import lasagne.nonlinearities as nonlinearities
 19 | import lasagne.init as init
 20 | from lasagne.utils import unroll_scan
 21 | 
 22 | from lasagne.layers import MergeLayer, Layer
 23 | from lasagne.layers import InputLayer
 24 | from lasagne.layers import DenseLayer
 25 | from lasagne.layers import CustomRecurrentLayer
 26 | import lasagne
 27 | 
 28 | __all__ = [
 29 |     "MulLayer",
 30 |     "IndRNNLayer"
 31 | ]
 32 | 
 33 | 
 34 | 
 35 | class MulLayer(lasagne.layers.Layer):
 36 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 37 |         super(MulLayer, self).__init__(incoming, **kwargs)
 38 |         num_inputs = self.input_shape[1]
 39 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 40 | 
 41 |     def get_output_for(self, input, **kwargs):
 42 |         return input * self.W
 43 | 
 44 |     def get_output_shape_for(self, input_shape):
 45 |         return input_shape#(input_shape[0], self.num_units)
 46 | 
 47 | 
 48 | class IndRNNLayer(CustomRecurrentLayer):
 49 | 
 50 |     def __init__(self, incoming, num_units,
 51 |                  W_in_to_hid=init.Uniform(),
 52 |                  W_hid_to_hid=init.Uniform(),
 53 |                  b=init.Constant(0.),
 54 |                  nonlinearity=nonlinearities.rectify,
 55 |                  hid_init=init.Constant(0.),
 56 |                  backwards=False,
 57 |                  learn_init=False,
 58 |                  gradient_steps=-1,
 59 |                  grad_clipping=0,
 60 |                  unroll_scan=False,
 61 |                  precompute_input=True,
 62 |                  mask_input=None,
 63 |                  only_return_final=False,
 64 |                  **kwargs):
 65 | 
 66 |         if isinstance(incoming, tuple):
 67 |             input_shape = incoming
 68 |         else:
 69 |             input_shape = incoming.output_shape
 70 |         # Retrieve the supplied name, if it exists; otherwise use ''
 71 |         if 'name' in kwargs:
 72 |             basename = kwargs['name'] + '.'
 73 |             # Create a separate version of kwargs for the contained layers
 74 |             # which does not include 'name'
 75 |             layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
 76 |                                 if key != 'name')
 77 |         else:
 78 |             basename = ''
 79 |             layer_kwargs = kwargs
 80 |         # We will be passing the input at each time step to the dense layer,
 81 |         # so we need to remove the second dimension (the time dimension)
 82 |         in_to_hid = DenseLayer(InputLayer((None,) + input_shape[2:]),
 83 |                                num_units, W=W_in_to_hid, b=b,
 84 |                                nonlinearity=None,
 85 |                                name=basename + 'input_to_hidden',
 86 |                                **layer_kwargs)
 87 |         # The hidden-to-hidden layer expects its inputs to have num_units
 88 |         # features because it recycles the previous hidden state
 89 |         
 90 |         hid_to_hid = MulLayer(InputLayer((None, num_units)),
 91 |                                  W=W_hid_to_hid, 
 92 |                                 name=basename + 'hidden_to_hidden',
 93 |                                 **layer_kwargs)
 94 | #         hid_to_hid = DenseLayer(InputLayer((None, num_units)),
 95 | #                                 num_units, W=W_hid_to_hid, b=None,
 96 | #                                 nonlinearity=None,
 97 | #                                 name=basename + 'hidden_to_hidden',
 98 | #                                 **layer_kwargs)
 99 | 
100 |         # Make child layer parameters intuitively accessible
101 |         self.W_in_to_hid = in_to_hid.W
102 |         self.W_hid_to_hid = hid_to_hid.W
103 |         self.b = in_to_hid.b
104 | 
105 |         # Just use the CustomRecurrentLayer with the DenseLayers we created
106 |         super(IndRNNLayer, self).__init__(
107 |             incoming, in_to_hid, hid_to_hid, nonlinearity=nonlinearity,
108 |             hid_init=hid_init, backwards=backwards, learn_init=learn_init,
109 |             gradient_steps=gradient_steps,
110 |             grad_clipping=grad_clipping, unroll_scan=unroll_scan,
111 |             precompute_input=precompute_input, mask_input=mask_input,
112 |             only_return_final=only_return_final, **kwargs)
113 | 
114 | 
115 | 


--------------------------------------------------------------------------------
/action recognition/IndRNN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN. The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Please cite the following paper if you find it useful.
  6 | 
  7 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
  8 | @article{li2018independently,
  9 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 10 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 11 |   booktitle={CVPR2018},
 12 |   year={2018}
 13 | }
 14 | """
 15 | import numpy as np
 16 | import theano
 17 | import theano.tensor as T
 18 | import lasagne.nonlinearities as nonlinearities
 19 | import lasagne.init as init
 20 | from lasagne.utils import unroll_scan
 21 | 
 22 | from lasagne.layers import MergeLayer, Layer
 23 | from lasagne.layers import InputLayer
 24 | from lasagne.layers import DenseLayer
 25 | from lasagne.layers import CustomRecurrentLayer
 26 | import lasagne
 27 | 
 28 | __all__ = [
 29 |     "MulLayer",
 30 |     "IndRNNLayer"
 31 | ]
 32 | 
 33 | 
 34 | 
 35 | class MulLayer(lasagne.layers.Layer):
 36 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 37 |         super(MulLayer, self).__init__(incoming, **kwargs)
 38 |         num_inputs = self.input_shape[1]
 39 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 40 | 
 41 |     def get_output_for(self, input, **kwargs):
 42 |         return input * self.W
 43 | 
 44 |     def get_output_shape_for(self, input_shape):
 45 |         return input_shape#(input_shape[0], self.num_units)
 46 | 
 47 | 
 48 | class IndRNNLayer(CustomRecurrentLayer):
 49 | 
 50 |     def __init__(self, incoming, num_units,
 51 |                  W_in_to_hid=init.Uniform(),
 52 |                  W_hid_to_hid=init.Uniform(),
 53 |                  b=init.Constant(0.),
 54 |                  nonlinearity=nonlinearities.rectify,
 55 |                  hid_init=init.Constant(0.),
 56 |                  backwards=False,
 57 |                  learn_init=False,
 58 |                  gradient_steps=-1,
 59 |                  grad_clipping=0,
 60 |                  unroll_scan=False,
 61 |                  precompute_input=True,
 62 |                  mask_input=None,
 63 |                  only_return_final=False,
 64 |                  **kwargs):
 65 | 
 66 |         if isinstance(incoming, tuple):
 67 |             input_shape = incoming
 68 |         else:
 69 |             input_shape = incoming.output_shape
 70 |         # Retrieve the supplied name, if it exists; otherwise use ''
 71 |         if 'name' in kwargs:
 72 |             basename = kwargs['name'] + '.'
 73 |             # Create a separate version of kwargs for the contained layers
 74 |             # which does not include 'name'
 75 |             layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
 76 |                                 if key != 'name')
 77 |         else:
 78 |             basename = ''
 79 |             layer_kwargs = kwargs
 80 |         # We will be passing the input at each time step to the dense layer,
 81 |         # so we need to remove the second dimension (the time dimension)
 82 |         in_to_hid = DenseLayer(InputLayer((None,) + input_shape[2:]),
 83 |                                num_units, W=W_in_to_hid, b=b,
 84 |                                nonlinearity=None,
 85 |                                name=basename + 'input_to_hidden',
 86 |                                **layer_kwargs)
 87 |         # The hidden-to-hidden layer expects its inputs to have num_units
 88 |         # features because it recycles the previous hidden state
 89 |         
 90 |         hid_to_hid = MulLayer(InputLayer((None, num_units)),
 91 |                                  W=W_hid_to_hid, 
 92 |                                 name=basename + 'hidden_to_hidden',
 93 |                                 **layer_kwargs)
 94 | #         hid_to_hid = DenseLayer(InputLayer((None, num_units)),
 95 | #                                 num_units, W=W_hid_to_hid, b=None,
 96 | #                                 nonlinearity=None,
 97 | #                                 name=basename + 'hidden_to_hidden',
 98 | #                                 **layer_kwargs)
 99 | 
100 |         # Make child layer parameters intuitively accessible
101 |         self.W_in_to_hid = in_to_hid.W
102 |         self.W_hid_to_hid = hid_to_hid.W
103 |         self.b = in_to_hid.b
104 | 
105 |         # Just use the CustomRecurrentLayer with the DenseLayers we created
106 |         super(IndRNNLayer, self).__init__(
107 |             incoming, in_to_hid, hid_to_hid, nonlinearity=nonlinearity,
108 |             hid_init=hid_init, backwards=backwards, learn_init=learn_init,
109 |             gradient_steps=gradient_steps,
110 |             grad_clipping=grad_clipping, unroll_scan=unroll_scan,
111 |             precompute_input=precompute_input, mask_input=mask_input,
112 |             only_return_final=only_return_final, **kwargs)
113 | 
114 | 
115 | 


--------------------------------------------------------------------------------
/cPTB/BatchNorm_step_timefirst.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is adapted from the BatchNormLayer of Lasagne.
  4 | It implements the batch normalization function where the first dimension of the input is TIMESTEPS. It calculates (and averages over) mean and variance for each step over the batch_size dimension.
  5 | """
  6 | 
  7 | import theano
  8 | import theano.tensor as T
  9 | 
 10 | from lasagne import init
 11 | from lasagne import nonlinearities
 12 | 
 13 | from lasagne.layers import Layer
 14 | 
 15 | class BatchNorm_step_timefirst_Layer(Layer):
 16 |     def __init__(self, incoming, axes='auto', epsilon=1e-4, alpha=0.1,
 17 |                  beta=init.Constant(0), gamma=init.Constant(1),
 18 |                  mean=init.Constant(0), inv_std=init.Constant(1), **kwargs):
 19 |         super(BatchNorm_step_timefirst_Layer, self).__init__(incoming, **kwargs)
 20 | 
 21 |         if axes == 'auto':
 22 |             # default: normalize over all but the second axis
 23 |             axes = (0,) + tuple(range(2, len(self.input_shape)))
 24 |         elif isinstance(axes, int):
 25 |             axes = (axes,)
 26 |         self.axes = axes
 27 |         if len(axes)==1:
 28 |           self.mean_axes=self.axes
 29 |         else:
 30 |           self.mean_axes=(axes[1],)
 31 | 
 32 |         self.epsilon = epsilon
 33 |         self.alpha = alpha
 34 | 
 35 |         # create parameters, ignoring all dimensions in axes
 36 |         shape = [size for axis, size in enumerate(self.input_shape)
 37 |                  if axis not in self.axes]
 38 |         meanshape = [size for axis, size in enumerate(self.input_shape)
 39 |                  if axis not in self.mean_axes]
 40 |         if any(size is None for size in shape):
 41 |             raise ValueError("BatchNormLayer needs specified input sizes for "
 42 |                              "all axes not normalized over.")
 43 |         if beta is None:
 44 |             self.beta = None
 45 |         else:
 46 |             self.beta = self.add_param(beta, shape, 'beta',
 47 |                                        trainable=True, regularizable=False)
 48 |         if gamma is None:
 49 |             self.gamma = None
 50 |         else:
 51 |             self.gamma = self.add_param(gamma, shape, 'gamma',
 52 |                                         trainable=True, regularizable=True)
 53 |         self.mean = self.add_param(mean, meanshape, 'mean',
 54 |                                    trainable=False, regularizable=False)
 55 |         self.inv_std = self.add_param(inv_std, meanshape, 'inv_std',
 56 |                                       trainable=False, regularizable=False)
 57 | 
 58 |     def get_output_for(self, input, deterministic=False,
 59 |                        batch_norm_use_averages=None,
 60 |                        batch_norm_update_averages=None, **kwargs):
 61 |         input_mean = input.mean(self.mean_axes)
 62 |         input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) + self.epsilon))
 63 | 
 64 |         # Decide whether to use the stored averages or mini-batch statistics
 65 |         if batch_norm_use_averages is None:
 66 |             batch_norm_use_averages = deterministic
 67 |         use_averages = batch_norm_use_averages
 68 | 
 69 |         if use_averages:
 70 |             mean = self.mean
 71 |             inv_std = self.inv_std
 72 |         else:
 73 |             mean = input_mean
 74 |             inv_std = input_inv_std
 75 | 
 76 |         # Decide whether to update the stored averages
 77 |         if batch_norm_update_averages is None:
 78 |             batch_norm_update_averages = not deterministic
 79 |         update_averages = batch_norm_update_averages
 80 | 
 81 |         if update_averages:
 82 |             # Trick: To update the stored statistics, we create memory-aliased
 83 |             # clones of the stored statistics:
 84 |             running_mean = theano.clone(self.mean, share_inputs=False)
 85 |             running_inv_std = theano.clone(self.inv_std, share_inputs=False)
 86 |             # set a default update for them:
 87 |             running_mean.default_update = ((1 - self.alpha) * running_mean +
 88 |                                            self.alpha * input_mean)
 89 |             running_inv_std.default_update = ((1 - self.alpha) *
 90 |                                               running_inv_std +
 91 |                                               self.alpha * input_inv_std)
 92 |             # and make sure they end up in the graph without participating in
 93 |             # the computation (this way their default_update will be collected
 94 |             # and applied, but the computation will be optimized away):
 95 |             mean += 0 * running_mean
 96 |             inv_std += 0 * running_inv_std
 97 | 
 98 |         # prepare dimshuffle pattern inserting broadcastable axes as needed
 99 |         param_axes = iter(range(input.ndim - len(self.axes)))
100 |         pattern = ['x' if input_axis in self.axes
101 |                    else next(param_axes)
102 |                    for input_axis in range(input.ndim)]
103 | 
104 |         # apply dimshuffle pattern to all parameters
105 |         beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
106 |         gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
107 |         
108 |         mean_param_axes = iter(range(input.ndim - len(self.mean_axes)))
109 |         mean_pattern = ['x' if input_axis in self.mean_axes
110 |                    else next(mean_param_axes)
111 |                    for input_axis in range(input.ndim)]        
112 |         mean = mean.dimshuffle(mean_pattern)
113 |         inv_std = inv_std.dimshuffle(mean_pattern)
114 | 
115 |         # normalize
116 |         normalized = (input - mean) * (gamma * inv_std) + beta
117 |         return normalized
118 | 
119 | 


--------------------------------------------------------------------------------
/wordPTB/BatchNorm_step_timefirst.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is adapted from the BatchNormLayer of Lasagne.
  4 | It implements the batch normalization function where the first dimension of the input is TIMESTEPS. It calculates (and averages over) mean and variance for each step over the batch_size dimension.
  5 | """
  6 | 
  7 | import theano
  8 | import theano.tensor as T
  9 | 
 10 | from lasagne import init
 11 | from lasagne import nonlinearities
 12 | 
 13 | from lasagne.layers import Layer
 14 | 
 15 | class BatchNorm_step_timefirst_Layer(Layer):
 16 |     def __init__(self, incoming, axes='auto', epsilon=1e-4, alpha=0.1,
 17 |                  beta=init.Constant(0), gamma=init.Constant(1),
 18 |                  mean=init.Constant(0), inv_std=init.Constant(1), **kwargs):
 19 |         super(BatchNorm_step_timefirst_Layer, self).__init__(incoming, **kwargs)
 20 | 
 21 |         if axes == 'auto':
 22 |             # default: normalize over all but the second axis
 23 |             axes = (0,) + tuple(range(2, len(self.input_shape)))
 24 |         elif isinstance(axes, int):
 25 |             axes = (axes,)
 26 |         self.axes = axes
 27 |         if len(axes)==1:
 28 |           self.mean_axes=self.axes
 29 |         else:
 30 |           self.mean_axes=(axes[1],)
 31 | 
 32 |         self.epsilon = epsilon
 33 |         self.alpha = alpha
 34 | 
 35 |         # create parameters, ignoring all dimensions in axes
 36 |         shape = [size for axis, size in enumerate(self.input_shape)
 37 |                  if axis not in self.axes]
 38 |         meanshape = [size for axis, size in enumerate(self.input_shape)
 39 |                  if axis not in self.mean_axes]
 40 |         if any(size is None for size in shape):
 41 |             raise ValueError("BatchNormLayer needs specified input sizes for "
 42 |                              "all axes not normalized over.")
 43 |         if beta is None:
 44 |             self.beta = None
 45 |         else:
 46 |             self.beta = self.add_param(beta, shape, 'beta',
 47 |                                        trainable=True, regularizable=False)
 48 |         if gamma is None:
 49 |             self.gamma = None
 50 |         else:
 51 |             self.gamma = self.add_param(gamma, shape, 'gamma',
 52 |                                         trainable=True, regularizable=True)
 53 |         self.mean = self.add_param(mean, meanshape, 'mean',
 54 |                                    trainable=False, regularizable=False)
 55 |         self.inv_std = self.add_param(inv_std, meanshape, 'inv_std',
 56 |                                       trainable=False, regularizable=False)
 57 | 
 58 |     def get_output_for(self, input, deterministic=False,
 59 |                        batch_norm_use_averages=None,
 60 |                        batch_norm_update_averages=None, **kwargs):
 61 |         input_mean = input.mean(self.mean_axes)
 62 |         input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) + self.epsilon))
 63 | 
 64 |         # Decide whether to use the stored averages or mini-batch statistics
 65 |         if batch_norm_use_averages is None:
 66 |             batch_norm_use_averages = deterministic
 67 |         use_averages = batch_norm_use_averages
 68 | 
 69 |         if use_averages:
 70 |             mean = self.mean
 71 |             inv_std = self.inv_std
 72 |         else:
 73 |             mean = input_mean
 74 |             inv_std = input_inv_std
 75 | 
 76 |         # Decide whether to update the stored averages
 77 |         if batch_norm_update_averages is None:
 78 |             batch_norm_update_averages = not deterministic
 79 |         update_averages = batch_norm_update_averages
 80 | 
 81 |         if update_averages:
 82 |             # Trick: To update the stored statistics, we create memory-aliased
 83 |             # clones of the stored statistics:
 84 |             running_mean = theano.clone(self.mean, share_inputs=False)
 85 |             running_inv_std = theano.clone(self.inv_std, share_inputs=False)
 86 |             # set a default update for them:
 87 |             running_mean.default_update = ((1 - self.alpha) * running_mean +
 88 |                                            self.alpha * input_mean)
 89 |             running_inv_std.default_update = ((1 - self.alpha) *
 90 |                                               running_inv_std +
 91 |                                               self.alpha * input_inv_std)
 92 |             # and make sure they end up in the graph without participating in
 93 |             # the computation (this way their default_update will be collected
 94 |             # and applied, but the computation will be optimized away):
 95 |             mean += 0 * running_mean
 96 |             inv_std += 0 * running_inv_std
 97 | 
 98 |         # prepare dimshuffle pattern inserting broadcastable axes as needed
 99 |         param_axes = iter(range(input.ndim - len(self.axes)))
100 |         pattern = ['x' if input_axis in self.axes
101 |                    else next(param_axes)
102 |                    for input_axis in range(input.ndim)]
103 | 
104 |         # apply dimshuffle pattern to all parameters
105 |         beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
106 |         gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
107 |         
108 |         mean_param_axes = iter(range(input.ndim - len(self.mean_axes)))
109 |         mean_pattern = ['x' if input_axis in self.mean_axes
110 |                    else next(mean_param_axes)
111 |                    for input_axis in range(input.ndim)]        
112 |         mean = mean.dimshuffle(mean_pattern)
113 |         inv_std = inv_std.dimshuffle(mean_pattern)
114 | 
115 |         # normalize
116 |         normalized = (input - mean) * (gamma * inv_std) + beta
117 |         return normalized
118 | 
119 | 


--------------------------------------------------------------------------------
/action recognition/data_reader_numpy_test.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import h5py
  3 | import numpy as np
  4 | import time
  5 | import random
  6 | #import glob
  7 | #import skimage.transform
  8 | #from skimage import color
  9 | import pickle
 10 | import theano
 11 | #import cv2
 12 | from multiprocessing import Pool
 13 | from threading import Thread
 14 | import os.path
 15 | #RGB_frames = '/home/sl669/caffe/colordataset/ImageNET/ILSVRC2015/Data/CLS-LOC/val/'#'/home/sl669/caffe/ucf101/framearrays/'#
 16 | 
 17 | 
 18 | 
 19 | from __main__ import test_dataset
 20 | datasets=test_dataset
 21 | class batch_thread():
 22 |   def __init__(self, result, batch_size_,seq_len):#, datasets
 23 |     self.result = result
 24 |     self.batch_size_=batch_size_
 25 |     self.datasets = datasets   
 26 |     self.seq_len=seq_len
 27 |     self.idx=-1
 28 |     
 29 |     dataname=datasets+'.npy'
 30 |     labelname=datasets+'_label.npy'
 31 |     lenname=datasets+'_len.npy'
 32 |     self.data_handle=np.load(dataname)
 33 |     self.label_handle=np.load(labelname)
 34 |     self.len_handle=np.load(lenname) 
 35 |     
 36 |     self.num_videos = len(self.data_handle)    
 37 |     self.shufflevideolist=np.arange(self.num_videos)
 38 |     np.random.shuffle(self.shufflevideolist)
 39 | 
 40 |     print ('Dataset size', self.num_videos)
 41 |   
 42 |   def __call__(self):###Be careful.  The appended data may change like pointer.
 43 |     templabel=[] 
 44 |     batch_data=[]
 45 |     tempindex=[] 
 46 |     for j in range(self.batch_size_):
 47 |       self.idx +=1
 48 |       if self.idx == self.num_videos:
 49 |         self.idx =0
 50 |         np.random.shuffle(self.shufflevideolist)
 51 |       shufflevideoindex=self.shufflevideolist[self.idx]
 52 |       
 53 |       label=self.label_handle[shufflevideoindex]     
 54 |       templabel.append(np.int32(label))  
 55 |       tempindex.append(np.int32(shufflevideoindex)) 
 56 |       dataset=self.data_handle[shufflevideoindex]
 57 |       len_data=self.len_handle[shufflevideoindex]   
 58 |       
 59 |       sample=np.zeros(tuple((self.seq_len,)+self.data_handle[shufflevideoindex].shape[1:]))
 60 |       lenperseg=len_data//self.seq_len
 61 |       if lenperseg==1 and len_data>self.seq_len:
 62 |         startid=np.random.randint(len_data-self.seq_len)
 63 |         sample=dataset[startid:startid+self.seq_len]
 64 |       elif len_data<=self.seq_len:
 65 |         startid=np.random.randint(max(self.seq_len-len_data,int(0.25*self.seq_len)))
 66 |         endid=min(self.seq_len,startid+len_data)
 67 |         datasid=0
 68 |         dataeid=len_data
 69 |         if startid+len_data>self.seq_len:
 70 |           datasid=np.random.randint(startid+len_data-self.seq_len)
 71 |           dataeid=datasid+self.seq_len-startid
 72 |         sample[startid:endid]=dataset[datasid:dataeid]
 73 |       else:      
 74 |         for framei in range(self.seq_len):        
 75 |           if framei==self.seq_len-1:
 76 |             index=lenperseg*framei + np.random.randint(len_data-lenperseg*(self.seq_len-1))
 77 |           else:
 78 |             index=lenperseg*framei + np.random.randint(lenperseg)    
 79 |           sample[framei]=dataset[index]
 80 |         #print (index,lenperseg)  
 81 |         
 82 |       batch_data.append(sample) ###Be careful. It has to be different. Otherwise, the appended data will change as well.
 83 |       #print(batch_data)       
 84 | 
 85 |        
 86 |     self.result['data']=np.asarray(batch_data,dtype=np.float32)
 87 |     self.result['label']= np.asarray(templabel,dtype=np.int32)   
 88 |     self.result['index']= np.asarray(tempindex,dtype=np.int32)   
 89 |       
 90 |       
 91 |   def GetDatasetSize(self):
 92 |     return self.num_videos
 93 | 
 94 | 
 95 | 
 96 | class DataHandler(object):
 97 | 
 98 |   def __init__(self, batch_size, seq_len):#, datasets
 99 |     self.batch_size_ = batch_size		
100 |     #self.datasets = datasets    
101 |     random.seed(10)  
102 |     
103 |     self.thread_result = {}
104 |     self.thread = None
105 | 
106 |     self.batch_advancer =batch_thread(self.thread_result,self.batch_size_,seq_len)#, self.datasets
107 |     
108 |     self.datasetsize=self.batch_advancer.GetDatasetSize()
109 |     
110 |     self.dispatch_worker()
111 |     self.join_worker()
112 | 
113 | 
114 |   def GetBatch(self):
115 |     #self.batch_data_  = np.zeros((self.batch_size_, 3, self.seq_length_, 112, 112), dtype=np.float32)
116 |     if self.thread is not None:
117 |       self.join_worker() 
118 |       
119 | #     self.batch_data_=self.thread_result['data']
120 | #     self.batch_label_=self.thread_result['label']
121 | 
122 |     self.batch_data_=self.thread_result['data']
123 |     self.batch_label_= self.thread_result['label']
124 |     self.batch_index_= self.thread_result['index']
125 |         
126 |     self.dispatch_worker()
127 |     return self.batch_data_, self.batch_label_,self.batch_index_
128 | 
129 | 
130 |     
131 | 
132 | 
133 |   def dispatch_worker(self):
134 |     assert self.thread is None
135 |     self.thread = Thread(target=self.batch_advancer)
136 |     self.thread.start()
137 | 
138 |   def join_worker(self):
139 |     assert self.thread is not None
140 |     self.thread.join()
141 |     self.thread = None
142 |     
143 |   def GetDatasetSize(self):
144 |     return self.datasetsize
145 | 
146 | 
147 | 
148 | 
149 | 
def main():
  """Smoke test: build a DataHandler and print the size of its dataset."""
  # DataHandler.__init__ accepts only (batch_size, seq_len); passing a third
  # dataset-name argument raised TypeError.
  dh = DataHandler(10, 30)
  # Call the accessor; printing `dh.GetDatasetSize` only showed the bound method.
  print (dh.GetDatasetSize())
153 | #  
154 | #   x,y,i = dh.GetBatch()
155 | #   print (x.shape)
156 | #   print (y[0:3],x[0,0,0],x[1,0,0],x[0,1,0])
157 | #   x,y,i = dh.GetBatch()
158 | #   #print (x[0,0],y)  
159 | #   print (y,x[0,0,0])
160 | #   x,y,i = dh.GetBatch()
161 | #   #print (x[0,0],y)
162 | #   print (y,x[0,0,0])
163 | #   x,y,i = dh.GetBatch()
164 | #   #print (x[0,0],y)    
165 | #   print (y,x[0,0,0])
166 | # #   exit()
167 | 
# Run the smoke test only when this file is executed as a script.
if __name__ == '__main__':
  main()
170 | 
171 | 


--------------------------------------------------------------------------------
/cPTB/reader.py:
--------------------------------------------------------------------------------
  1 | # This file is adapted from the tool provided with Tensorflow for
  2 | # reading the Penn Treebank dataset. The original copyright notice is
  3 | # provided below.
  4 | #
  5 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | # ==============================================================================
 19 | #from pylearn2.scripts.tutorials.grbm_smd.make_dataset import train_pkl_path
 20 | 
 21 | 
 22 | """Utilities for training on the Hutter Prize and PTB datasets."""
 23 | from __future__ import absolute_import
 24 | from __future__ import division
 25 | from __future__ import print_function
 26 | 
 27 | import collections
 28 | import os
 29 | 
 30 | import numpy as np
 31 | 
 32 | 
 33 | def _read_symbols(filename):
 34 |   with open(filename, "r") as f:
 35 |     return f.read()
 36 | 
 37 | 
 38 | def _read_words(filename):
 39 |   with open(filename, "r") as f:
 40 |     return f.read().decode("utf-8").replace("\n", "<eos>").split()
 41 | 
 42 | 
 43 | def _build_vocab(filename):
 44 |   data = _read_words(filename)
 45 | 
 46 |   counter = collections.Counter(data)
 47 |   count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
 48 | 
 49 |   words, _ = list(zip(*count_pairs))
 50 |   word_to_id = dict(zip(words, range(len(words))))
 51 | 
 52 |   return word_to_id
 53 | 
 54 | 
 55 | def _file_to_word_ids(filename, word_to_id):
 56 |   data = _read_words(filename)
 57 |   return [word_to_id[word] for word in data if word in word_to_id]
 58 | 
 59 | 
 60 | def hutter_raw_data(data_path=None, num_test_symbols=5000000):
 61 |   """Load raw data from data directory "data_path".
 62 | 
 63 |   The raw Hutter prize data is at:
 64 |   http://mattmahoney.net/dc/enwik8.zip
 65 | 
 66 |   Args:
 67 |     data_path: string path to the directory where simple-examples.tgz has
 68 |       been extracted.
 69 |     num_test_symbols: number of symbols at the end that make up the test set
 70 | 
 71 |   Returns:
 72 |     tuple (train_data, valid_data, test_data, unique)
 73 |     where each of the data objects can be passed to hutter_iterator.
 74 |   """
 75 | 
 76 |   data_path = os.path.join(data_path, "enwik8")
 77 | 
 78 |   raw_data = _read_symbols(data_path)
 79 |   raw_data = np.fromstring(raw_data, dtype=np.uint8)
 80 |   unique, data = np.unique(raw_data, return_inverse=True)
 81 |   train_data = data[: -2 * num_test_symbols]
 82 |   valid_data = data[-2 * num_test_symbols: -num_test_symbols]
 83 |   test_data = data[-num_test_symbols:]
 84 |   return train_data, valid_data, test_data, unique
 85 | 
 86 | 
 87 | def ptb_raw_data(data_path=None,filename='ptb.'):
 88 |   """Load PTB raw data from data directory "data_path".
 89 | 
 90 |   Reads PTB text files, converts strings to integer ids,
 91 |   and performs mini-batching of the inputs.
 92 | 
 93 |   The PTB dataset comes from Tomas Mikolov's webpage:
 94 | 
 95 |   http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
 96 | 
 97 |   Args:
 98 |     data_path: string path to the directory where simple-examples.tgz has
 99 |       been extracted.
100 | 
101 |   Returns:
102 |     tuple (train_data, valid_data, test_data, vocabulary)
103 |     where each of the data objects can be passed to PTBIterator.
104 |   """
105 | 
106 |   train_path = os.path.join(data_path, filename+'train.txt')
107 |   valid_path = os.path.join(data_path, filename+'valid.txt')
108 |   test_path = os.path.join(data_path, filename+'test.txt')
109 |   #print (train_path)
110 | 
111 |   word_to_id = _build_vocab(train_path)
112 |   train_data = _file_to_word_ids(train_path, word_to_id)
113 |   valid_data = _file_to_word_ids(valid_path, word_to_id)
114 |   test_data = _file_to_word_ids(test_path, word_to_id)
115 |   vocabulary = len(word_to_id)
116 | #   save_name='ptb_char'
117 |   print ('voc',vocabulary)
118 | #   np.savez(save_name, train_data, valid_data, test_data, vocabulary)
119 |   return train_data, valid_data, test_data, vocabulary
120 | 
121 | 
def data_iterator(raw_data, batch_size, num_steps):
  """Iterate on the raw Hutter prize data or the raw PTB data.

  The data is cut into batch_size consecutive rows of equal length (leftover
  symbols at the end are dropped), and minibatches are read along these rows.

  Args:
    raw_data: one of the raw data outputs from hutter_raw_data or ptb_raw_data.
    batch_size: int, the batch size.
    num_steps: int, the number of unrolls.

  Yields:
    Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
    The second element of the tuple is the same data time-shifted to the
    right by one.

  Raises:
    ValueError: if batch_size or num_steps are too high. Because this is a
      generator, the error surfaces when iteration starts.
  """
  raw_data = np.array(raw_data, dtype=np.int32)

  batch_len = len(raw_data) // batch_size
  # Row i holds raw_data[batch_len * i : batch_len * (i + 1)].
  data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)

  epoch_size = (batch_len - 1) // num_steps

  # `<= 0`, not `== 0`: with fewer symbols than rows batch_len is 0 and the
  # floor division gives -1, which used to slip past the check and silently
  # produce no batches.
  if epoch_size <= 0:
    raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

  for i in range(epoch_size):
    x = data[:, i*num_steps:(i+1)*num_steps]
    y = data[:, i*num_steps+1:(i+1)*num_steps+1]
    yield (x, y)
158 | 
159 | #ptb_raw_data('data/')
160 | 


--------------------------------------------------------------------------------
/wordPTB/reader.py:
--------------------------------------------------------------------------------
  1 | # This file is adapted from the tool provided with Tensorflow for
  2 | # reading the Penn Treebank dataset. The original copyright notice is
  3 | # provided below.
  4 | #
  5 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | # ==============================================================================
 19 | #from pylearn2.scripts.tutorials.grbm_smd.make_dataset import train_pkl_path
 20 | 
 21 | 
 22 | """Utilities for training on the Hutter Prize and PTB datasets."""
 23 | from __future__ import absolute_import
 24 | from __future__ import division
 25 | from __future__ import print_function
 26 | 
 27 | import collections
 28 | import os
 29 | 
 30 | import numpy as np
 31 | 
 32 | 
 33 | def _read_symbols(filename):
 34 |   with open(filename, "r") as f:
 35 |     return f.read()
 36 | 
 37 | 
 38 | def _read_words(filename):
 39 |   with open(filename, "r") as f:
 40 |     return f.read().decode("utf-8").replace("\n", "<eos>").split()
 41 | 
 42 | 
 43 | def _build_vocab(filename):
 44 |   data = _read_words(filename)
 45 | 
 46 |   counter = collections.Counter(data)
 47 |   count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
 48 | 
 49 |   words, _ = list(zip(*count_pairs))
 50 |   word_to_id = dict(zip(words, range(len(words))))
 51 | 
 52 |   return word_to_id
 53 | 
 54 | 
 55 | def _file_to_word_ids(filename, word_to_id):
 56 |   data = _read_words(filename)
 57 |   return [word_to_id[word] for word in data if word in word_to_id]
 58 | 
 59 | 
 60 | def hutter_raw_data(data_path=None, num_test_symbols=5000000):
 61 |   """Load raw data from data directory "data_path".
 62 | 
 63 |   The raw Hutter prize data is at:
 64 |   http://mattmahoney.net/dc/enwik8.zip
 65 | 
 66 |   Args:
 67 |     data_path: string path to the directory where simple-examples.tgz has
 68 |       been extracted.
 69 |     num_test_symbols: number of symbols at the end that make up the test set
 70 | 
 71 |   Returns:
 72 |     tuple (train_data, valid_data, test_data, unique)
 73 |     where each of the data objects can be passed to hutter_iterator.
 74 |   """
 75 | 
 76 |   data_path = os.path.join(data_path, "enwik8")
 77 | 
 78 |   raw_data = _read_symbols(data_path)
 79 |   raw_data = np.fromstring(raw_data, dtype=np.uint8)
 80 |   unique, data = np.unique(raw_data, return_inverse=True)
 81 |   train_data = data[: -2 * num_test_symbols]
 82 |   valid_data = data[-2 * num_test_symbols: -num_test_symbols]
 83 |   test_data = data[-num_test_symbols:]
 84 |   return train_data, valid_data, test_data, unique
 85 | 
 86 | 
 87 | def ptb_raw_data(data_path=None,filename='ptb.'):
 88 |   """Load PTB raw data from data directory "data_path".
 89 | 
 90 |   Reads PTB text files, converts strings to integer ids,
 91 |   and performs mini-batching of the inputs.
 92 | 
 93 |   The PTB dataset comes from Tomas Mikolov's webpage:
 94 | 
 95 |   http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
 96 | 
 97 |   Args:
 98 |     data_path: string path to the directory where simple-examples.tgz has
 99 |       been extracted.
100 | 
101 |   Returns:
102 |     tuple (train_data, valid_data, test_data, vocabulary)
103 |     where each of the data objects can be passed to PTBIterator.
104 |   """
105 | 
106 |   train_path = os.path.join(data_path, filename+'train.txt')
107 |   valid_path = os.path.join(data_path, filename+'valid.txt')
108 |   test_path = os.path.join(data_path, filename+'test.txt')
109 |   #print (train_path)
110 | 
111 |   word_to_id = _build_vocab(train_path)
112 |   train_data = _file_to_word_ids(train_path, word_to_id)
113 |   valid_data = _file_to_word_ids(valid_path, word_to_id)
114 |   test_data = _file_to_word_ids(test_path, word_to_id)
115 |   vocabulary = len(word_to_id)
116 | #   save_name='ptb_char'
117 |   print ('voc',vocabulary)
118 | #   np.savez(save_name, train_data, valid_data, test_data, vocabulary)
119 |   return train_data, valid_data, test_data, vocabulary
120 | 
121 | 
def data_iterator(raw_data, batch_size, num_steps):
  """Iterate on the raw Hutter prize data or the raw PTB data.

  The data is cut into batch_size consecutive rows of equal length (leftover
  symbols at the end are dropped), and minibatches are read along these rows.

  Args:
    raw_data: one of the raw data outputs from hutter_raw_data or ptb_raw_data.
    batch_size: int, the batch size.
    num_steps: int, the number of unrolls.

  Yields:
    Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
    The second element of the tuple is the same data time-shifted to the
    right by one.

  Raises:
    ValueError: if batch_size or num_steps are too high. Because this is a
      generator, the error surfaces when iteration starts.
  """
  raw_data = np.array(raw_data, dtype=np.int32)

  batch_len = len(raw_data) // batch_size
  # Row i holds raw_data[batch_len * i : batch_len * (i + 1)].
  data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)

  epoch_size = (batch_len - 1) // num_steps

  # `<= 0`, not `== 0`: with fewer symbols than rows batch_len is 0 and the
  # floor division gives -1, which used to slip past the check and silently
  # produce no batches.
  if epoch_size <= 0:
    raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

  for i in range(epoch_size):
    x = data[:, i*num_steps:(i+1)*num_steps]
    y = data[:, i*num_steps+1:(i+1)*num_steps+1]
    yield (x, y)
158 | 
159 | #ptb_raw_data('data/')
160 | 


--------------------------------------------------------------------------------
/cPTB/bn_eachstep_withdrop_timefirst.py:
--------------------------------------------------------------------------------
  1 | import theano
  2 | import theano.tensor as T
  3 | 
  4 | from lasagne import init
  5 | from lasagne import nonlinearities
  6 | 
  7 | from lasagne.layers import Layer
  8 | from lasagne.layers import DropoutLayer
class BatchNormLayer(Layer):
    """Batch normalization layer that also applies dropout to its input.

    The learnable `beta`/`gamma` are shaped by the input dimensions that are
    not listed in `axes`. The batch statistics and the stored `mean`/`inv_std`
    are reduced over `mean_axes` only, so they keep every other dimension.
    In `get_output_for` the statistics are taken from the input before
    dropout; the dropped-out input is what gets normalized.

    NOTE(review): the file name suggests a time-first input layout with
    per-time-step statistics -- confirm against the callers.
    """
    def __init__(self, incoming, axes='auto', droprate=0.2, epsilon=1e-4, alpha=0.1,sparsity=1.0,
                 beta=init.Constant(0), gamma=init.Constant(1),
                 mean=init.Constant(0), inv_std=init.Constant(1), **kwargs):
        """Create the parameters and the internal dropout layer.

        Args:
            incoming: passed to `Layer.__init__`.
            axes: axes broadcast for `beta`/`gamma`. 'auto' selects every
                axis except the second one; an int is wrapped in a tuple.
            droprate: `p` of the internal `DropoutLayer`.
            epsilon: added to the variance before the square root.
            alpha: weight of the current batch statistics in the update of
                the stored statistics.
            sparsity: when different from 1, the batch statistics are
                rescaled with it in `get_output_for`.
            beta, gamma: initializers of the shift/scale parameters, or None
                to leave the parameter out.
            mean, inv_std: initializers of the stored statistics.
            **kwargs: forwarded to `Layer.__init__` and to `DropoutLayer`.

        Raises:
            ValueError: if an input dimension outside `axes` has size None.
        """
        super(BatchNormLayer, self).__init__(incoming, **kwargs)

        if axes == 'auto':
            # default: normalize over all but the second axis
            axes = (0,) + tuple(range(2, len(self.input_shape)))
        elif isinstance(axes, int):
            axes = (axes,)
        self.axes = axes
        # Statistics are reduced over a single axis: the only one given, or
        # the second entry of `axes` when several are given.
        if len(axes)==1:
          self.mean_axes=self.axes
        else:
          self.mean_axes=(axes[1],)

        self.epsilon = epsilon
        self.alpha = alpha

        # create parameters, ignoring all dimensions in axes
        shape = [size for axis, size in enumerate(self.input_shape)
                 if axis not in self.axes]
        # the stored statistics keep every dimension outside mean_axes
        meanshape = [size for axis, size in enumerate(self.input_shape)
                 if axis not in self.mean_axes]
        # NOTE(review): only `shape` is validated here, `meanshape` is not.
        if any(size is None for size in shape):
            raise ValueError("BatchNormLayer needs specified input sizes for "
                             "all axes not normalized over.")
        if beta is None:
            self.beta = None
        else:
            self.beta = self.add_param(beta, shape, 'beta',
                                       trainable=True, regularizable=False)
        if gamma is None:
            self.gamma = None
        else:
            self.gamma = self.add_param(gamma, shape, 'gamma',
                                        trainable=True, regularizable=True)
        self.mean = self.add_param(mean, meanshape, 'mean',
                                   trainable=False, regularizable=False)
        self.inv_std = self.add_param(inv_std, meanshape, 'inv_std',
                                      trainable=False, regularizable=False)
        #print('here',len(self.input_shape))
        self.sparsity=sparsity
        # Internal dropout layer: for 3-D input it is configured with
        # shared_axes=(0,1), otherwise with shared_axes=(0,).
        if len(self.input_shape)==3:
          self.dropout=DropoutLayer((self.input_shape[0],self.input_shape[1],self.input_shape[2]), p=droprate, shared_axes=(0,1), **kwargs)
        else:
          self.dropout=DropoutLayer((self.input_shape[0],self.input_shape[1]), p=droprate, shared_axes=(0,), **kwargs)

    def get_output_for(self, input, deterministic=False,
                       batch_norm_use_averages=None,
                       batch_norm_update_averages=None, **kwargs):
        """Apply dropout to `input` and normalize the result.

        Args:
            input: symbolic input expression.
            deterministic: passed to the dropout layer; also the default for
                `batch_norm_use_averages` and, negated, for
                `batch_norm_update_averages`.
            batch_norm_use_averages: if true, normalize with the stored
                statistics instead of the statistics of `input`.
            batch_norm_update_averages: if true, attach updates of the stored
                statistics to the graph.

        Returns:
            `(dropout(input) - mean) * (gamma * inv_std) + beta`.
        """
        # Batch statistics of the input as received (before dropout).
        if self.sparsity==1:
          input_mean = input.mean(self.mean_axes)
          input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) + self.epsilon))
        else:
          # Mean and variance rescaled by 1/sparsity, with a
          # (1-sparsity)*mean^2 correction on the variance.
          # NOTE(review): presumably compensates for input of which only a
          # `sparsity` fraction is non-zero -- confirm the derivation.
          input_mean = input.mean(self.mean_axes)*(1.0/self.sparsity)
          input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) *(1.0/self.sparsity) -(1-self.sparsity)*T.sqr(input_mean) + self.epsilon))    

        # Decide whether to use the stored averages or mini-batch statistics
        if batch_norm_use_averages is None:
            batch_norm_use_averages = deterministic
        use_averages = batch_norm_use_averages

        if use_averages:
            mean = self.mean
            inv_std = self.inv_std
        else:
            mean = input_mean
            inv_std = input_inv_std

        # Decide whether to update the stored averages
        if batch_norm_update_averages is None:
            batch_norm_update_averages = not deterministic
        update_averages = batch_norm_update_averages

        if update_averages:
            # Trick: To update the stored statistics, we create memory-aliased
            # clones of the stored statistics:
            running_mean = theano.clone(self.mean, share_inputs=False)
            running_inv_std = theano.clone(self.inv_std, share_inputs=False)
            # set a default update for them:
            running_mean.default_update = ((1 - self.alpha) * running_mean +
                                           self.alpha * input_mean)
            running_inv_std.default_update = ((1 - self.alpha) *
                                              running_inv_std +
                                              self.alpha * input_inv_std)
            # and make sure they end up in the graph without participating in
            # the computation (this way their default_update will be collected
            # and applied, but the computation will be optimized away):
            mean += 0 * running_mean
            inv_std += 0 * running_inv_std

        # prepare dimshuffle pattern inserting broadcastable axes as needed
        param_axes = iter(range(input.ndim - len(self.axes)))
        pattern = ['x' if input_axis in self.axes
                   else next(param_axes)
                   for input_axis in range(input.ndim)]

        # apply dimshuffle pattern to all parameters
        beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
        gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
        
        # the statistics get their own pattern: only mean_axes are broadcast
        mean_param_axes = iter(range(input.ndim - len(self.mean_axes)))
        mean_pattern = ['x' if input_axis in self.mean_axes
                   else next(mean_param_axes)
                   for input_axis in range(input.ndim)]        
        mean = mean.dimshuffle(mean_pattern)
        inv_std = inv_std.dimshuffle(mean_pattern)
        
        # dropout comes after the statistics were computed and before normalizing
        input=self.dropout.get_output_for(input,deterministic=deterministic)

        # normalize
        normalized = (input - mean) * (gamma * inv_std) + beta
        return normalized
124 | 
125 | 


--------------------------------------------------------------------------------
/wordPTB/bn_eachstep_withdrop_timefirst.py:
--------------------------------------------------------------------------------
  1 | import theano
  2 | import theano.tensor as T
  3 | 
  4 | from lasagne import init
  5 | from lasagne import nonlinearities
  6 | 
  7 | from lasagne.layers import Layer
  8 | from lasagne.layers import DropoutLayer
class BatchNormLayer(Layer):
    """Batch normalization layer that also applies dropout to its input.

    The learnable `beta`/`gamma` are shaped by the input dimensions that are
    not listed in `axes`. The batch statistics and the stored `mean`/`inv_std`
    are reduced over `mean_axes` only, so they keep every other dimension.
    In `get_output_for` the statistics are taken from the input before
    dropout; the dropped-out input is what gets normalized.

    NOTE(review): the file name suggests a time-first input layout with
    per-time-step statistics -- confirm against the callers.
    """
    def __init__(self, incoming, axes='auto', droprate=0.2, epsilon=1e-4, alpha=0.1,sparsity=1.0,
                 beta=init.Constant(0), gamma=init.Constant(1),
                 mean=init.Constant(0), inv_std=init.Constant(1), **kwargs):
        """Create the parameters and the internal dropout layer.

        Args:
            incoming: passed to `Layer.__init__`.
            axes: axes broadcast for `beta`/`gamma`. 'auto' selects every
                axis except the second one; an int is wrapped in a tuple.
            droprate: `p` of the internal `DropoutLayer`.
            epsilon: added to the variance before the square root.
            alpha: weight of the current batch statistics in the update of
                the stored statistics.
            sparsity: when different from 1, the batch statistics are
                rescaled with it in `get_output_for`.
            beta, gamma: initializers of the shift/scale parameters, or None
                to leave the parameter out.
            mean, inv_std: initializers of the stored statistics.
            **kwargs: forwarded to `Layer.__init__` and to `DropoutLayer`.

        Raises:
            ValueError: if an input dimension outside `axes` has size None.
        """
        super(BatchNormLayer, self).__init__(incoming, **kwargs)

        if axes == 'auto':
            # default: normalize over all but the second axis
            axes = (0,) + tuple(range(2, len(self.input_shape)))
        elif isinstance(axes, int):
            axes = (axes,)
        self.axes = axes
        # Statistics are reduced over a single axis: the only one given, or
        # the second entry of `axes` when several are given.
        if len(axes)==1:
          self.mean_axes=self.axes
        else:
          self.mean_axes=(axes[1],)

        self.epsilon = epsilon
        self.alpha = alpha

        # create parameters, ignoring all dimensions in axes
        shape = [size for axis, size in enumerate(self.input_shape)
                 if axis not in self.axes]
        # the stored statistics keep every dimension outside mean_axes
        meanshape = [size for axis, size in enumerate(self.input_shape)
                 if axis not in self.mean_axes]
        # NOTE(review): only `shape` is validated here, `meanshape` is not.
        if any(size is None for size in shape):
            raise ValueError("BatchNormLayer needs specified input sizes for "
                             "all axes not normalized over.")
        if beta is None:
            self.beta = None
        else:
            self.beta = self.add_param(beta, shape, 'beta',
                                       trainable=True, regularizable=False)
        if gamma is None:
            self.gamma = None
        else:
            self.gamma = self.add_param(gamma, shape, 'gamma',
                                        trainable=True, regularizable=True)
        self.mean = self.add_param(mean, meanshape, 'mean',
                                   trainable=False, regularizable=False)
        self.inv_std = self.add_param(inv_std, meanshape, 'inv_std',
                                      trainable=False, regularizable=False)
        #print('here',len(self.input_shape))
        self.sparsity=sparsity
        # Internal dropout layer: for 3-D input it is configured with
        # shared_axes=(0,1), otherwise with shared_axes=(0,).
        if len(self.input_shape)==3:
          self.dropout=DropoutLayer((self.input_shape[0],self.input_shape[1],self.input_shape[2]), p=droprate, shared_axes=(0,1), **kwargs)
        else:
          self.dropout=DropoutLayer((self.input_shape[0],self.input_shape[1]), p=droprate, shared_axes=(0,), **kwargs)

    def get_output_for(self, input, deterministic=False,
                       batch_norm_use_averages=None,
                       batch_norm_update_averages=None, **kwargs):
        """Apply dropout to `input` and normalize the result.

        Args:
            input: symbolic input expression.
            deterministic: passed to the dropout layer; also the default for
                `batch_norm_use_averages` and, negated, for
                `batch_norm_update_averages`.
            batch_norm_use_averages: if true, normalize with the stored
                statistics instead of the statistics of `input`.
            batch_norm_update_averages: if true, attach updates of the stored
                statistics to the graph.

        Returns:
            `(dropout(input) - mean) * (gamma * inv_std) + beta`.
        """
        # Batch statistics of the input as received (before dropout).
        if self.sparsity==1:
          input_mean = input.mean(self.mean_axes)
          input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) + self.epsilon))
        else:
          # Mean and variance rescaled by 1/sparsity, with a
          # (1-sparsity)*mean^2 correction on the variance.
          # NOTE(review): presumably compensates for input of which only a
          # `sparsity` fraction is non-zero -- confirm the derivation.
          input_mean = input.mean(self.mean_axes)*(1.0/self.sparsity)
          input_inv_std = T.inv(T.sqrt(input.var(self.mean_axes) *(1.0/self.sparsity) -(1-self.sparsity)*T.sqr(input_mean) + self.epsilon))    

        # Decide whether to use the stored averages or mini-batch statistics
        if batch_norm_use_averages is None:
            batch_norm_use_averages = deterministic
        use_averages = batch_norm_use_averages

        if use_averages:
            mean = self.mean
            inv_std = self.inv_std
        else:
            mean = input_mean
            inv_std = input_inv_std

        # Decide whether to update the stored averages
        if batch_norm_update_averages is None:
            batch_norm_update_averages = not deterministic
        update_averages = batch_norm_update_averages

        if update_averages:
            # Trick: To update the stored statistics, we create memory-aliased
            # clones of the stored statistics:
            running_mean = theano.clone(self.mean, share_inputs=False)
            running_inv_std = theano.clone(self.inv_std, share_inputs=False)
            # set a default update for them:
            running_mean.default_update = ((1 - self.alpha) * running_mean +
                                           self.alpha * input_mean)
            running_inv_std.default_update = ((1 - self.alpha) *
                                              running_inv_std +
                                              self.alpha * input_inv_std)
            # and make sure they end up in the graph without participating in
            # the computation (this way their default_update will be collected
            # and applied, but the computation will be optimized away):
            mean += 0 * running_mean
            inv_std += 0 * running_inv_std

        # prepare dimshuffle pattern inserting broadcastable axes as needed
        param_axes = iter(range(input.ndim - len(self.axes)))
        pattern = ['x' if input_axis in self.axes
                   else next(param_axes)
                   for input_axis in range(input.ndim)]

        # apply dimshuffle pattern to all parameters
        beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
        gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
        
        # the statistics get their own pattern: only mean_axes are broadcast
        mean_param_axes = iter(range(input.ndim - len(self.mean_axes)))
        mean_pattern = ['x' if input_axis in self.mean_axes
                   else next(mean_param_axes)
                   for input_axis in range(input.ndim)]        
        mean = mean.dimshuffle(mean_pattern)
        inv_std = inv_std.dimshuffle(mean_pattern)
        
        # dropout comes after the statistics were computed and before normalizing
        input=self.dropout.get_output_for(input,deterministic=deterministic)

        # normalize
        normalized = (input - mean) * (gamma * inv_std) + beta
        return normalized
124 | 
125 | 


--------------------------------------------------------------------------------
/action recognition/Indrnn_action_train.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import sys
  3 | import argparse
  4 | import os
  5 | 
  6 | import time
  7 | 
  8 | import lasagne
  9 | import theano
 10 | import numpy as np
 11 | import theano.tensor as T
 12 | 
 13 | from lasagne.layers import InputLayer,ReshapeLayer,DimshuffleLayer,Gate,BatchNormLayer,DenseLayer,ElemwiseSumLayer
 14 | 
 15 | from lasagne.layers import ConcatLayer,NonlinearityLayer,DropoutLayer
 16 | from lasagne.layers.normalization import batch_norm
 17 | from lasagne.nonlinearities import softmax, rectify,tanh
 18 | from lasagne.init import Uniform,Normal,HeNormal
 19 | 
 20 | import opts
 21 | from Indrnn_action_network import build_indrnn_network as build_rnn_network
 22 | import Indrnn_action_network
 23 | 
# Raise Python's recursion limit for the rest of the script.
sys.setrecursionlimit(10000)

# Command-line options are declared in opts.train_opts and echoed for the log.
parser = argparse.ArgumentParser(description='lstm action')
opts.train_opts(parser)
args = parser.parse_args()
print (args)  


# Settings taken from the command line plus fixed constants of this script.
batch_size = args.batch_size
seq_len=args.seq_len
outputclass=60  # width of the per-sample score rows allocated for the final test
indim=50#150
lr=args.lr
opti=lasagne.updates.adam  # update rule used to build the training updates
U_bound=Indrnn_action_network.U_bound  # clipping bound for 'hidden_to_hidden.W' during training
 39 | 
# Symbolic inputs: a 4-D input tensor and an integer label vector.
X_sym = T.tensor4('inputs')#,dtype=theano.config.floatX)
y_sym = T.ivector('label')#,dtype=theano.config.floatX)   
learn_net=build_rnn_network(X_sym)
# Training graph (deterministic=False): mean categorical cross-entropy.
prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=False)
loss = lasagne.objectives.categorical_crossentropy(prediction, y_sym).mean()
if args.use_weightdecay_nohiddenW:
  # L2 penalty on every regularizable parameter except those named
  # 'hidden_to_hidden.W'.
  params = lasagne.layers.get_all_params(learn_net['out'], regularizable=True)
  for para in params:
    if para.name!='hidden_to_hidden.W':
      loss += args.decayrate *lasagne.regularization.apply_penalty(para, lasagne.regularization.l2)#*T.clip(T.abs_(para)-1,0,100))  
acc=T.mean(lasagne.objectives.categorical_accuracy(prediction, y_sym, top_k=1),dtype=theano.config.floatX)

# `params` is rebound to the trainable parameters; the training loop also
# iterates over this list when it clips the recurrent weights.
params = lasagne.layers.get_all_params(learn_net['out'], trainable=True)
# The learning rate is a symbolic input so it can be lowered without recompiling.
learning_ratetrain = T.scalar(name='learning_ratetrain',dtype=theano.config.floatX)
grads = theano.grad(loss, params)
updates = opti( grads, params, learning_rate=learning_ratetrain)
print('Compiling')
train_fn = theano.function([X_sym, y_sym,learning_ratetrain], [loss,acc], updates=updates)

# Evaluation with deterministic=True but batch_norm_use_averages=False.
test_prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=True,batch_norm_use_averages=False)
test_acc=T.mean(lasagne.objectives.categorical_accuracy(test_prediction, y_sym, top_k=1),dtype=theano.config.floatX)
test_fn = theano.function([X_sym, y_sym], [test_acc,test_prediction]) 

# Evaluation with deterministic=True and the default batch-norm flags.
bn_test_prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=True)#,batch_norm_use_averages=True
bn_test_acc=T.mean(lasagne.objectives.categorical_accuracy(bn_test_prediction, y_sym, top_k=1),dtype=theano.config.floatX)
bn_test_fn = theano.function([X_sym, y_sym], [bn_test_acc,bn_test_prediction])       
 66 |   
 67 | 
# Current learning rate; the training loop lowers it over time.
learning_rate=np.float32(lr)
# NOTE(review): train_datasets/test_dataset are not passed to the data
# handlers created below -- check whether they are still needed.
if args.test_CV:
  train_datasets='train_CV_ntus'
  test_dataset='test_CV_ntus'
else:
  train_datasets='train_ntus'
  test_dataset='test_ntus'   
from data_reader_numpy_witheval import DataHandler_train,DataHandler_eval  
from data_reader_numpy_test import DataHandler as testDataHandler
dh_train = DataHandler_train(batch_size,seq_len, args.rotation_aug)
dh_eval = DataHandler_eval(batch_size,seq_len)
dh_test= testDataHandler(batch_size,seq_len)
# Batches per pass over each set, rounded up (the +0.0 forces float division).
num_train_batches=int(np.ceil(dh_train.GetDatasetSize()/(batch_size+0.0)))
num_eval_batches=int(np.ceil(dh_eval.GetDatasetSize()/(batch_size+0.0)))
num_test_batches=int(np.ceil(dh_test.GetDatasetSize()/(batch_size+0.0)))
# Label file of the test set; the test_CV option selects the *_CV_* file.
labelname='test_ntus_label.npy'
if args.test_CV:
  labelname='test_CV_ntus_label.npy'
testlabels=np.load(labelname)
 87 | 
 88 | aveloss=0
 89 | aveacc=0
 90 | lastacc=0
 91 | dispFreq=20
 92 | testnos=20
 93 | stepcount=0   
 94 | patience=0
 95 | patienceThre=10 
 96 | while True:
 97 |   x, y = dh_train.GetBatch()
 98 |   loss,acc=train_fn(x, y,learning_rate)
 99 |   stepcount+=1
100 |   aveloss+=loss
101 |   aveacc+=acc
102 |   
103 |   if args.constrain_U:
104 |     for para in params:
105 |       if para.name=='hidden_to_hidden.W':
106 |         para.set_value(np.clip(para.get_value(),-U_bound,U_bound))
107 |   
108 |   if np.isnan(loss):
109 |     print ('NaN detected in cost')
110 |     assert(2==3)
111 |   if np.isinf(loss):
112 |     print ('INF detected in cost')
113 |     assert(2==3) 
114 | 
115 |   if np.mod(stepcount, dispFreq) == 0:
116 |     aveloss=aveloss/dispFreq
117 |     aveacc=aveacc/dispFreq
118 |     print("lr",learning_rate,"trainingerror",aveloss,"aveacc",aveacc)
119 |     aveloss=0
120 |     aveacc=0
121 |        
122 |   if np.mod(stepcount, num_train_batches)==0:   
123 |     stepcount=0   
124 |     aveacc=0
125 |     eval_batches=num_eval_batches*args.eval_fold
126 |     for testi in range(eval_batches):
127 |       x, y = dh_eval.GetBatch()
128 |       test_acc_top1,_=test_fn(x, y)      
129 |       aveacc+=test_acc_top1   
130 |     bn_aveacc=0
131 |     for testi in range(eval_batches):
132 |       x, y = dh_eval.GetBatch()
133 |       bn_test_acc_top1,_=bn_test_fn(x, y)      
134 |       bn_aveacc+=bn_test_acc_top1         
135 |       
136 |     print ('evalacc,bn_evalacc', aveacc/eval_batches, bn_aveacc/eval_batches) 
137 |     epocacc=bn_aveacc/eval_batches
138 |     aveacc=0
139 |     
140 |     if (epocacc >lastacc):# and itericount>=0.8*rateschedulecount
141 |       best_para=lasagne.layers.get_all_param_values(learn_net['out'])  
142 |       lastacc=epocacc
143 |       patience=0
144 |     elif patience>patienceThre:
145 |       #learning_rate=np.float32(learning_rate*0.2)
146 |       print ('learning rate',learning_rate)
147 |       lasagne.layers.set_all_param_values(learn_net['out'], best_para)
148 |       patience=0
149 |       learning_rate=np.float32(learning_rate*0.1)    
150 |       if learning_rate<args.end_rate:
151 |         break
152 |     else:
153 |       patience+=1        
154 |     
155 | 
156 | 
# Final test evaluation: the test split is visited `test_no` times and the
# per-sample network outputs are averaged before taking the arg-max. The same
# procedure is run once with test_fn and once with bn_test_fn.
total_testdata = dh_test.GetDatasetSize()
test_no = 10
for report_tag, predict_fn in (('testacc', test_fn), ('bn_testacc', bn_test_fn)):
  total_ave_acc = np.zeros((total_testdata, outputclass))
  aveacc = 0
  for testi in range(num_test_batches * test_no):
    # `index` tells which rows of the accumulator this batch belongs to.
    x, y, index = dh_test.GetBatch()
    test_acc_top1, test_prediction = predict_fn(x, y)
    aveacc += test_acc_top1
    total_ave_acc[index] += test_prediction
  total_ave_acc /= float(test_no)
  top = np.argmax(total_ave_acc, axis=-1)
  eval_acc = np.mean(np.equal(top, testlabels))
  # First number: mean per-batch accuracy; second: accuracy of averaged outputs.
  print (report_tag, aveacc/(test_no*num_test_batches), eval_acc)

# Persist the final parameter values as positional arrays in an .npz archive.
save_name = 'action_indrnn'
np.savez(save_name, *lasagne.layers.get_all_param_values(learn_net['out']))
185 | 


--------------------------------------------------------------------------------
/adding/adding.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import sys
  3 | import argparse
  4 | import os
  5 | 
  6 | import time
  7 | 
  8 | import lasagne
  9 | import theano
 10 | import numpy as np
 11 | import theano.tensor as T
 12 | 
 13 | from lasagne.layers import InputLayer,ReshapeLayer,DimshuffleLayer,Gate
 14 | from lasagne.layers import Conv2DLayer
 15 | from lasagne.layers import Pool2DLayer
 16 | from lasagne.layers import DenseLayer
 17 | 
 18 | from lasagne.layers import ConcatLayer,NonlinearityLayer,DropoutLayer
 19 | from lasagne.layers.normalization import batch_norm
 20 | from lasagne.nonlinearities import softmax, rectify,tanh
 21 | 
 22 | from lasagne.layers import RecurrentLayer,LSTMLayer
 23 | from IndRNN import IndRNNLayer as indrnn
 24 | 
 25 | 
 26 | 
 27 | parser = argparse.ArgumentParser(description='IndRNN solving the adding problem')
 28 | parser.add_argument('--model', type=str, default='indrnn', help='models')
 29 | parser.add_argument('--hidden_units', type=int, default=128, help='humber of hidden units per layer')
 30 | parser.add_argument('--batch_size', type=int, default=32)
 31 | parser.add_argument('--seq_len', type=int, default=500)
 32 | parser.add_argument('--MAG', type=int, default=2)
 33 | parser.add_argument('--act', type=str, default='relu')
 34 | parser.add_argument('--lr', type=np.float32, default=2e-4, help='lr')
 35 | parser.add_argument('--in2hidW', type=np.float32, default=0.001)
 36 | parser.add_argument('--gradclipvalue', type=np.float32, default=10)
 37 | args = parser.parse_args()
 38 | 
 39 | print(args)
 40 | 
 41 | batch_size = args.batch_size
 42 | seq_len=args.seq_len
 43 | hidden_units=args.hidden_units
 44 | feature_size=2
 45 | outputclass=1
 46 | 
 47 | 
 48 | U_bound=pow(args.MAG, 1.0 / seq_len)
 49 | U_lowbound=pow(1.0/args.MAG, 1.0 / seq_len)
 50 | if args.act=='tanh':
 51 |   U_bound=pow(args.MAG/(pow(0.9,seq_len/10.0)), 1.0 / seq_len)
 52 | 
 53 | act=rectify
 54 | if args.act=='tanh':
 55 |   act=tanh
 56 | 
 57 | lr=args.lr
 58 | opti=lasagne.updates.adam#rmsprop
 59 | 
 60 | 
 61 | 
 62 | 
 63 | def build_rnn_network(rnnmodel):
 64 |     net = {}    
 65 |     net['input'] = InputLayer((batch_size, seq_len, feature_size))
 66 |     if rnnmodel==LSTMLayer:
 67 |       net['rnn']=rnnmodel(net['input'],hidden_units,forgetgate=lasagne.layers.Gate(b=lasagne.init.Constant(1.)),only_return_final=True,grad_clipping=args.gradclipvalue )
 68 |     elif act==rectify:
 69 |       net['rnn']=rnnmodel(net['input'],hidden_units,W_in_to_hid=lasagne.init.Normal(args.in2hidW), W_hid_to_hid=lambda shape: np.identity(hidden_units,dtype=np.float32),nonlinearity=act,only_return_final=True,grad_clipping=args.gradclipvalue )
 70 |     elif act==tanh:
 71 |       net['rnn']=rnnmodel(net['input'],hidden_units,W_in_to_hid=lasagne.init.Normal(args.in2hidW), nonlinearity=act,only_return_final=True,grad_clipping=args.gradclipvalue )
 72 |     net['out']=DenseLayer(net['rnn'],outputclass,nonlinearity=None)
 73 |     print (lasagne.layers.get_output_shape(net['out']))
 74 |     return net
 75 | 
 76 | 
 77 |   
 78 | def build_indrnn_network(res_rnnmodel):
 79 |     net = {}    
 80 |     net['input'] = InputLayer((batch_size, seq_len, feature_size))
 81 |     if act==rectify:
 82 |       net['rnn0']=res_rnnmodel(net['input'],hidden_units,W_in_to_hid=lasagne.init.Normal(args.in2hidW), nonlinearity=act,W_hid_to_hid=lasagne.init.Uniform(range=(0,U_bound)),grad_clipping=args.gradclipvalue)
 83 |       net['rnn']=res_rnnmodel(net['rnn0'],hidden_units,W_in_to_hid=lasagne.init.Normal(args.in2hidW), nonlinearity=act,W_hid_to_hid=lasagne.init.Uniform(range=(U_lowbound,U_bound)),only_return_final=True,grad_clipping=args.gradclipvalue)
 84 |     elif act==tanh:
 85 |       net['rnn0']=res_rnnmodel(net['input'],hidden_units,W_in_to_hid=lasagne.init.Normal(args.in2hidW), nonlinearity=act,W_hid_to_hid=lasagne.init.Uniform(range=(U_bound)),grad_clipping=args.gradclipvalue)
 86 |       net['rnn']=res_rnnmodel(net['rnn0'],hidden_units,W_in_to_hid=lasagne.init.Normal(args.in2hidW), nonlinearity=act,W_hid_to_hid=lasagne.init.Uniform(range=(U_bound)),only_return_final=True,grad_clipping=args.gradclipvalue)
 87 |     net['out']=DenseLayer(net['rnn'],outputclass,nonlinearity=None)
 88 |     return net  
 89 |   
 90 | 
 91 | def generate_data(time_steps, n_data):
 92 |     x = np.asarray(np.zeros((time_steps, int(n_data), 2)),
 93 |                    dtype=theano.config.floatX)
 94 | 
 95 |     x[:,:,0] = np.asarray(np.random.uniform(low=0.,
 96 |                                             high=1.,
 97 |                                             size=(time_steps, n_data)),
 98 |                           dtype=theano.config.floatX)
 99 |     
100 |     
101 | 
102 |     inds = np.asarray(np.random.randint(time_steps//2, size=(n_data, 2)))
103 |     inds[:, 1] += time_steps//2  
104 |     
105 |     for i in range(int(n_data)):
106 |         x[inds[i, 0], i, 1] = 1.0
107 |         x[inds[i, 1], i, 1] = 1.0
108 |  
109 |     y = (x[:,:,0] * x[:,:,1]).sum(axis=0)
110 |     y = np.reshape(y, (n_data, 1))
111 |     x=np.transpose(x, (1, 0, 2))
112 | 
113 |     return x, y
114 | 
115 | 
116 |  
# Build the network selected by --model. An unrecognised name is rejected
# here instead of surfacing later as a NameError on `learn_net`.
if args.model=='rnn':
  learn_net=build_rnn_network(RecurrentLayer)
elif args.model=='lstm':
  learn_net=build_rnn_network(LSTMLayer)
elif args.model=='indrnn':
  learn_net=build_indrnn_network(indrnn)
else:
  raise ValueError("unknown --model '%s' (expected 'rnn', 'lstm' or 'indrnn')" % args.model)


# Symbolic inputs: a 3-D float tensor of sequences and a 2-D float matrix of targets.
X_sym = T.tensor3('inputs',dtype=theano.config.floatX)
y_sym = T.matrix('label',dtype=theano.config.floatX)

# Regression objective: mean squared error between network output and target.
prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=False)
loss = lasagne.objectives.squared_error(prediction, y_sym).mean()

params = lasagne.layers.get_all_params(learn_net['out'], trainable=True)

# The learning rate is a symbolic input so it can be changed between steps
# without recompiling.
learning_ratetrain = T.scalar(name='learning_ratetrain',dtype=theano.config.floatX)

grads = theano.grad(loss, params)
#grads = [T.clip(g, -1, 1) for g in grads] 
updates = opti( grads, params, learning_rate=learning_ratetrain)#nesterov_momentum( loss, params, learning_rate=learning_ratetrain)#
print('Compiling')
# train_fn applies one optimiser update and returns [loss]; test_fn only
# evaluates the same loss expression.
train_fn = theano.function([X_sym, y_sym,learning_ratetrain], [loss,], updates=updates)
test_fn = theano.function([X_sym, y_sym], [loss,])
141 |       
142 | 
# ---------------------------------------------------------------------------
# Training loop: a fresh random batch every step. Every 100 steps the mean
# training MSE and the MSE on 1000 newly generated sequences are printed.
# After more than 200 such reports the learning rate is multiplied by 0.1,
# and training stops once it drops below 1e-6.
# ---------------------------------------------------------------------------
learning_rate=np.float32(lr)
tmse=0
lastmse=100
count=0
# The parameter list does not change during training, so the recurrent
# weights that need clipping are collected once instead of on every step.
recurrent_weights=[para for para in params if para.name=='hidden_to_hidden.W']
for batchi in range(1,10000000):
  x,y=generate_data(seq_len, batch_size)

  # Keep the IndRNN recurrent weights inside [-U_bound, U_bound] before each update.
  if args.model=='indrnn':
    for para in recurrent_weights:
      para.set_value(np.clip(para.get_value(),-1*U_bound,U_bound))

  mse,=train_fn(x, y,learning_rate)
  # Abort on a non-finite loss. A real exception is raised instead of
  # `assert(2==3)` because assert statements are removed under `python -O`.
  if np.isnan(mse):
    raise FloatingPointError('NaN detected in cost')
  if np.isinf(mse):
    raise FloatingPointError('INF detected in cost')
  tmse+=mse

  if batchi%100==0:
    print ('training', tmse/100.0)
    count+=1

    x,y=generate_data(seq_len, 1000)
    mse,=test_fn(x, y)
    # The value printed under the 'accuracy:' label is the test MSE.
    print ('accuracy:', mse)

    if (count>200):
      learning_rate=np.float32(learning_rate*0.1)
      print ('learning rate',learning_rate)
      count=0
      if learning_rate<1e-6:
        break

    tmse=0

# Persist the final parameter values as positional arrays in an .npz archive.
save_name=args.model+str(seq_len)
np.savez(save_name, *lasagne.layers.get_all_param_values(learn_net['out']))
185 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Independently Recurrent Neural Networks
  2 | This code is to implement the [IndRNN](https://arxiv.org/abs/1803.04831). It is based on Theano and Lasagne. Please refer to [this one](https://github.com/Sunnydreamrain/IndRNN_pytorch) for pytorch.
  3 | 
  4 | Please cite the following paper if you find it useful.  
  5 | [Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.](https://arxiv.org/abs/1803.04831)
  6 | 
  7 | @inproceedings{li2018independently,  
  8 |   title={Independently recurrent neural network (indrnn): Building A longer and deeper RNN},  
  9 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},  
 10 |   booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},  
 11 |   pages={5457--5466},  
 12 |   year={2018}  
 13 | } 
 14 | 
 15 | # Summary  
 16 | In IndRNNs, neurons in each layer are independent from each other, and the cross-channel information is obtained through stacking multiple layers.  
 17 | Advantages over the RNN and/or LSTM:  
 18 | - The gradient backpropagation through time can be regulated to effectively address the gradient vanishing and exploding problems.  
 19 | - Long-term memory can be kept with IndRNNs to process long sequences. Experiments have demonstrated that an IndRNN can well process sequences over 5000 steps.  
 20 | - An IndRNN can work well with non-saturated function such as relu as activation function and be trained robustly.  
 21 | - Multiple layers of IndRNNs can be efficiently stacked, especially with residual connections over layers, to increase the depth of the network. An example of 21 layer-IndRNN is demonstrated in the experiments.  
- The behaviour of IndRNN neurons in each layer is easy to interpret due to the independence of neurons in each layer.  
 23 | 
 24 | Experiments have demonstrated that IndRNN performs much better than the traditional RNN and LSTM models on various tasks such as the adding problem, sequential MNIST classification, language modelling and action recognition.
 25 | 
 26 | # Usage 
 27 | `IndRNN.py` provides the IndRNN function as described in the paper.  
`IndRNN_onlyrecurrent.py` provides only the recurrent+activation part of the IndRNN function. Therefore, the input must first be processed with a dense connection or a convolution operation. This is useful for adding batch normalization (BN) between the processing of the input and the activation function.
 29 | 
 30 | ### Requirements  
 31 | [Theano](http://deeplearning.net/software/theano/install.html) and [Lasagne](https://lasagne.readthedocs.io/en/latest/user/installation.html) need to be installed first.  
 32 | ```
 33 | pip install --upgrade https://github.com/Theano/Theano/archive/master.zip
 34 | pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip
 35 | ```
 36 | 
 37 | ## For the adding example   
 38 | `python -u adding.py`  
 39 | Different options are available in adding.py.  
 40 | Example: `python -u adding.py --model indrnn --seq_len 100`  
 41 | Example of using GPU: `THEANO_FLAGS='floatX=float32,device=cuda0,mode=FAST_RUN' python -u adding.py --model indrnn --seq_len 100`  
 42 | 
 43 | ## For the Sequential MNIST example  
 44 | `python -u pixelmnist.py --use_bn_afterrnn`   
 45 | or with options: 
`python -u pixelmnist.py --model indrnn --num_layers 6 --hidden_units 128 --use_bn_afterrnn`  
Example of using GPU: `THEANO_FLAGS='floatX=float32,device=cuda0,mode=FAST_RUN' python -u pixelmnist.py --model indrnn --num_layers 6 --hidden_units 128 --use_bn_afterrnn`  
 48 | 
 49 | For this task, the batch normalization (BN) is used. It can be used before the activation function (relu) or after it. In our experiments, it converges faster by putting BN after the activation function.  
 50 | 
 51 | ## For the language modeling example using character-level Penn Treebank (PTB-c)   
 52 | `python -u penntree_charlevel_rernn.py --data_aug --hidden_units 2000 --use_dropout --num_layers 6 --droprate 0.25 --seq_len 150 --use_weightdecay_nohiddenW`  
 53 | `data_aug` here only provides different start for each training epoch to provide stable statistics for BN.  
 54 | or using the residual model:  
 55 | `python -u penntree_charlevel_rernn.py --data_aug --hidden_units 2000 --use_residual --num_layers 11 --use_dropout --droprate 0.3 --seq_len 150 --use_weightdecay_nohiddenW`  
 56 | The example code provides the very basic implementation of residual IndRNN where the number of units in all the IndRNN layers are the same and the left branch is fixed to be 1 without further using weight processing. Other network architectures can be explored which may provide better results.
 57 | 
For this task, output is provided at each time step and can only use the information before the current time step. Therefore, the statistics (mean and variance) of the batch normalization (BN) are obtained for each time step. BN is used before the activation, which is more robust than putting it after the activation. The main reason is that the outputs of all the IndRNN layers at the last time step are further used as the initialization of the next batch. By putting BN before the activation (which is also before the recurrent accumulation), the statistics of BN are more stable than when BN is put after the activation.    
 59 | 
 60 | ## For the language modeling example using word-level Penn Treebank      
Please find details in the directory [wordPTB](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/tree/master/wordPTB).  
 62 | 
 63 | ## For the skeleton-based Action Recognition example  
Please find details in the directory [action recognition](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/tree/master/action%20recognition).  
 65 | 
 66 | # Considerations in implementation  
 67 | ### 1, Initialization of the recurrent weights
 68 | For relu, `Uniform(0,1)` is used to make different neurons keep different kinds of memory. But for problems that only use the output of the last time step such as the adding problem, MNIST classification problem, and action recognition problem, the recurrent weights for the last IndRNN layer (caution: only the last one not all) can be initialized to be all `1` or a proper range `(1-epsilon, 1+epsilon)` where `epsilon` is a small number, since only long-term memory is needed for the output of this layer. Examples are shown in [adding.py](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/blob/master/adding/adding.py#L49).  
 69 | 
 70 | ### 2, Constraint of the recurrent weights  
 71 | For relu, generally it can be set to `[-U_bound, U_bound]` where `U_bound=pow(args.MAG, 1.0 / seq_len)` and `MAG` can be 2 or 10 or others. If the sequence is very long, it can be `[-1, 1]` since it is very close to 1 and the precision of GPU is limited. If the sequence is short such as 20, no constraint is needed. Example of the constraint is shown at [adding.py](https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne/blob/master/adding/adding.py#L150). By the way, this constraint can also be implemented as a weight decay of ||max(0,|U|-U_bound)||.  
For simplicity, the constraint can always be set to `[-1, 1]`, as this already keeps long-term memory and the difference in performance is small.
 73 | 
 74 | ### 3, Usage of batch normalization (BN)  
 75 | Generally, over 3 layers, BN can help accelerate the training. BN can be used before the activation function or after it. In our experiments, we find it converges faster by putting BN after the activation function. However, for tasks such as PTB_c where the output of one batch is further used as the initialization of the next batch, it is better to put BN before activation as mentioned at the above example.  
 76 | 
 77 | ### 4, Learning rate  
 78 | In our experiments, ADAM with a learning rate of 2e-4 works well.  
 79 | 
 80 | ### 5, Weight decay  
 81 | If weight decay is used, no need to add the recurrent weights.  
 82 | 
 83 | ### 6, Usage of dropout  
 84 | Dropout (if used) is applied with the same mask over time.  
 85 | 
 86 | ### Note  
 87 | The above considerations are just suggestions. I did not explore lots of training techniques such as training methods, initialization techniques. So better results may be achieved with other options.  
 88 | 
 89 | # Other implementations
 90 | Tensorflow:  
 91 | [https://github.com/batzner/indrnn](https://github.com/batzner/indrnn)  
 92 | Keras:  
 93 | [https://github.com/titu1994/Keras-IndRNN](https://github.com/titu1994/Keras-IndRNN)  
 94 | Pytorch:  
 95 | [https://github.com/Sunnydreamrain/IndRNN_pytorch](https://github.com/Sunnydreamrain/IndRNN_pytorch)  
 96 | [https://github.com/StefOe/indrnn-pytorch](https://github.com/StefOe/indrnn-pytorch)  
 97 | [https://github.com/theSage21/IndRNN](https://github.com/theSage21/IndRNN)  
 98 | [https://github.com/zhangxu0307/Ind-RNN](https://github.com/zhangxu0307/Ind-RNN)  
 99 | CNTK:  
100 | [https://github.com/delzac/cntkx](https://github.com/delzac/cntkx)  
101 | Chainer:  
102 | [https://github.com/0shimax/chainer-IndRNN](https://github.com/0shimax/chainer-IndRNN)  
103 | 


--------------------------------------------------------------------------------
/mnist/pixelmnist.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import sys
  3 | import argparse
  4 | import os
  5 | 
  6 | import time
  7 | from collections import OrderedDict
  8 | import lasagne
  9 | import theano
 10 | import numpy as np
 11 | import theano.tensor as T
 12 | 
 13 | from lasagne.layers import InputLayer,ReshapeLayer,DimshuffleLayer,Gate,DenseLayer,ElemwiseSumLayer
 14 | from lasagne.layers import ConcatLayer,NonlinearityLayer,DropoutLayer,BatchNormLayer
 15 | from lasagne.nonlinearities import softmax, rectify,tanh,leaky_rectify
 16 | from lasagne.init import Uniform, Normal,HeNormal
 17 | 
 18 | from lasagne.layers import RecurrentLayer,LSTMLayer
 19 | from IndRNN_onlyrecurrent import IndRNNLayer_onlyrecurrent as indrnn_onlyrecurrent
 20 | 
 21 | parser = argparse.ArgumentParser(description='IndRNN solving the pixel MNIST problem')
 22 | parser.add_argument('--model', type=str, default='indrnn', help='models')
 23 | parser.add_argument('--num_layers', type=int, default=6)
 24 | parser.add_argument('--hidden_units', type=int, default=128)
 25 | parser.add_argument('--batch_size', type=int, default=32)
 26 | parser.add_argument('--lr', type=np.float32, default=2e-4, help='lr')
 27 | parser.add_argument('--ini', type=np.float32, default=0.001, help='ini')
 28 | parser.add_argument('--gradclipvalue', type=np.float32, default=10)
 29 | parser.add_argument('--use_permute', action='store_true', default=False)
 30 | parser.add_argument('--use_weightdecay_nohiddenW', action='store_true', default=False)
 31 | parser.add_argument('--decayrate', type=np.float32, default=5e-4,help='lr')
 32 | parser.add_argument('--ini_b', type=np.float32, default=0.0)
 33 | parser.add_argument('--MAG', type=int, default=2)
 34 | 
 35 | 
 36 | parser.add_argument('--use_bn_afterrnn', action='store_true', default=False)
 37 | args = parser.parse_args()
 38 | print (args)
 39 | 
 40 | 
 41 | batch_size = args.batch_size
 42 | hidden_units=args.hidden_units
 43 | outputclass=10
 44 | 
 45 | from Data_gen import DataHandler,testDataHandler
 46 | if args.use_permute:
 47 |   from Data_gen_permute import DataHandler,testDataHandler
 48 | dh=DataHandler(batch_size)
 49 | dh_test=testDataHandler(batch_size)
 50 | x,y=dh.get_batch()
 51 | seq_len=x.shape[1]
 52 | feature_size=x.shape[2]
 53 | 
 54 | U_bound=pow(args.MAG, 1.0 / seq_len)
 55 | U_lowbound=pow(1.0/args.MAG, 1.0 / seq_len)
 56 | act=rectify
 57 | lr=args.lr
 58 | num_layers=args.num_layers
 59 | opti=lasagne.updates.adam
 60 | 
 61 | 
 62 | def build_lstm_network(rnnmodel):
 63 |     net = {}    
 64 |     net['input'] = InputLayer((batch_size, seq_len, feature_size))
 65 |     net['rnn']=rnnmodel(net['input'],hidden_units,forgetgate=lasagne.layers.Gate(b=lasagne.init.Constant(1.)),peepholes=False, only_return_final=True,grad_clipping=args.gradclipvalue)
 66 |     net['out']=DenseLayer(net['rnn'],outputclass,nonlinearity=softmax)
 67 |     return net  
 68 | def build_rnn_network(rnnmodel):
 69 |     net = {}    
 70 |     net['input'] = InputLayer((batch_size, seq_len, feature_size))
 71 |     net['rnn']=rnnmodel(net['input'],hidden_units,nonlinearity=act,W_in_to_hid=Normal(args.ini),W_hid_to_hid=lambda shape:  np.identity(hidden_units,dtype=np.float32),only_return_final=True ,grad_clipping=args.gradclipvalue)
 72 |     net['out']=DenseLayer(net['rnn'],outputclass,nonlinearity=softmax)
 73 |     return net
 74 | 
 75 | 
 76 | ini_W=HeNormal(gain=np.sqrt(2)/2.0)
 77 | if args.use_bn_afterrnn:
 78 |   ini_W=Uniform(args.ini)
 79 | 
 80 | 
 81 | def build_res_rnn_network(rnnmodel):
 82 |     net = {}        
 83 |     net['input'] = InputLayer((batch_size, seq_len, feature_size))     
 84 |     net['rnn0']=DimshuffleLayer(net['input'],(1,0,2))
 85 |     for l in range(1, num_layers+1):
 86 |       hidini=0
 87 |       if l==num_layers:
 88 |         hidini=U_lowbound
 89 | 
 90 |       net['rnn%d'%(l-1)]=ReshapeLayer(net['rnn%d'%(l-1)], (batch_size* seq_len, -1))          
 91 |       net['rnn%d'%(l-1)]=DenseLayer(net['rnn%d'%(l-1)],hidden_units,W=ini_W,b=Uniform(range=(0,args.ini_b)),nonlinearity=None)  #W=Uniform(ini_rernn_in_to_hid),         #
 92 |       net['rnn%d'%(l-1)]=ReshapeLayer(net['rnn%d'%(l-1)], (seq_len, batch_size,  -1))  
 93 |       
 94 |       net['rnn%d'%l]=net['rnn%d'%(l-1)]
 95 |       if not args.use_bn_afterrnn:
 96 |         net['rnn%d'%l]=BatchNormLayer(net['rnn%d'%l],axes= (0,1),beta=Uniform(range=(0,args.ini_b)))       
 97 |       
 98 |       net['rnn%d'%l]=rnnmodel(net['rnn%d'%l],hidden_units,W_hid_to_hid=Uniform(range=(hidini,U_bound)),nonlinearity=act,only_return_final=False, grad_clipping=args.gradclipvalue)      
 99 |       if args.use_bn_afterrnn:
100 |         net['rnn%d'%l]=BatchNormLayer(net['rnn%d'%l],axes= (0,1))
101 |       if l==num_layers:  
102 |         net['rnn%d'%num_layers]=lasagne.layers.SliceLayer(net['rnn%d'%num_layers],indices=-1, axis=0)     
103 |            
104 |     net['out']=DenseLayer(net['rnn%d'%num_layers],outputclass,nonlinearity=softmax)
105 |     return net
106 | 
107 | 
# Build the network selected by --model. An unrecognised name is rejected
# here instead of surfacing later as a NameError on `learn_net`.
if args.model=='rnn':
  learn_net=build_rnn_network(RecurrentLayer)
elif args.model=='lstm':
  learn_net=build_lstm_network(LSTMLayer)
elif args.model=='indrnn':
  learn_net=build_res_rnn_network(indrnn_onlyrecurrent)
else:
  raise ValueError("unknown --model '%s' (expected 'rnn', 'lstm' or 'indrnn')" % args.model)


# Symbolic inputs: a 3-D float tensor of sequences and an int32 vector of class labels.
X_sym = T.tensor3('inputs',dtype=theano.config.floatX)
y_sym = T.ivector()#T.vector('label',dtype=theano.config.floatX)    

# Training objective: mean cross-entropy, with top-1 accuracy reported alongside.
prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=False)#,batch_norm_use_averages=True
loss = T.mean(lasagne.objectives.categorical_crossentropy(prediction, y_sym))
acc=T.mean(lasagne.objectives.categorical_accuracy(prediction, y_sym, top_k=1),dtype=theano.config.floatX)
if args.use_weightdecay_nohiddenW:
  # L2 penalty on every regularizable parameter except the recurrent weights.
  params = lasagne.layers.get_all_params(learn_net['out'], regularizable=True)
  for para in params:
    if para.name!='hidden_to_hidden.W':
      loss += args.decayrate *lasagne.regularization.apply_penalty(para, lasagne.regularization.l2)#*T.clip(T.abs_(para)-1,0,100))  


params = lasagne.layers.get_all_params(learn_net['out'], trainable=True)

# The learning rate is a symbolic input so it can be changed between steps
# without recompiling.
learning_ratetrain = T.scalar(name='learning_ratetrain',dtype=theano.config.floatX)

grads = theano.grad(loss, params)

updates = opti( grads, params, learning_rate=learning_ratetrain)#nesterov_momentum( loss, params, learning_rate=learning_ratetrain)#

print('Compiling')
train_fn = theano.function([X_sym, y_sym,learning_ratetrain], [loss,acc], updates=updates)

# test_fn: deterministic pass with batch_norm_use_averages=False, i.e. batch
# norm is told not to use its stored averages.
test_prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=True,batch_norm_use_averages=False)
test_loss = T.mean(lasagne.objectives.categorical_crossentropy(test_prediction, y_sym))
test_acc=T.mean(lasagne.objectives.categorical_accuracy(test_prediction, y_sym, top_k=1),dtype=theano.config.floatX)
test_fn = theano.function([X_sym, y_sym], [test_loss,test_acc])

# bn_test_fn: same deterministic pass without the override, so Lasagne's
# default batch-norm behaviour for deterministic=True applies.
bn_test_prediction = lasagne.layers.get_output(learn_net['out'], X_sym,deterministic=True)
bn_test_loss = T.mean(lasagne.objectives.categorical_crossentropy(bn_test_prediction, y_sym))
bn_test_acc=T.mean(lasagne.objectives.categorical_accuracy(bn_test_prediction, y_sym, top_k=1),dtype=theano.config.floatX)
bn_test_fn = theano.function([X_sym, y_sym], [bn_test_loss,bn_test_acc])
149 | 
150 |       
151 | 
# ---------------------------------------------------------------------------
# Training loop. Every 1000 steps the mean training accuracy is printed and
# the test handler is evaluated twice, once with test_fn and once with
# bn_test_fn. Every 100*6000 steps the learning rate is multiplied by 0.1,
# and training stops once it drops below 1e-8.
# ---------------------------------------------------------------------------
learning_rate=np.float32(lr)
print ('learning rate',learning_rate)
tacc=0
count=0
# The parameter list does not change during training, so the recurrent
# weights that need clipping are collected once instead of on every step.
recurrent_weights=[para for para in params if para.name=='hidden_to_hidden.W']
# (label printed, compiled function) for the two evaluation passes.
eval_passes=(("accuracy: ", test_fn), ("bn_accuracy: ", bn_test_fn))
for batchi in range(1,10000000):
  x,y=dh.get_batch()

  # Keep the IndRNN recurrent weights inside [-U_bound, U_bound] before each update.
  if args.model=='indrnn':
    for para in recurrent_weights:
      para.set_value(np.clip(para.get_value(),-1*U_bound,U_bound))

  mse,acc=train_fn(x, y,learning_rate)
  tacc+=acc
  count+=1

  if batchi%1000==0:#1000
    print ('train acc',tacc/count)
    count=0
    tacc=0

    for report_tag, eval_fn in eval_passes:
      totaltestacc=0
      totatltestno=0
      # Stops once the number of batches drawn equals dh_test.GetDatasetSize().
      # NOTE(review): confirm that value counts batches rather than samples.
      while(1):
        inputs, targets = dh_test.get_batch()
        test_mse,test_acc = eval_fn(inputs,targets)
        totaltestacc+=test_acc
        totatltestno+=1
        if totatltestno==dh_test.GetDatasetSize():
          break
      print (report_tag, totaltestacc/totatltestno)

  if batchi%(100*6000)==0:    #dh.GetDatasetSize()==0:
    learning_rate=np.float32(learning_rate*0.1)
    print ('learning rate',learning_rate)
    if learning_rate<1e-8:
      break

# Persist the final parameter values as positional arrays in an .npz archive.
save_name='MNIST_'+args.model
np.savez(save_name, *lasagne.layers.get_all_param_values(learn_net['out']))


--------------------------------------------------------------------------------
/mnist/Data_gen.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from threading import Thread
  3 | import sys
  4 | import os
  5 | 
  6 | 
  7 | def load_dataset():
  8 |     # We first define a download function, supporting both Python 2 and 3.
  9 |     if sys.version_info[0] == 2:
 10 |         from urllib import urlretrieve
 11 |     else:
 12 |         from urllib.request import urlretrieve
 13 | 
 14 |     def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
 15 |         print("Downloading %s" % filename)
 16 |         urlretrieve(source + filename, filename)
 17 | 
 18 |     # We then define functions for loading MNIST images and labels.
 19 |     # For convenience, they also download the requested files if needed.
 20 |     import gzip
 21 | 
 22 |     def load_mnist_images(filename):
 23 |         if not os.path.exists(filename):
 24 |             download(filename)
 25 |         # Read the inputs in Yann LeCun's binary format.
 26 |         with gzip.open(filename, 'rb') as f:
 27 |             data = np.frombuffer(f.read(), np.uint8, offset=16)
 28 |         # The inputs are vectors now, we reshape them to monochrome 2D images,
 29 |         # following the shape convention: (examples, channels, rows, columns)
 30 |         data = data.reshape(-1, 1, 28, 28)
 31 |         # The inputs come as bytes, we convert them to float32 in range [0,1].
 32 |         # (Actually to range [0, 255/256], for compatibility to the version
 33 |         # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
 34 |         return data / np.float32(256)
 35 | 
 36 |     def load_mnist_labels(filename):
 37 |         if not os.path.exists(filename):
 38 |             download(filename)
 39 |         # Read the labels in Yann LeCun's binary format.
 40 |         with gzip.open(filename, 'rb') as f:
 41 |             data = np.frombuffer(f.read(), np.uint8, offset=8)
 42 |         # The labels are vectors of integers now, that's exactly what we want.
 43 |         return data
 44 | 
 45 |     # We can now download and read the training and test set images and labels.
 46 |     X_train = load_mnist_images('train-images-idx3-ubyte.gz')
 47 |     y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
 48 |     X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
 49 |     y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
 50 | 
 51 |     return (X_train, y_train),  (X_test, y_test)
 52 | 
 53 | 
 54 |    
 55 | (X_train, y_train), (X_test, y_test) = load_dataset()
 56 | X_train = X_train.reshape(X_train.shape[0], -1, 1)
 57 | X_test = X_test.reshape(X_test.shape[0], -1, 1)
 58 | X_train = X_train.astype('float32')
 59 | X_test = X_test.astype('float32')
 60 | X_train -= 0.5
 61 | X_test -= 0.5
 62 | X_train *= 2
 63 | X_test *= 2
 64 | 
 65 | class batch_thread():
 66 |   def __init__(self, result, batch_size_):
 67 |     self.result = result
 68 |     self.batch_size_=batch_size_
 69 |     self.indices = np.arange(len(y_train))
 70 |     np.random.shuffle(self.indices)
 71 |     self.idx=0
 72 |   def __call__(self): 
 73 |     batch_data_  = np.zeros((self.batch_size_, X_train.shape[1], X_train.shape[2]), dtype=np.float32)
 74 |     batch_label_ = np.zeros((self.batch_size_), dtype=np.int32)
 75 |     for i in range(self.batch_size_):
 76 |       batch_data_[i,:,:]=X_train[self.indices[self.idx],:,:]
 77 |       batch_label_[i]=y_train[self.indices[self.idx]]
 78 |       self.idx+=1
 79 |       if self.idx==len(self.indices):
 80 |         self.idx=0
 81 |         np.random.shuffle(self.indices)
 82 |          
 83 |     self.result['data']=batch_data_
 84 |     self.result['label']=batch_label_ 
 85 | 
 86 | 
 87 | class DataHandler(object):
 88 | 
 89 |   def __init__(self, batch_size):
 90 |     self.batch_size_ = batch_size    # batch size            
 91 | 
 92 |     self.batch_data_  = np.zeros((self.batch_size_, 3, 32, 32), dtype=np.float32)
 93 |     self.batch_label_ = np.zeros((self.batch_size_), dtype=np.int32)
 94 |     
 95 |     self.thread_result = {}
 96 |     self.thread = None
 97 |     self.batch_advancer =batch_thread(self.thread_result,self.batch_size_)
 98 |     
 99 |     
100 |     self.dispatch_worker()
101 |     self.join_worker()
102 | 
103 | 
104 |   def get_batch(self):
105 |     #self.batch_data_  = np.zeros((self.batch_size_, 3, self.seq_length_, 112, 112), dtype=np.float32)
106 |     if self.thread is not None:
107 |       self.join_worker() 
108 |       
109 |     self.batch_data_=self.thread_result['data']
110 |     self.batch_label_=self.thread_result['label']
111 |     
112 |     self.dispatch_worker()
113 |     return self.batch_data_, self.batch_label_
114 | 
115 | 
116 |   def dispatch_worker(self):
117 |     assert self.thread is None
118 |     self.thread = Thread(target=self.batch_advancer)
119 |     self.thread.start()
120 | 
121 |   def join_worker(self):
122 |     assert self.thread is not None
123 |     self.thread.join()
124 |     self.thread = None
125 | 
126 |   def GetDatasetSize(self):
127 |     return len(Aug_Y_train)//(2*self.batch_size_)
128 | 
129 | 
130 | 
131 | 
class testbatch_thread():
  """Callable that writes one test batch into ``result``.

  Reads the module-level ``X_test`` / ``y_test`` arrays. The test indices are
  shuffled once at construction; batches walk through them in order and wrap
  around to the start when fewer than ``batch_size_`` samples remain.
  """

  def __init__(self, result, batch_size_):
    self.result = result
    self.batch_size_=batch_size_
    self.indices = np.arange(len(y_test))
    np.random.shuffle(self.indices)
    self.idx=0

  def __call__(self):
    """Fill ``result['data']`` (float32) and ``result['label']`` (int32)."""
    total = len(y_test)
    size = self.batch_size_
    batch_data_  = np.zeros((size, X_test.shape[1], X_test.shape[2]), dtype=np.float32)
    batch_label_ = np.zeros((size), dtype=np.int32)

    remaining = total - self.idx
    if size > remaining:
      # Not enough samples left: take the tail, then top up from the start.
      needed = size - remaining
      picks = np.concatenate([self.indices[self.idx:total], self.indices[0:needed]])
      self.idx = needed
    else:
      picks = self.indices[self.idx:self.idx + size]
      self.idx += size

    # Copy into the preallocated buffers so both branches return the same
    # dtypes (previously the non-wrapping branch rebound the names and handed
    # back labels in y_test's own dtype instead of int32).
    batch_data_[:] = X_test[picks, :, :]
    batch_label_[:] = y_test[picks]

    self.result['data']=batch_data_
    self.result['label']=batch_label_

    if self.idx==total:
      self.idx=0
159 | 
160 | 
class testDataHandler(object):
  """Test-batch provider that prefetches the next batch on a worker thread."""

  def __init__(self, batch_size):
    self.batch_size_ = batch_size

    # Placeholder buffers; get_batch() replaces them with the worker output.
    placeholder_shape = (self.batch_size_, X_test.shape[1], X_test.shape[2])
    self.batch_data_ = np.zeros(placeholder_shape, dtype=np.float32)
    self.batch_label_ = np.zeros((self.batch_size_), dtype=np.int32)

    self.thread_result = {}
    self.thread = None
    self.batch_advancer = testbatch_thread(self.thread_result, self.batch_size_)

    # Build the first batch synchronously so thread_result is populated.
    self.dispatch_worker()
    self.join_worker()

  def get_batch(self):
    """Return ``(data, label)`` for the prefetched batch and start the next one."""
    if self.thread is not None:
      self.join_worker()

    prefetched = self.thread_result
    self.batch_data_ = prefetched['data']
    self.batch_label_ = prefetched['label']

    self.dispatch_worker()
    return self.batch_data_, self.batch_label_

  def dispatch_worker(self):
    """Start a worker thread running the batch builder; none may be pending."""
    assert self.thread is None
    worker = Thread(target=self.batch_advancer)
    self.thread = worker
    worker.start()

  def join_worker(self):
    """Wait for the pending worker thread and clear the handle."""
    pending = self.thread
    assert pending is not None
    pending.join()
    self.thread = None

  def GetDatasetSize(self):
    """Return the number of full batches in the test set."""
    return len(y_test) // self.batch_size_
201 |   
202 |   
203 |   
204 |   
205 |   
206 |   
207 |   
208 |   
209 |   
class finaltestbatch_thread():
  """Callable that writes a batch of test samples interleaved with flipped copies.

  Even rows of the output hold the samples, odd rows hold the same samples
  reversed along the last axis; labels are duplicated accordingly.

  NOTE(review): this reads a module-level ``pre_X_test`` that is not defined
  in the visible part of this file, and the buffers are hard-coded to
  (3, 32, 32) samples -- looks like a leftover from another dataset; confirm
  before use.
  """

  def __init__(self, result, batch_size_):
    self.result = result
    self.batch_size_ = batch_size_
    self.idx = 0

  def __call__(self):
    """Fill ``result['data']`` / ``result['label']`` with 2 * batch_size_ rows."""
    size = self.batch_size_
    window = slice(self.idx, self.idx + size)
    data = np.zeros((size * 2, 3, 32, 32), dtype=np.float32)
    labels = np.zeros((size * 2), dtype=np.int32)

    # Even slots: samples as stored.
    data[::2, :, :, :] = pre_X_test[window, :, :, :]
    labels[::2] = y_test[window]

    # Odd slots: the same samples reversed along the last axis.
    data[1::2, :, :, :] = pre_X_test[window, :, :, ::-1]
    labels[1::2] = y_test[window]

    self.result['data'] = data
    self.result['label'] = labels
    self.idx += size
    if self.idx == len(y_test):
      self.idx = 0
230 | 
231 | 
class finaltestDataHandler(object):
  """Prefetching provider for the flip-augmented test batches."""

  def __init__(self, batch_size):
    self.batch_size_ = batch_size

    # Placeholder buffers (two rows per sample); replaced on first get_batch().
    doubled = self.batch_size_ * 2
    self.batch_data_ = np.zeros((doubled, 3, 32, 32), dtype=np.float32)
    self.batch_label_ = np.zeros((doubled), dtype=np.int32)

    self.thread_result = {}
    self.thread = None
    self.batch_advancer = finaltestbatch_thread(self.thread_result, self.batch_size_)

    # Build the first batch synchronously so thread_result is populated.
    self.dispatch_worker()
    self.join_worker()

  def get_batch(self):
    """Return ``(data, label)`` for the prefetched batch and start the next one."""
    if self.thread is not None:
      self.join_worker()

    prefetched = self.thread_result
    self.batch_data_ = prefetched['data']
    self.batch_label_ = prefetched['label']

    self.dispatch_worker()
    return self.batch_data_, self.batch_label_

  def dispatch_worker(self):
    """Start a worker thread running the batch builder; none may be pending."""
    assert self.thread is None
    worker = Thread(target=self.batch_advancer)
    self.thread = worker
    worker.start()

  def join_worker(self):
    """Wait for the pending worker thread and clear the handle."""
    pending = self.thread
    assert pending is not None
    pending.join()
    self.thread = None

  def GetDatasetSize(self):
    """Return the number of full batches in the test set."""
    return len(y_test) // self.batch_size_
272 | 


--------------------------------------------------------------------------------
/mnist/Data_gen_permute.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from threading import Thread
  3 | import sys
  4 | import os
  5 | 
  6 | 
  7 | def load_dataset():
  8 |     # We first define a download function, supporting both Python 2 and 3.
  9 |     if sys.version_info[0] == 2:
 10 |         from urllib import urlretrieve
 11 |     else:
 12 |         from urllib.request import urlretrieve
 13 | 
 14 |     def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
 15 |         print("Downloading %s" % filename)
 16 |         urlretrieve(source + filename, filename)
 17 | 
 18 |     # We then define functions for loading MNIST images and labels.
 19 |     # For convenience, they also download the requested files if needed.
 20 |     import gzip
 21 | 
 22 |     def load_mnist_images(filename):
 23 |         if not os.path.exists(filename):
 24 |             download(filename)
 25 |         # Read the inputs in Yann LeCun's binary format.
 26 |         with gzip.open(filename, 'rb') as f:
 27 |             data = np.frombuffer(f.read(), np.uint8, offset=16)
 28 |         # The inputs are vectors now, we reshape them to monochrome 2D images,
 29 |         # following the shape convention: (examples, channels, rows, columns)
 30 |         data = data.reshape(-1, 1, 28, 28)
 31 |         # The inputs come as bytes, we convert them to float32 in range [0,1].
 32 |         # (Actually to range [0, 255/256], for compatibility to the version
 33 |         # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
 34 |         return data / np.float32(256)
 35 | 
 36 |     def load_mnist_labels(filename):
 37 |         if not os.path.exists(filename):
 38 |             download(filename)
 39 |         # Read the labels in Yann LeCun's binary format.
 40 |         with gzip.open(filename, 'rb') as f:
 41 |             data = np.frombuffer(f.read(), np.uint8, offset=8)
 42 |         # The labels are vectors of integers now, that's exactly what we want.
 43 |         return data
 44 | 
 45 |     # We can now download and read the training and test set images and labels.
 46 |     X_train = load_mnist_images('train-images-idx3-ubyte.gz')
 47 |     y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
 48 |     X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
 49 |     y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
 50 | 
 51 |     return (X_train, y_train),  (X_test, y_test)
 52 | 
 53 | 
 54 |    
 55 | (X_train, y_train), (X_test, y_test) = load_dataset()
 56 | X_train = X_train.reshape(X_train.shape[0], -1, 1)
 57 | X_test = X_test.reshape(X_test.shape[0], -1, 1)
 58 | X_train = X_train.astype('float32')
 59 | X_test = X_test.astype('float32')
 60 | X_train -= 0.5
 61 | X_test -= 0.5
 62 | X_train *= 2
 63 | X_test *= 2
 64 |  
 65 |  
 66 | seq_len=X_train.shape[1]
 67 | if seq_len!=X_test.shape[1]:
 68 |   print ('seq len wrong')
 69 |   assert 2==3
 70 | P = np.random.permutation(seq_len)
 71 | X_train=X_train[:,P,:]
 72 | X_test=X_test[:,P,:]
 73 | 
 74 | 
 75 | 
 76 | class batch_thread():
 77 |   def __init__(self, result, batch_size_):
 78 |     self.result = result
 79 |     self.batch_size_=batch_size_
 80 |     self.indices = np.arange(len(y_train))
 81 |     np.random.shuffle(self.indices)
 82 |     self.idx=0
 83 |   def __call__(self): 
 84 |     batch_data_  = np.zeros((self.batch_size_, X_train.shape[1], X_train.shape[2]), dtype=np.float32)
 85 |     batch_label_ = np.zeros((self.batch_size_), dtype=np.int32)
 86 |     for i in range(self.batch_size_):
 87 |       batch_data_[i,:,:]=X_train[self.indices[self.idx],:,:]
 88 |       batch_label_[i]=y_train[self.indices[self.idx]]
 89 |       self.idx+=1
 90 |       if self.idx==len(self.indices):
 91 |         self.idx=0
 92 |         np.random.shuffle(self.indices)
 93 |          
 94 |     self.result['data']=batch_data_
 95 |     self.result['label']=batch_label_ 
 96 | 
 97 | 
 98 | class DataHandler(object):
 99 | 
100 |   def __init__(self, batch_size):
101 |     self.batch_size_ = batch_size    # batch size            
102 | 
103 |     self.batch_data_  = np.zeros((self.batch_size_, 3, 32, 32), dtype=np.float32)
104 |     self.batch_label_ = np.zeros((self.batch_size_), dtype=np.int32)
105 |     
106 |     self.thread_result = {}
107 |     self.thread = None
108 |     self.batch_advancer =batch_thread(self.thread_result,self.batch_size_)
109 |     
110 |     
111 |     self.dispatch_worker()
112 |     self.join_worker()
113 | 
114 | 
115 |   def get_batch(self):
116 |     #self.batch_data_  = np.zeros((self.batch_size_, 3, self.seq_length_, 112, 112), dtype=np.float32)
117 |     if self.thread is not None:
118 |       self.join_worker() 
119 |       
120 |     self.batch_data_=self.thread_result['data']
121 |     self.batch_label_=self.thread_result['label']
122 |     
123 |     self.dispatch_worker()
124 |     return self.batch_data_, self.batch_label_
125 | 
126 | 
127 |   def dispatch_worker(self):
128 |     assert self.thread is None
129 |     self.thread = Thread(target=self.batch_advancer)
130 |     self.thread.start()
131 | 
132 |   def join_worker(self):
133 |     assert self.thread is not None
134 |     self.thread.join()
135 |     self.thread = None
136 | 
137 |   def GetDatasetSize(self):
138 |     return len(Aug_Y_train)//(2*self.batch_size_)
139 | 
140 | 
141 | 
142 | 
class testbatch_thread():
  """Callable that writes one test batch into ``result``.

  Reads the module-level ``X_test`` / ``y_test`` arrays. The test indices are
  shuffled once at construction; batches walk through them in order and wrap
  around to the start when fewer than ``batch_size_`` samples remain.
  """

  def __init__(self, result, batch_size_):
    self.result = result
    self.batch_size_=batch_size_
    self.indices = np.arange(len(y_test))
    np.random.shuffle(self.indices)
    self.idx=0

  def __call__(self):
    """Fill ``result['data']`` (float32) and ``result['label']`` (int32)."""
    total = len(y_test)
    size = self.batch_size_
    batch_data_  = np.zeros((size, X_test.shape[1], X_test.shape[2]), dtype=np.float32)
    batch_label_ = np.zeros((size), dtype=np.int32)

    remaining = total - self.idx
    if size > remaining:
      # Not enough samples left: take the tail, then top up from the start.
      needed = size - remaining
      picks = np.concatenate([self.indices[self.idx:total], self.indices[0:needed]])
      self.idx = needed
    else:
      picks = self.indices[self.idx:self.idx + size]
      self.idx += size

    # Copy into the preallocated buffers so both branches return the same
    # dtypes (previously the non-wrapping branch rebound the names and handed
    # back labels in y_test's own dtype instead of int32).
    batch_data_[:] = X_test[picks, :, :]
    batch_label_[:] = y_test[picks]

    self.result['data']=batch_data_
    self.result['label']=batch_label_

    if self.idx==total:
      self.idx=0
170 | 
171 | 
class testDataHandler(object):
  """Test-batch provider that prefetches the next batch on a worker thread."""

  def __init__(self, batch_size):
    self.batch_size_ = batch_size

    # Placeholder buffers; get_batch() replaces them with the worker output.
    placeholder_shape = (self.batch_size_, X_test.shape[1], X_test.shape[2])
    self.batch_data_ = np.zeros(placeholder_shape, dtype=np.float32)
    self.batch_label_ = np.zeros((self.batch_size_), dtype=np.int32)

    self.thread_result = {}
    self.thread = None
    self.batch_advancer = testbatch_thread(self.thread_result, self.batch_size_)

    # Build the first batch synchronously so thread_result is populated.
    self.dispatch_worker()
    self.join_worker()

  def get_batch(self):
    """Return ``(data, label)`` for the prefetched batch and start the next one."""
    if self.thread is not None:
      self.join_worker()

    prefetched = self.thread_result
    self.batch_data_ = prefetched['data']
    self.batch_label_ = prefetched['label']

    self.dispatch_worker()
    return self.batch_data_, self.batch_label_

  def dispatch_worker(self):
    """Start a worker thread running the batch builder; none may be pending."""
    assert self.thread is None
    worker = Thread(target=self.batch_advancer)
    self.thread = worker
    worker.start()

  def join_worker(self):
    """Wait for the pending worker thread and clear the handle."""
    pending = self.thread
    assert pending is not None
    pending.join()
    self.thread = None

  def GetDatasetSize(self):
    """Return the number of full batches in the test set."""
    return len(y_test) // self.batch_size_
212 |   
213 |   
214 |   
215 |   
216 |   
217 |   
218 |   
219 |   
220 |   
class finaltestbatch_thread():
  """Callable that writes a batch of test samples interleaved with flipped copies.

  Even rows of the output hold the samples, odd rows hold the same samples
  reversed along the last axis; labels are duplicated accordingly.

  NOTE(review): this reads a module-level ``pre_X_test`` that is not defined
  in the visible part of this file, and the buffers are hard-coded to
  (3, 32, 32) samples -- looks like a leftover from another dataset; confirm
  before use.
  """

  def __init__(self, result, batch_size_):
    self.result = result
    self.batch_size_ = batch_size_
    self.idx = 0

  def __call__(self):
    """Fill ``result['data']`` / ``result['label']`` with 2 * batch_size_ rows."""
    size = self.batch_size_
    window = slice(self.idx, self.idx + size)
    data = np.zeros((size * 2, 3, 32, 32), dtype=np.float32)
    labels = np.zeros((size * 2), dtype=np.int32)

    # Even slots: samples as stored.
    data[::2, :, :, :] = pre_X_test[window, :, :, :]
    labels[::2] = y_test[window]

    # Odd slots: the same samples reversed along the last axis.
    data[1::2, :, :, :] = pre_X_test[window, :, :, ::-1]
    labels[1::2] = y_test[window]

    self.result['data'] = data
    self.result['label'] = labels
    self.idx += size
    if self.idx == len(y_test):
      self.idx = 0
241 | 
242 | 
class finaltestDataHandler(object):
  """Prefetching provider for the flip-augmented test batches."""

  def __init__(self, batch_size):
    self.batch_size_ = batch_size

    # Placeholder buffers (two rows per sample); replaced on first get_batch().
    doubled = self.batch_size_ * 2
    self.batch_data_ = np.zeros((doubled, 3, 32, 32), dtype=np.float32)
    self.batch_label_ = np.zeros((doubled), dtype=np.int32)

    self.thread_result = {}
    self.thread = None
    self.batch_advancer = finaltestbatch_thread(self.thread_result, self.batch_size_)

    # Build the first batch synchronously so thread_result is populated.
    self.dispatch_worker()
    self.join_worker()

  def get_batch(self):
    """Return ``(data, label)`` for the prefetched batch and start the next one."""
    if self.thread is not None:
      self.join_worker()

    prefetched = self.thread_result
    self.batch_data_ = prefetched['data']
    self.batch_label_ = prefetched['label']

    self.dispatch_worker()
    return self.batch_data_, self.batch_label_

  def dispatch_worker(self):
    """Start a worker thread running the batch builder; none may be pending."""
    assert self.thread is None
    worker = Thread(target=self.batch_advancer)
    self.thread = worker
    worker.start()

  def join_worker(self):
    """Wait for the pending worker thread and clear the handle."""
    pending = self.thread
    assert pending is not None
    pending.join()
    self.thread = None

  def GetDatasetSize(self):
    """Return the number of full batches in the test set."""
    return len(y_test) // self.batch_size_
283 | 


--------------------------------------------------------------------------------
/action recognition/data_reader_numpy_witheval.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import h5py
  3 | import numpy as np
  4 | import time
  5 | import random
  6 | #import glob
  7 | #import skimage.transform
  8 | #from skimage import color
  9 | import pickle
 10 | #import theano
 11 | #import cv2
 12 | from multiprocessing import Pool
 13 | from threading import Thread
 14 | import os.path
 15 | #RGB_frames = '/home/sl669/caffe/colordataset/ImageNET/ILSVRC2015/Data/CLS-LOC/val/'#'/home/sl669/caffe/ucf101/framearrays/'#
 16 | 
 17 | from __main__ import train_datasets
 18 | #train_datasets='train_ntus'
 19 | datasets=train_datasets
 20 | dataname=datasets+'.npy'
 21 | labelname=datasets+'_label.npy'
 22 | lenname=datasets+'_len.npy'
 23 | data_handle=np.load(dataname)
 24 | label_handle=np.load(labelname)
 25 | len_handle=np.load(lenname)
 26 | num_videos = len(data_handle)  
 27 | train_no=int(num_videos*0.95)
 28 | test_no=num_videos-train_no
 29 | 
 30 | shufflevideolist=np.arange(num_videos)
 31 | np.random.shuffle(shufflevideolist)
 32 | 
 33 | shufflevideolist_train=shufflevideolist[:train_no]
 34 | shufflevideolist_test=shufflevideolist[train_no:]
 35 | 
 36 | print ('Dataset train size, test size', train_no,test_no)
 37 | 
 38 | 
 39 | def rotate( input,s,b):
 40 |   shape=input.shape
 41 |   input=input.reshape((-1,3))
 42 |   XT=input[:,0]
 43 |   YT=input[:,1]
 44 |   ZT=input[:,2]
 45 |   s=s/180.0*np.pi
 46 |   b=b/180.0*np.pi
 47 |   RX = XT*np.cos(b) - ZT*np.sin(b) + ZT*np.sin(b)*np.cos(s) + YT*np.sin(b)*np.sin(s) - ZT*np.sin(b)*(np.cos(s) - 1);
 48 |   RY = YT*np.cos(s);
 49 |   RZ = ZT*np.cos(b)*np.cos(s) - ZT*(np.cos(b) - 1) - XT*np.sin(b) + YT*np.cos(b)*np.sin(s) - ZT*np.cos(b)*(np.cos(s) - 1);
 50 |   RX=RX.reshape((-1,1))
 51 |   RY=RY.reshape((-1,1))
 52 |   RZ=RZ.reshape((-1,1))
 53 |   output=np.concatenate([RX,RY,RZ],axis=1)
 54 |   output=output.reshape(shape)
 55 |   #print(shape,output.shape,input.shape)
 56 |   return output 
 57 | 
 58 | class batch_thread_train():
 59 |   def __init__(self, result, batch_size_,seq_len,use_rotation=False):
 60 |     self.result = result
 61 |     self.batch_size_=batch_size_
 62 |     self.seq_len=seq_len
 63 |     self.idx=0    
 64 |     self.use_rotation=use_rotation
 65 |   
 66 |   def __call__(self):###Be careful.  The appended data may change like pointer.
 67 |     templabel=[] 
 68 |     batch_data=[]
 69 |     for j in range(self.batch_size_):
 70 |       self.idx +=1
 71 |       if self.idx == train_no:
 72 |         self.idx =0
 73 |         np.random.shuffle(shufflevideolist_train)
 74 |       shufflevideoindex=shufflevideolist_train[self.idx]
 75 |       
 76 |       
 77 |       label=label_handle[shufflevideoindex]     
 78 |       templabel.append(np.int32(label))  
 79 |       dataset=data_handle[shufflevideoindex]
 80 |       len_data=len_handle[shufflevideoindex]   
 81 |       
 82 |       sample=np.zeros(tuple((self.seq_len,)+data_handle[shufflevideoindex].shape[1:]))
 83 |       lenperseg=len_data//self.seq_len
 84 |       if lenperseg==1 and len_data>self.seq_len:
 85 |         startid=np.random.randint(len_data-self.seq_len)
 86 |         sample=dataset[startid:startid+self.seq_len]
 87 |         #print('wrong data length first')
 88 |       elif len_data<=self.seq_len:
 89 |         startid=np.random.randint(max(self.seq_len-len_data,int(0.25*self.seq_len)))
 90 |         endid=min(self.seq_len,startid+len_data)
 91 |         datasid=0
 92 |         dataeid=len_data
 93 |         if startid+len_data>self.seq_len:
 94 |           datasid=np.random.randint(startid+len_data-self.seq_len)
 95 |           dataeid=datasid+self.seq_len-startid
 96 |         sample[startid:endid]=dataset[datasid:dataeid]
 97 |       else:      
 98 |         for framei in range(self.seq_len):        
 99 |           if framei==self.seq_len-1:
100 |             index=lenperseg*framei + np.random.randint(len_data-lenperseg*(self.seq_len-1))
101 |           else:
102 |             index=lenperseg*framei + np.random.randint(lenperseg)    
103 |           sample[framei]=dataset[index]
104 |           
105 |       #print(sample)
106 |       if self.use_rotation:
107 |         if np.random.randint(2):
108 |           s=np.random.randint(2)*45#random(1)*45
109 |           b=np.random.randint(2)*45#random(1)*45
110 |           #print(sample.shape)
111 |           sample=rotate(sample,s,b)
112 |         #print (index,lenperseg)  
113 | #       rframei=np.random.randint(len_data)  
114 | #       tmean=(dataset[rframei,0,:]+dataset[rframei,12,:]+dataset[rframei,16,:])/3
115 | #       sample=sample-tmean  
116 |       batch_data.append(sample) ###Be careful. It has to be different. Otherwise, the appended data will change as well.
117 |       #print(batch_data)       
118 |       
119 |     self.result['data']=np.asarray(batch_data,dtype=np.float32)
120 |     self.result['label']= np.asarray(templabel,dtype=np.int32)   
121 | 
class DataHandler_train(object):
  """Training-batch provider that prefetches the next batch on a worker thread."""

  def __init__(self, batch_size, seq_len, use_rotation=False):#datasets,
    self.batch_size_ = batch_size
    # Seeds Python's `random` module only.
    random.seed(10)

    self.thread_result = {}
    self.thread = None
    self.batch_advancer = batch_thread_train(
        self.thread_result, self.batch_size_, seq_len, use_rotation)

    # Build the first batch synchronously so thread_result is populated.
    self.dispatch_worker()
    self.join_worker()

  def GetBatch(self):
    """Return ``(data, label)`` for the prefetched batch and start the next one."""
    if self.thread is not None:
      self.join_worker()

    prefetched = self.thread_result
    self.batch_data_ = prefetched['data']
    self.batch_label_ = prefetched['label']

    self.dispatch_worker()
    return self.batch_data_, self.batch_label_

  def dispatch_worker(self):
    """Start a worker thread running the batch builder; none may be pending."""
    assert self.thread is None
    worker = Thread(target=self.batch_advancer)
    self.thread = worker
    worker.start()

  def join_worker(self):
    """Wait for the pending worker thread and clear the handle."""
    pending = self.thread
    assert pending is not None
    pending.join()
    self.thread = None

  def GetDatasetSize(self):
    """Return the number of training entries (``train_no``)."""
    return train_no
161 | 
162 | 
163 | 
164 | 
165 | 
166 | 
167 | 
168 | 
169 | 
class batch_thread_eval():
  """Callable that assembles one held-out batch of fixed-length clips.

  Reads the module-level ``data_handle`` / ``label_handle`` / ``len_handle``
  arrays through ``shufflevideolist_test`` and writes ``result['data']``
  (float32) and ``result['label']`` (int32). Frame selection is randomised in
  the same way as for training, without the rotation step.
  """

  def __init__(self, result, batch_size_,seq_len):
    self.result = result
    self.batch_size_ = batch_size_
    self.seq_len = seq_len
    self.idx = 0

  def __call__(self):
    labels = []
    clips = []
    steps = self.seq_len
    for _ in range(self.batch_size_):
      # The cursor is advanced before it is used; on reaching test_no it
      # restarts at 0 with a freshly shuffled order.
      self.idx += 1
      if self.idx == test_no:
        self.idx = 0
        np.random.shuffle(shufflevideolist_test)
      video = shufflevideolist_test[self.idx]

      labels.append(np.int32(label_handle[video]))
      frames = data_handle[video]
      n_frames = len_handle[video]

      clip = np.zeros(tuple((steps,) + data_handle[video].shape[1:]))
      per_segment = n_frames // steps
      if n_frames <= steps:
        # Short sequence: place it at a random offset inside a zero clip,
        # trimming a random window if it would run past the end.
        offset = np.random.randint(max(steps - n_frames, int(0.25 * steps)))
        stop = min(steps, offset + n_frames)
        src_start = 0
        src_stop = n_frames
        if offset + n_frames > steps:
          src_start = np.random.randint(offset + n_frames - steps)
          src_stop = src_start + steps - offset
        clip[offset:stop] = frames[src_start:src_stop]
      elif per_segment == 1:
        # Slightly longer than the clip: take a random contiguous window.
        first = np.random.randint(n_frames - steps)
        clip = frames[first:first + steps]
      else:
        # Long sequence: draw one random frame from each of `steps` segments;
        # the last segment also absorbs the remainder frames.
        for step in range(steps):
          if step == steps - 1:
            span = n_frames - per_segment * (steps - 1)
          else:
            span = per_segment
          clip[step] = frames[per_segment * step + np.random.randint(span)]

      # Each appended clip must be its own object, not a shared buffer.
      clips.append(clip)

    self.result['data'] = np.asarray(clips, dtype=np.float32)
    self.result['label'] = np.asarray(labels, dtype=np.int32)
221 | 
class DataHandler_eval(object):
  """Held-out batch provider that prefetches the next batch on a worker thread."""

  def __init__(self, batch_size, seq_len):#, datasets
    self.batch_size_ = batch_size
    # Seeds Python's `random` module only.
    random.seed(10)

    self.thread_result = {}
    self.thread = None
    self.batch_advancer = batch_thread_eval(
        self.thread_result, self.batch_size_, seq_len)

    # Build the first batch synchronously so thread_result is populated.
    self.dispatch_worker()
    self.join_worker()

  def GetBatch(self):
    """Return ``(data, label)`` for the prefetched batch and start the next one."""
    if self.thread is not None:
      self.join_worker()

    prefetched = self.thread_result
    self.batch_data_ = prefetched['data']
    self.batch_label_ = prefetched['label']

    self.dispatch_worker()
    return self.batch_data_, self.batch_label_

  def dispatch_worker(self):
    """Start a worker thread running the batch builder; none may be pending."""
    assert self.thread is None
    worker = Thread(target=self.batch_advancer)
    self.thread = worker
    worker.start()

  def join_worker(self):
    """Wait for the pending worker thread and clear the handle."""
    pending = self.thread
    assert pending is not None
    pending.join()
    self.thread = None

  def GetDatasetSize(self):
    """Return the number of held-out entries (``test_no``)."""
    return test_no
261 | 
262 | 
263 | 
def main():
  """Smoke test: build both handlers, print their sizes, pull two train batches."""
  train_handler = DataHandler_train(1, 30,True)
  print (train_handler.GetDatasetSize())
  eval_handler = DataHandler_eval(10, 30)
  print (eval_handler.GetDatasetSize())

  # Two consecutive fetches exercise the prefetch hand-over.
  for _ in range(2):
    x,y = train_handler.GetBatch()


# NOTE(review): this module imports train_datasets from __main__ at the top,
# so running it directly presumably fails unless that name is defined here --
# confirm.
if __name__ == '__main__':
  main()
286 | 
287 | 


--------------------------------------------------------------------------------
/cPTB/penntree_charlevel_rernn.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import sys
  3 | import argparse
  4 | import os
  5 | 
  6 | import time
  7 | 
  8 | import lasagne
  9 | import theano
 10 | import numpy as np
 11 | import theano.tensor as T
 12 | 
 13 | from lasagne.layers import InputLayer,ReshapeLayer,DimshuffleLayer,Gate, DenseLayer
 14 | from lasagne.layers import ConcatLayer,NonlinearityLayer,DropoutLayer, SliceLayer,ElemwiseSumLayer
 15 | from lasagne.nonlinearities import softmax, rectify,tanh,very_leaky_rectify
 16 | from lasagne.init import Uniform,Normal,HeNormal
 17 | 
 18 | from IndRNN_onlyrecurrent import IndRNNLayer_onlyrecurrent as indrnn_onlyrecurrent
 19 | from BatchNorm_step_timefirst import BatchNorm_step_timefirst_Layer
 20 | 
 21 | 
 22 | np.set_printoptions(threshold=3000,edgeitems=50)
 23 | sys.setrecursionlimit(50000)
 24 | parser = argparse.ArgumentParser(description='IndRNN for the char level PennTreeBank Language Model')
 25 | parser.add_argument('--hidden_units', type=int, default=2000)
 26 | parser.add_argument('--batch_size', type=int, default=128,help='batch_size')
 27 | parser.add_argument('--seq_len', type=int, default=50,help='seq_len')
 28 | parser.add_argument('--num_layers', type=int, default=6,help='num_layers')
 29 | parser.add_argument('--lr', type=np.float32, default=2e-4, help='lr')
 30 | parser.add_argument('--act', type=str, default='relu', help='act')
 31 | parser.add_argument('--data_aug', action='store_true', default=False)
 32 | parser.add_argument('--gradclipvalue', type=np.float32, default=10,  help='gradclipvalue')
 33 | parser.add_argument('--MAG', type=int, default=2)
 34 | parser.add_argument('--fix_bound', action='store_true', default=False)
 35 | 
 36 | #bn
 37 | parser.add_argument('--use_bn_afterrnn', action='store_true', default=False)
 38 | 
 39 | #drop
 40 | parser.add_argument('--use_dropout', action='store_true', default=False)
 41 | parser.add_argument('--droprate', type=np.float32, default=0.3, help='lr')
 42 | parser.add_argument('--droplayers', type=int, default=1,help='droplayers')
 43 | 
 44 | #residual
 45 | parser.add_argument('--use_residual', action='store_true', default=False)
 46 | parser.add_argument('--residual_layers', type=int, default=2)
 47 | parser.add_argument('--residual_block', type=int, default=3)
 48 | parser.add_argument('--unit_factor', type=np.float32, default=1, help='lr')
 49 | 
 50 | #weight decay
 51 | parser.add_argument('--use_weightdecay_nohiddenW', action='store_true', default=False)
 52 | parser.add_argument('--decayfactor', type=np.float32, default=1e-4, help='decayfactor')
 53 | 
 54 | #initialization
 55 | parser.add_argument('--pThre', type=int, default=20)
 56 | parser.add_argument('--ini_in2hid', type=np.float32, default=0.005, help='ini_in2hid')
 57 | parser.add_argument('--ini_b', type=np.float32, default=0.0, help='ini_in2hid')
 58 | 
 59 | args = parser.parse_args()
 60 | print (args)
 61 | 
 62 | 
 63 | num_layers=args.num_layers
 64 | droplayers=args.droplayers
 65 | outputclass=50
 66 | batch_size = args.batch_size
 67 | seq_len=args.seq_len
 68 | hidden_units=args.hidden_units
 69 | use_dropout=args.use_dropout
 70 | lr=np.float32(args.lr)
 71 | droprate=np.float32(args.droprate)
 72 | opti=lasagne.updates.adam  
 73 | 
 74 | rnnmodel=indrnn_onlyrecurrent
 75 | act=rectify  
 76 | if args.act=='tanh':
 77 |   act=tanh  
 78 | 
 79 | 
 80 | 
 81 | from reader import data_iterator, ptb_raw_data
 82 | name_dataset='ptb.char.'
 83 | def get_raw_data(dataset='ptb',data_path='data/'):
 84 |   raw_data = ptb_raw_data(data_path,filename=name_dataset)
 85 |   return raw_data
 86 | train_data, valid_data, test_data, _ = get_raw_data('ptb')
 87 | epoch_size =((len(train_data) // batch_size) - 1) // seq_len
 88 | 
 89 | 
 90 | seq_len1=len(train_data)
 91 | U_bound=pow(args.MAG, 1.0 / seq_len1)
 92 | if args.act=='tanh':
 93 |   U_bound=pow(args.MAG/(pow(0.9,seq_len1/10.0)), 1.0 / seq_len1)
 94 | if args.fix_bound:
 95 |   U_bound=1.0
 96 | #Because the last state of one batch is used as the initial state of the next batch, the total length is used here.
 97 | # This bound can simply set to 1. (1) the sequence is too long and they are already very close to 1. 
 98 | #(2) Due to the precision of GPU, if it is rounded to a larger value, it may explode.
 99 | 
100 | taxdrop= (0,) 
101 | 
102 | ini_W=HeNormal(gain=np.sqrt(2)/2.0)
103 | if args.use_bn_afterrnn:
104 |   ini_W=Normal(args.ini_in2hid)
105 |   
106 | units=[]
107 | acc_units=[]
108 | acc_units.append(0)
109 | sum_units=0
110 | if args.unit_factor!=1 and num_layers%(args.residual_block*args.residual_layers)!=args.start_residual:
111 |   print ('layers should be layers = args.residual_block*args.residual_layers +1')
112 |   assert 2==3
113 | for l in range(num_layers):
114 |   units_inc_factor=1
115 |   if l>=1:
116 |     units_inc_factor=np.power(args.unit_factor, (l-1)//(args.residual_block*args.residual_layers))
117 |   units.append(np.int(hidden_units*units_inc_factor))
118 |   sum_units+=np.int(hidden_units*units_inc_factor)
119 |   acc_units.append(sum_units)
120 |   
121 | #print(units,acc_units)  
def build_rnn_network(rnnmodel,X_sym,hid_init_sym):
    """Assemble the stacked recurrent language model as a dict of Lasagne layers.

    The network is: embedding -> ``num_layers`` x (dense, optional residual
    sum, optional batch norm, recurrent layer, optional dropout, optional
    batch norm) -> softmax classifier. Activations are kept time-first,
    i.e. (seq_len, batch_size, features), between the embedding and the
    classifier.

    Besides its arguments the function reads these module-level names:
    ``batch_size``, ``seq_len``, ``outputclass``, ``num_layers``, ``units``,
    ``acc_units``, ``ini_W``, ``act``, ``U_bound``, ``use_dropout``,
    ``droplayers``, ``droprate``, ``taxdrop`` and ``args``.

    Args:
      rnnmodel: recurrent layer constructor, called as
        ``rnnmodel(incoming, num_units, hid_init=..., W_hid_to_hid=...,
        nonlinearity=..., only_return_final=False, grad_clipping=...)``.
      X_sym: symbolic input bound to an ``InputLayer`` of shape
        (batch_size, seq_len); fed to an embedding layer.
      hid_init_sym: symbolic matrix holding the initial hidden state of all
        layers side by side; layer ``l`` uses columns
        ``acc_units[l-1]:acc_units[l]``.

    Returns:
      dict mapping layer names to layers. ``net['out']`` is the softmax
      output layer and ``net['hid_out']`` concatenates (axis 1) the last
      time step of every recurrent layer.
    """
    net = {}    
    
    # Token ids -> embedding vectors of size units[0], then switch to time-first.
    net['input0'] = InputLayer((batch_size, seq_len),X_sym)        
    net['input']=lasagne.layers.EmbeddingLayer(net['input0'],outputclass,units[0])#,W=lasagne.init.Uniform(inial_scale)      
    net['rnn0']=DimshuffleLayer(net['input'],(1,0,2)) #change to (time, batch_size,hidden_units)    
      
    for l in range(1, num_layers+1):
      # Initial hidden state of layer l: its own column slice of hid_init_sym.
      net['hiddeninput%d'%l] = InputLayer((batch_size, units[l-1]),hid_init_sym[:,acc_units[l-1]:acc_units[l]])               
      # Input-to-hidden projection: flatten time and batch, apply a dense
      # layer, restore the 3-D shape. The key 'rnn%d' % (l-1) is overwritten
      # at each of these three steps.
      net['rnn%d'%(l-1)]=ReshapeLayer(net['rnn%d'%(l-1)], (batch_size* seq_len, -1))          
      net['rnn%d'%(l-1)]=DenseLayer(net['rnn%d'%(l-1)],units[l-1],W=ini_W,b=lasagne.init.Constant(args.ini_b),nonlinearity=None)  #W=Uniform(ini_rernn_in_to_hid),         #
      net['rnn%d'%(l-1)]=ReshapeLayer(net['rnn%d'%(l-1)], (seq_len, batch_size,  -1))  

      # Residual shortcut: every args.residual_layers layers (after the first
      # group) the projection is summed with the 'sum' entry stored
      # args.residual_layers layers earlier.
      if args.use_residual and l>args.residual_layers and (l-1)%args.residual_layers==0:# and l!=num_layers
        if units[l - 1]!=units[l - 1 - args.residual_layers]:
          # Widths differ: project the shortcut with a dense layer followed by
          # batch normalisation so that both summands have units[l-1] features.
          net['leftbranch%d' % (l - 1)] = ReshapeLayer(net['sum%d'%(l-args.residual_layers)], (batch_size * seq_len, -1))
          net['leftbranch%d' % (l - 1)] = DenseLayer(net['leftbranch%d' % (l - 1)], units[l - 1], W=ini_W, nonlinearity=None)
          net['leftbranch%d' % (l - 1)] = ReshapeLayer(net['leftbranch%d' % (l - 1)], (seq_len, batch_size, -1))
          net['leftbranch%d' % (l - 1)] = BatchNorm_step_timefirst_Layer(net['leftbranch%d' % (l - 1)], axes=(0, 1))
          print('left branch')
        else:
          # Same width: identity shortcut.
          net['leftbranch%d' % (l - 1)] = net['sum%d'%(l-args.residual_layers)]
        net['sum%d'%l]=ElemwiseSumLayer((net['rnn%d'%(l-1)],net['leftbranch%d' % (l - 1)]))
      else:
        net['sum%d'%l]=net['rnn%d'%(l-1)]      
      
      net['rnn%d'%l]=net['sum%d'%l]
      # Batch norm is placed either before the recurrent layer (here) or after
      # it (end of the loop body), depending on --use_bn_afterrnn.
      if not args.use_bn_afterrnn:
        net['rnn%d'%l]=BatchNorm_step_timefirst_Layer(net['rnn%d'%l],axes= (0,1),beta=lasagne.init.Constant(args.ini_b))    
               
      # Recurrent weights are drawn uniformly from [0, U_bound], or from
      # [-U_bound, U_bound] when the activation is tanh.
      ini_hid_start=0
      if act==tanh:
        ini_hid_start=-1*U_bound
      net['rnn%d'%l]=rnnmodel(net['rnn%d'%l],units[l-1],hid_init=net['hiddeninput%d'%l],W_hid_to_hid=Uniform(range=(ini_hid_start,U_bound)),nonlinearity=act,only_return_final=False, grad_clipping=args.gradclipvalue)      
                
      # Keep the last time step (axis 0 is time) of this layer and append it
      # to the concatenated hidden state of all layers. It is taken before
      # dropout / post-RNN batch norm are applied.
      net['last_state%d'%l]=SliceLayer(net['rnn%d'%l],-1, axis=0)
      if l==1:
        net['hid_out']=net['last_state%d'%l]
      else:
        net['hid_out']=ConcatLayer([net['hid_out'], net['last_state%d'%l]],axis=1)     
        
      # Dropout after every droplayers-th recurrent layer.
      if use_dropout and l%droplayers==0:
        net['rnn%d'%l]=lasagne.layers.DropoutLayer(net['rnn%d'%l], p=droprate, shared_axes=taxdrop)                                                              

      if args.use_bn_afterrnn:
        net['rnn%d'%l]=BatchNorm_step_timefirst_Layer(net['rnn%d'%l],axes= (0,1))                                                 
        
    # Back to batch-first, flatten to one row per (sequence, time step) pair
    # and classify each row with a softmax over outputclass classes.
    net['rnn%d'%num_layers]=DimshuffleLayer(net['rnn%d'%num_layers],(1,0,2))   
    net['reshape_rnn']=ReshapeLayer(net['rnn%d'%num_layers],(-1,units[num_layers-1]))        
    net['out']=DenseLayer(net['reshape_rnn'],outputclass,nonlinearity=softmax)#lasagne.init.HeNormal(gain='relu'))#,W=Uniform(inial_scale)
    return net
173 |   
174 | 
# Symbolic inputs: token ids, target ids and the initial hidden state that is
# carried over from the previous batch.
X_sym = T.imatrix('inputs')
y_sym = T.imatrix('label')
hid_init_sym = T.matrix()

learn_net = build_rnn_network(rnnmodel, X_sym, hid_init_sym)
print(lasagne.layers.count_params(learn_net['out'], trainable=True))

# Targets flattened to one entry per prediction row.
y_sym0 = y_sym.reshape((-1,))


def _mean_cross_entropy(probabilities):
  """Mean categorical cross-entropy of ``probabilities`` against ``y_sym0``."""
  return lasagne.objectives.categorical_crossentropy(probabilities, y_sym0).mean()


# --- training graph (stochastic: dropout active, batch statistics) ---------
prediction, hid_rec_init = lasagne.layers.get_output(
    [learn_net['out'], learn_net['hid_out']], deterministic=False)
loss = _mean_cross_entropy(prediction)
perp = T.exp(loss)
bpc = loss / np.log(2.0)

# Optional L2 penalty on every regularizable parameter except the recurrent
# (hidden-to-hidden) weights.
cost = loss
if args.use_weightdecay_nohiddenW:
  for para in lasagne.layers.get_all_params(learn_net['out'], regularizable=True):
    if para.name == 'hidden_to_hidden.W':
      continue
    cost += args.decayfactor * lasagne.regularization.apply_penalty(
        para, lasagne.regularization.l2)

params = lasagne.layers.get_all_params(learn_net['out'], trainable=True)

learning_ratetrain = T.scalar(name='learning_ratetrain', dtype=theano.config.floatX)

grads = theano.grad(cost, params)
updates = opti(grads, params, learning_rate=learning_ratetrain)
print('Compiling')
train_fn = theano.function(
    [X_sym, y_sym, hid_init_sym, learning_ratetrain],
    [perp, bpc, hid_rec_init],
    updates=updates)

# --- evaluation graph 1: deterministic, batch norm keeps using the statistics
# of the current batch (batch_norm_use_averages=False) -----------------------
test_prediction, test_hid_rec_init = lasagne.layers.get_output(
    [learn_net['out'], learn_net['hid_out']],
    deterministic=True, batch_norm_use_averages=False)
test_loss = _mean_cross_entropy(test_prediction)
test_perp = T.exp(test_loss)
test_bpc = test_loss / np.log(2.0)
test_fn = theano.function(
    [X_sym, y_sym, hid_init_sym],
    [test_perp, test_bpc, test_hid_rec_init])

# --- evaluation graph 2: fully deterministic (default batch norm behaviour
# for deterministic=True) ----------------------------------------------------
bn_test_prediction, bn_test_hid_rec_init = lasagne.layers.get_output(
    [learn_net['out'], learn_net['hid_out']], deterministic=True)
bn_test_loss = _mean_cross_entropy(bn_test_prediction)
bn_test_perp = T.exp(bn_test_loss)
bn_test_bpc = bn_test_loss / np.log(2.0)
bn_test_fn = theano.function(
    [X_sym, y_sym, hid_init_sym],
    [bn_test_perp, bn_test_bpc, bn_test_hid_rec_init])
224 | 
225 | 
226 | 
227 | 
learning_rate = np.float32(lr)

lastbpc = 100            # best validation bits-per-character seen so far
patience = 0             # epochs since the last validation improvement
patienceThre = args.pThre
best_para = None         # parameter values at the best validation score

# The recurrent weights are clipped to [-U_bound, U_bound] before every
# update; the list of affected shared variables never changes, so it is
# collected once instead of being searched for on every batch.
if rnnmodel == indrnn_onlyrecurrent:
  recurrent_weights = [p for p in params if p.name == 'hidden_to_hidden.W']
else:
  recurrent_weights = []


def _zero_hidden_state():
  """Return an all-zero initial hidden state covering every recurrent layer."""
  return np.zeros((batch_size, sum_units), dtype='float32')


def _evaluate(eval_fn, data):
  """Average perplexity and bpc of ``eval_fn`` over all batches of ``data``.

  The hidden state returned for one batch is fed in as the initial state of
  the next one, starting from zeros. Returns ``(mean_perplexity, mean_bpc)``.
  """
  state = _zero_hidden_state()
  total_perp = 0
  total_bpc = 0
  n_batches = 0
  for x, y in data_iterator(data, batch_size, seq_len):
    batch_perp, batch_bpc, state = eval_fn(x, y, state)
    total_perp += batch_perp
    total_bpc += batch_bpc
    n_batches += 1
  return total_perp / n_batches, total_bpc / n_batches


for epoci in range(1, 10000):
  # ---- one pass over the training data -----------------------------------
  hid_init = _zero_hidden_state()
  dropindex = 0
  if args.data_aug:
    # Skip a random number of leading characters so that batch boundaries
    # differ from epoch to epoch.
    dropindex = np.random.randint(seq_len * 5)
  t_prep = 0
  t_bpc = 0
  count = 0
  for x, y in data_iterator(train_data[dropindex:], batch_size, seq_len):
    for weight in recurrent_weights:
      weight.set_value(np.clip(weight.get_value(), -1 * U_bound, U_bound))
    perp, bpc, hid_init = train_fn(x, y, hid_init, learning_rate)

    # Raise instead of assert so that the check survives ``python -O``.
    if np.isnan(perp):
      raise FloatingPointError('NaN detected in cost')
    if np.isinf(perp):
      raise FloatingPointError('INF detected in cost')
    t_prep += perp
    t_bpc += bpc
    count += 1
  print ('prep','bpc',t_prep/count, t_bpc/count)

  # ---- evaluation ----------------------------------------------------------
  valid_perp_avg, validbpc = _evaluate(bn_test_fn, valid_data)
  print ('bn_validprep','bn_validbpc',valid_perp_avg, validbpc )

  test_perp_avg, test_bpc_avg = _evaluate(test_fn, test_data)
  print ('testprep','testbpc',test_perp_avg, test_bpc_avg )

  bn_test_perp_avg, bn_test_bpc_avg = _evaluate(bn_test_fn, test_data)
  print ('bn_testprep','bn_testbpc',bn_test_perp_avg, bn_test_bpc_avg )

  # ---- checkpointing / learning-rate schedule -----------------------------
  if validbpc < lastbpc:
    # get_all_param_values already returns fresh copies (a list of arrays of
    # different shapes); wrapping that ragged list in np.copy is unnecessary
    # and is rejected by recent NumPy releases.
    best_para = lasagne.layers.get_all_param_values(learn_net['out'])
    lastbpc = validbpc
    patience = 0
  elif patience > patienceThre:
    # No improvement for too long: shrink the learning rate and restart from
    # the best parameters seen so far.
    learning_rate = np.float32(learning_rate * 0.2)
    print ('learning rate',learning_rate)
    if best_para is not None:
      lasagne.layers.set_all_param_values(learn_net['out'], best_para)
    patience = 0
    if learning_rate < 1e-6:
      break
  else:
    patience += 1

# Store the parameter values the network holds when training stops.
save_name = 'indrnn_cPTB' + str(seq_len)
np.savez(save_name, *lasagne.layers.get_all_param_values(learn_net['out']))
320 | 


--------------------------------------------------------------------------------
/IndRNN_onlyrecurrent.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN (only the recurrent part). The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Since this only contains the recurrent part of IndRNN, fully connected layers or convolutional layers are needed before it.
  6 | 
  7 | Please cite the following paper if you find it useful.
  8 | 
  9 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
 10 | @article{li2018independently,
 11 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 12 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 13 |   booktitle={CVPR2018},
 14 |   year={2018}
 15 | }
 16 | """
 17 | 
 18 | import numpy as np
 19 | import theano
 20 | import theano.tensor as T
 21 | import lasagne.nonlinearities as nonlinearities
 22 | import lasagne.init as init
 23 | from lasagne.utils import unroll_scan
 24 | 
 25 | from lasagne.layers import MergeLayer, Layer
 26 | from lasagne.layers import InputLayer
 27 | from lasagne.layers import DenseLayer
 28 | from lasagne.layers import helper
 29 | import lasagne
 30 | 
# Names exported by a wildcard import (``from <this module> import *``).
__all__ = [
    "onlyRecurrentLayer",
    "MulLayer",
    "IndRNNLayer_onlyrecurrent"
]
 36 | 
 37 | 
 38 | 
 39 | class MulLayer(lasagne.layers.Layer):
 40 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 41 |         super(MulLayer, self).__init__(incoming, **kwargs)
 42 |         num_inputs = self.input_shape[1]
 43 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 44 | 
 45 |     def get_output_for(self, input, **kwargs):
 46 |         return input * self.W
 47 | 
 48 |     def get_output_shape_for(self, input_shape):
 49 |         return input_shape#(input_shape[0], self.num_units)
 50 | 
 51 | 
 52 | 
 53 | 
 54 | class onlyRecurrentLayer(MergeLayer):
 55 |     """
 56 |     This is slightly different from the CustomRecurrentLayer of Lasagne by removing the computation of input.
 57 |     """
 58 |     def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
 59 |                  nonlinearity=nonlinearities.rectify,
 60 |                  hid_init=init.Constant(0.),
 61 |                  backwards=False,
 62 |                  learn_init=False,
 63 |                  gradient_steps=-1,
 64 |                  grad_clipping=0,
 65 |                  unroll_scan=False,
 66 |                  precompute_input=True,
 67 |                  mask_input=None,
 68 |                  only_return_final=False,
 69 |                  **kwargs):
 70 | 
 71 |         # This layer inherits from a MergeLayer, because it can have three
 72 |         # inputs - the layer input, the mask and the initial hidden state.  We
 73 |         # will just provide the layer input as incomings, unless a mask input
 74 |         # or initial hidden state was provided.
 75 |         incomings = [incoming]
 76 |         self.mask_incoming_index = -1
 77 |         self.hid_init_incoming_index = -1
 78 |         if mask_input is not None:
 79 |             incomings.append(mask_input)
 80 |             self.mask_incoming_index = len(incomings)-1
 81 |         if isinstance(hid_init, Layer):
 82 |             incomings.append(hid_init)
 83 |             self.hid_init_incoming_index = len(incomings)-1
 84 | 
 85 |         super(onlyRecurrentLayer, self).__init__(incomings, **kwargs)
 86 | 
 87 |         input_to_hidden_in_layers = \
 88 |             [layer for layer in helper.get_all_layers(input_to_hidden)
 89 |              if isinstance(layer, InputLayer)]
 90 |         if len(input_to_hidden_in_layers) != 1:
 91 |             raise ValueError(
 92 |                 '`input_to_hidden` must have exactly one InputLayer, but it '
 93 |                 'has {}'.format(len(input_to_hidden_in_layers)))
 94 | 
 95 |         hidden_to_hidden_in_lyrs = \
 96 |             [layer for layer in helper.get_all_layers(hidden_to_hidden)
 97 |              if isinstance(layer, InputLayer)]
 98 |         if len(hidden_to_hidden_in_lyrs) != 1:
 99 |             raise ValueError(
100 |                 '`hidden_to_hidden` must have exactly one InputLayer, but it '
101 |                 'has {}'.format(len(hidden_to_hidden_in_lyrs)))
102 |         hidden_to_hidden_in_layer = hidden_to_hidden_in_lyrs[0]
103 | 
104 |         self.input_to_hidden = input_to_hidden
105 |         self.hidden_to_hidden = hidden_to_hidden
106 |         self.learn_init = learn_init
107 |         self.backwards = backwards
108 |         self.gradient_steps = gradient_steps
109 |         self.grad_clipping = grad_clipping
110 |         self.unroll_scan = unroll_scan
111 |         self.precompute_input = precompute_input
112 |         self.only_return_final = only_return_final
113 |         
114 | 
115 |         if unroll_scan and gradient_steps != -1:
116 |             raise ValueError(
117 |                 "Gradient steps must be -1 when unroll_scan is true.")
118 | 
119 |         # Retrieve the dimensionality of the incoming layer
120 |         input_shape = self.input_shapes[0]
121 | 
122 |         if nonlinearity is None:
123 |             self.nonlinearity = nonlinearities.identity
124 |         else:
125 |             self.nonlinearity = nonlinearity
126 | 
127 |         # Initialize hidden state
128 |         if isinstance(hid_init, Layer):
129 |             self.hid_init = hid_init
130 |         else:
131 |             self.hid_init = self.add_param(
132 |                 hid_init, (1,) + hidden_to_hidden.output_shape[1:],
133 |                 name="hid_init", trainable=learn_init, regularizable=False)
134 | 
135 |     def get_params(self, **tags):
136 |         # Get all parameters from this layer, the master layer
137 |         params = super(onlyRecurrentLayer, self).get_params(**tags)
138 |         # Combine with all parameters from the child layers
139 |         params += helper.get_all_params(self.input_to_hidden, **tags)
140 |         params += helper.get_all_params(self.hidden_to_hidden, **tags)
141 |         return params
142 | 
143 |     def get_output_shape_for(self, input_shapes):
144 |         # The shape of the input to this layer will be the first element
145 |         # of input_shapes, whether or not a mask input is being used.
146 |         input_shape = input_shapes[0]
147 |         # When only_return_final is true, the second (sequence step) dimension
148 |         # will be flattened
149 |         if self.only_return_final:
150 |             return (input_shape[0],) + self.hidden_to_hidden.output_shape[1:]
151 |         # Otherwise, the shape will be (n_batch, n_steps, trailing_dims...)
152 |         else:
153 |             return ((input_shape[0], input_shape[1]) +
154 |                     self.hidden_to_hidden.output_shape[1:])
155 | 
156 |     def get_output_for(self, inputs, **kwargs):
157 |         # Retrieve the layer input
158 |         input = inputs[0]
159 |         # Retrieve the mask when it is supplied
160 |         mask = None
161 |         hid_init = None
162 |         if self.mask_incoming_index > 0:
163 |             mask = inputs[self.mask_incoming_index]
164 |         if self.hid_init_incoming_index > 0:
165 |             hid_init = inputs[self.hid_init_incoming_index]
166 | 
167 |         # Input should be provided as (n_batch, n_time_steps, n_features)
168 |         # but scan requires the iterable dimension to be first
169 |         # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
170 |         #input = input.dimshuffle(1, 0, *range(2, input.ndim))
171 |         seq_len, num_batch = input.shape[0], input.shape[1]
172 | 
173 |         # We will always pass the hidden-to-hidden layer params to step
174 |         non_seqs = helper.get_all_params(self.hidden_to_hidden)
175 | 
176 |         # Create single recurrent computation step function
177 |         def step(input_n, hid_previous, *args):
178 |             # Compute the hidden-to-hidden activation
179 |             hid_pre = helper.get_output(
180 |                 self.hidden_to_hidden, hid_previous, **kwargs)
181 | 
182 |             hid_pre += input_n
183 | 
184 |             # Clip gradients
185 |             if self.grad_clipping:
186 |                 hid_pre = theano.gradient.grad_clip(
187 |                     hid_pre, -self.grad_clipping, self.grad_clipping)
188 | 
189 |             return self.nonlinearity(hid_pre)
190 | 
191 |         def step_masked(input_n, mask_n, hid_previous, *args):
192 |             # Skip over any input with mask 0 by copying the previous
193 |             # hidden state; proceed normally for any input with mask 1.
194 |             hid = step(input_n, hid_previous, *args)
195 |             hid_out = T.switch(mask_n, hid, hid_previous)
196 |             return [hid_out]
197 | 
198 |         if mask is not None:
199 |             mask = mask.dimshuffle(1, 0, 'x')
200 |             sequences = [input, mask]
201 |             step_fun = step_masked
202 |         else:
203 |             sequences = input
204 |             step_fun = step
205 | 
206 |         if not isinstance(self.hid_init, Layer):
207 |             # The code below simply repeats self.hid_init num_batch times in
208 |             # its first dimension.  Turns out using a dot product and a
209 |             # dimshuffle is faster than T.repeat.
210 |             dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
211 |                         [0, self.hid_init.ndim - 1])
212 |             hid_init = T.dot(T.ones((num_batch, 1)),
213 |                              self.hid_init.dimshuffle(dot_dims))
214 | 
215 |         if self.unroll_scan:
216 |             # Retrieve the dimensionality of the incoming layer
217 |             input_shape = self.input_shapes[0]
218 |             # Explicitly unroll the recurrence instead of using scan
219 |             hid_out = unroll_scan(
220 |                 fn=step_fun,
221 |                 sequences=sequences,
222 |                 outputs_info=[hid_init],
223 |                 go_backwards=self.backwards,
224 |                 non_sequences=non_seqs,
225 |                 n_steps=input_shape[1])[0]
226 |         else:
227 |             # Scan op iterates over first dimension of input and repeatedly
228 |             # applies the step function
229 |             hid_out = theano.scan(
230 |                 fn=step_fun,
231 |                 sequences=sequences,
232 |                 go_backwards=self.backwards,
233 |                 outputs_info=[hid_init],
234 |                 non_sequences=non_seqs,
235 |                 truncate_gradient=self.gradient_steps,
236 |                 strict=True)[0]
237 | 
238 |         # When it is requested that we only return the final sequence step,
239 |         # we need to slice it out immediately after scan is applied
240 |         if self.only_return_final:
241 |             hid_out = hid_out[-1]
242 |         else:
243 |             # dimshuffle back to (n_batch, n_time_steps, n_features))
244 |             #hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))
245 | 
246 |             # if scan is backward reverse the output
247 |             if self.backwards:
248 |                 hid_out = hid_out[::-1,:]
249 | 
250 |         return hid_out
251 | 
252 | 
class IndRNNLayer_onlyrecurrent(onlyRecurrentLayer):
    """Recurrent part of an IndRNN layer.

    The hidden-to-hidden connection is a ``MulLayer`` over ``num_units``
    features, i.e. an element-wise product of the previous hidden state with
    the weight vector ``W_hid_to_hid``. The input is passed through unchanged
    (a bare ``InputLayer`` stands in for the input-to-hidden network), so any
    input transform has to be applied before this layer.

    All remaining arguments are forwarded to ``onlyRecurrentLayer``.
    """

    def __init__(self, incoming, num_units,
                 W_hid_to_hid=init.Uniform(),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # ``incoming`` is either a shape tuple or a layer.
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # A supplied layer name becomes the prefix of the child layer's name;
        # the child itself must not receive the 'name' keyword a second time.
        layer_kwargs = dict(kwargs)
        basename = ''
        if 'name' in kwargs:
            basename = layer_kwargs.pop('name') + '.'

        # Placeholder input-to-hidden network: the input is used as it is.
        in_to_hid = InputLayer(input_shape)

        # The hidden-to-hidden layer expects its inputs to have num_units
        # features because it recycles the previous hidden state.
        recurrent_input = InputLayer((None, num_units))
        hid_to_hid = MulLayer(recurrent_input,
                              W=W_hid_to_hid,
                              name=basename + 'hidden_to_hidden',
                              **layer_kwargs)

        # Make the child layer parameter intuitively accessible
        self.W_hid_to_hid = hid_to_hid.W

        # Delegate the recurrence itself to onlyRecurrentLayer.
        super(IndRNNLayer_onlyrecurrent, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
320 | 


--------------------------------------------------------------------------------
/cPTB/IndRNN_onlyrecurrent.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN (only the recurrent part). The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Since this only contains the recurrent part of IndRNN, fully connected layers or convolutional layers are needed before it.
  6 | 
  7 | Please cite the following paper if you find it useful.
  8 | 
  9 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
 10 | @article{li2018independently,
 11 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 12 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 13 |   booktitle={CVPR2018},
 14 |   year={2018}
 15 | }
 16 | """
 17 | 
 18 | import numpy as np
 19 | import theano
 20 | import theano.tensor as T
 21 | import lasagne.nonlinearities as nonlinearities
 22 | import lasagne.init as init
 23 | from lasagne.utils import unroll_scan
 24 | 
 25 | from lasagne.layers import MergeLayer, Layer
 26 | from lasagne.layers import InputLayer
 27 | from lasagne.layers import DenseLayer
 28 | from lasagne.layers import helper
 29 | import lasagne
 30 | 
# Names exported by a wildcard import (``from <this module> import *``).
__all__ = [
    "onlyRecurrentLayer",
    "MulLayer",
    "IndRNNLayer_onlyrecurrent"
]
 36 | 
 37 | 
 38 | 
 39 | class MulLayer(lasagne.layers.Layer):
 40 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 41 |         super(MulLayer, self).__init__(incoming, **kwargs)
 42 |         num_inputs = self.input_shape[1]
 43 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 44 | 
 45 |     def get_output_for(self, input, **kwargs):
 46 |         return input * self.W
 47 | 
 48 |     def get_output_shape_for(self, input_shape):
 49 |         return input_shape#(input_shape[0], self.num_units)
 50 | 
 51 | 
 52 | 
 53 | 
 54 | class onlyRecurrentLayer(MergeLayer):
 55 |     """
 56 |     This is slightly different from the CustomRecurrentLayer of Lasagne by removing the computation of input.
 57 |     """
 58 |     def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
 59 |                  nonlinearity=nonlinearities.rectify,
 60 |                  hid_init=init.Constant(0.),
 61 |                  backwards=False,
 62 |                  learn_init=False,
 63 |                  gradient_steps=-1,
 64 |                  grad_clipping=0,
 65 |                  unroll_scan=False,
 66 |                  precompute_input=True,
 67 |                  mask_input=None,
 68 |                  only_return_final=False,
 69 |                  **kwargs):
 70 | 
 71 |         # This layer inherits from a MergeLayer, because it can have three
 72 |         # inputs - the layer input, the mask and the initial hidden state.  We
 73 |         # will just provide the layer input as incomings, unless a mask input
 74 |         # or initial hidden state was provided.
 75 |         incomings = [incoming]
 76 |         self.mask_incoming_index = -1
 77 |         self.hid_init_incoming_index = -1
 78 |         if mask_input is not None:
 79 |             incomings.append(mask_input)
 80 |             self.mask_incoming_index = len(incomings)-1
 81 |         if isinstance(hid_init, Layer):
 82 |             incomings.append(hid_init)
 83 |             self.hid_init_incoming_index = len(incomings)-1
 84 | 
 85 |         super(onlyRecurrentLayer, self).__init__(incomings, **kwargs)
 86 | 
 87 |         input_to_hidden_in_layers = \
 88 |             [layer for layer in helper.get_all_layers(input_to_hidden)
 89 |              if isinstance(layer, InputLayer)]
 90 |         if len(input_to_hidden_in_layers) != 1:
 91 |             raise ValueError(
 92 |                 '`input_to_hidden` must have exactly one InputLayer, but it '
 93 |                 'has {}'.format(len(input_to_hidden_in_layers)))
 94 | 
 95 |         hidden_to_hidden_in_lyrs = \
 96 |             [layer for layer in helper.get_all_layers(hidden_to_hidden)
 97 |              if isinstance(layer, InputLayer)]
 98 |         if len(hidden_to_hidden_in_lyrs) != 1:
 99 |             raise ValueError(
100 |                 '`hidden_to_hidden` must have exactly one InputLayer, but it '
101 |                 'has {}'.format(len(hidden_to_hidden_in_lyrs)))
102 |         hidden_to_hidden_in_layer = hidden_to_hidden_in_lyrs[0]
103 | 
104 |         self.input_to_hidden = input_to_hidden
105 |         self.hidden_to_hidden = hidden_to_hidden
106 |         self.learn_init = learn_init
107 |         self.backwards = backwards
108 |         self.gradient_steps = gradient_steps
109 |         self.grad_clipping = grad_clipping
110 |         self.unroll_scan = unroll_scan
111 |         self.precompute_input = precompute_input
112 |         self.only_return_final = only_return_final
113 |         
114 | 
115 |         if unroll_scan and gradient_steps != -1:
116 |             raise ValueError(
117 |                 "Gradient steps must be -1 when unroll_scan is true.")
118 | 
119 |         # Retrieve the dimensionality of the incoming layer
120 |         input_shape = self.input_shapes[0]
121 | 
122 |         if nonlinearity is None:
123 |             self.nonlinearity = nonlinearities.identity
124 |         else:
125 |             self.nonlinearity = nonlinearity
126 | 
127 |         # Initialize hidden state
128 |         if isinstance(hid_init, Layer):
129 |             self.hid_init = hid_init
130 |         else:
131 |             self.hid_init = self.add_param(
132 |                 hid_init, (1,) + hidden_to_hidden.output_shape[1:],
133 |                 name="hid_init", trainable=learn_init, regularizable=False)
134 | 
135 |     def get_params(self, **tags):
136 |         # Get all parameters from this layer, the master layer
137 |         params = super(onlyRecurrentLayer, self).get_params(**tags)
138 |         # Combine with all parameters from the child layers
139 |         params += helper.get_all_params(self.input_to_hidden, **tags)
140 |         params += helper.get_all_params(self.hidden_to_hidden, **tags)
141 |         return params
142 | 
143 |     def get_output_shape_for(self, input_shapes):
144 |         # The shape of the input to this layer will be the first element
145 |         # of input_shapes, whether or not a mask input is being used.
146 |         input_shape = input_shapes[0]
147 |         # When only_return_final is true, the second (sequence step) dimension
148 |         # will be flattened
149 |         if self.only_return_final:
150 |             return (input_shape[0],) + self.hidden_to_hidden.output_shape[1:]
151 |         # Otherwise, the shape will be (n_batch, n_steps, trailing_dims...)
152 |         else:
153 |             return ((input_shape[0], input_shape[1]) +
154 |                     self.hidden_to_hidden.output_shape[1:])
155 | 
156 |     def get_output_for(self, inputs, **kwargs):
157 |         # Retrieve the layer input
158 |         input = inputs[0]
159 |         # Retrieve the mask when it is supplied
160 |         mask = None
161 |         hid_init = None
162 |         if self.mask_incoming_index > 0:
163 |             mask = inputs[self.mask_incoming_index]
164 |         if self.hid_init_incoming_index > 0:
165 |             hid_init = inputs[self.hid_init_incoming_index]
166 | 
167 |         # Input should be provided as (n_batch, n_time_steps, n_features)
168 |         # but scan requires the iterable dimension to be first
169 |         # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
170 |         #input = input.dimshuffle(1, 0, *range(2, input.ndim))
171 |         seq_len, num_batch = input.shape[0], input.shape[1]
172 | 
173 |         # We will always pass the hidden-to-hidden layer params to step
174 |         non_seqs = helper.get_all_params(self.hidden_to_hidden)
175 | 
176 |         # Create single recurrent computation step function
177 |         def step(input_n, hid_previous, *args):
178 |             # Compute the hidden-to-hidden activation
179 |             hid_pre = helper.get_output(
180 |                 self.hidden_to_hidden, hid_previous, **kwargs)
181 | 
182 |             hid_pre += input_n
183 | 
184 |             # Clip gradients
185 |             if self.grad_clipping:
186 |                 hid_pre = theano.gradient.grad_clip(
187 |                     hid_pre, -self.grad_clipping, self.grad_clipping)
188 | 
189 |             return self.nonlinearity(hid_pre)
190 | 
191 |         def step_masked(input_n, mask_n, hid_previous, *args):
192 |             # Skip over any input with mask 0 by copying the previous
193 |             # hidden state; proceed normally for any input with mask 1.
194 |             hid = step(input_n, hid_previous, *args)
195 |             hid_out = T.switch(mask_n, hid, hid_previous)
196 |             return [hid_out]
197 | 
198 |         if mask is not None:
199 |             mask = mask.dimshuffle(1, 0, 'x')
200 |             sequences = [input, mask]
201 |             step_fun = step_masked
202 |         else:
203 |             sequences = input
204 |             step_fun = step
205 | 
206 |         if not isinstance(self.hid_init, Layer):
207 |             # The code below simply repeats self.hid_init num_batch times in
208 |             # its first dimension.  Turns out using a dot product and a
209 |             # dimshuffle is faster than T.repeat.
210 |             dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
211 |                         [0, self.hid_init.ndim - 1])
212 |             hid_init = T.dot(T.ones((num_batch, 1)),
213 |                              self.hid_init.dimshuffle(dot_dims))
214 | 
215 |         if self.unroll_scan:
216 |             # Retrieve the dimensionality of the incoming layer
217 |             input_shape = self.input_shapes[0]
218 |             # Explicitly unroll the recurrence instead of using scan
219 |             hid_out = unroll_scan(
220 |                 fn=step_fun,
221 |                 sequences=sequences,
222 |                 outputs_info=[hid_init],
223 |                 go_backwards=self.backwards,
224 |                 non_sequences=non_seqs,
225 |                 n_steps=input_shape[1])[0]
226 |         else:
227 |             # Scan op iterates over first dimension of input and repeatedly
228 |             # applies the step function
229 |             hid_out = theano.scan(
230 |                 fn=step_fun,
231 |                 sequences=sequences,
232 |                 go_backwards=self.backwards,
233 |                 outputs_info=[hid_init],
234 |                 non_sequences=non_seqs,
235 |                 truncate_gradient=self.gradient_steps,
236 |                 strict=True)[0]
237 | 
238 |         # When it is requested that we only return the final sequence step,
239 |         # we need to slice it out immediately after scan is applied
240 |         if self.only_return_final:
241 |             hid_out = hid_out[-1]
242 |         else:
243 |             # dimshuffle back to (n_batch, n_time_steps, n_features))
244 |             #hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))
245 | 
246 |             # if scan is backward reverse the output
247 |             if self.backwards:
248 |                 hid_out = hid_out[::-1,:]
249 | 
250 |         return hid_out
251 | 
252 | 
class IndRNNLayer_onlyrecurrent(onlyRecurrentLayer):
    """
    Recurrent part of an IndRNN layer.

    Builds an ``onlyRecurrentLayer`` whose hidden-to-hidden connection is a
    ``MulLayer`` over ``num_units`` units and whose input-to-hidden network
    is a bare ``InputLayer``: no weight or bias is applied to the input
    here.

    Parameters
    ----------
    incoming : Layer or tuple
        The layer feeding into this layer, or its expected shape.
    num_units : int
        Number of hidden units; sets the shape of the recurrent weight.
    W_hid_to_hid : initializer
        Initializer for the ``MulLayer`` weight, exposed afterwards as
        ``self.W_hid_to_hid``.

    All remaining arguments are forwarded unchanged to
    ``onlyRecurrentLayer``.  Keyword arguments other than ``name`` are also
    passed to the ``MulLayer``, which is named ``<name>.hidden_to_hidden``
    when a name is given.
    """

    def __init__(self, incoming, num_units,
                 W_hid_to_hid=init.Uniform(),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # `incoming` may be given either as a shape tuple or as a layer.
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # The contained layer receives every keyword argument except 'name',
        # which is turned into a prefix for the child layer's own name.
        if 'name' not in kwargs:
            basename = ''
            layer_kwargs = kwargs
        else:
            basename = kwargs['name'] + '.'
            layer_kwargs = {key: arg for key, arg in kwargs.items()
                            if key != 'name'}

        # The input is used as-is, so the input-to-hidden "network" is only
        # a placeholder InputLayer of the incoming shape.
        in_to_hid = InputLayer(input_shape)

        # The hidden-to-hidden layer expects its inputs to have num_units
        # features because it recycles the previous hidden state.
        recurrent_input = InputLayer((None, num_units))
        hid_to_hid = MulLayer(recurrent_input,
                              W=W_hid_to_hid,
                              name=basename + 'hidden_to_hidden',
                              **layer_kwargs)

        # Make the child layer parameter intuitively accessible
        self.W_hid_to_hid = hid_to_hid.W

        # Delegate the recurrence itself to onlyRecurrentLayer
        super(IndRNNLayer_onlyrecurrent, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
320 | 


--------------------------------------------------------------------------------
/mnist/IndRNN_onlyrecurrent.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN (only the recurrent part). The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Since this only contains the recurrent part of IndRNN, fully connected layers or convolutional layers are needed before it.
  6 | 
  7 | Please cite the following paper if you find it useful.
  8 | 
  9 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
 10 | @article{li2018independently,
 11 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 12 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 13 |   booktitle={CVPR2018},
 14 |   year={2018}
 15 | }
 16 | """
 17 | 
 18 | import numpy as np
 19 | import theano
 20 | import theano.tensor as T
 21 | import lasagne.nonlinearities as nonlinearities
 22 | import lasagne.init as init
 23 | from lasagne.utils import unroll_scan
 24 | 
 25 | from lasagne.layers import MergeLayer, Layer
 26 | from lasagne.layers import InputLayer
 27 | from lasagne.layers import DenseLayer
 28 | from lasagne.layers import helper
 29 | import lasagne
 30 | 
# Names exported by a star-import of this module.
__all__ = [
    "onlyRecurrentLayer",
    "MulLayer",
    "IndRNNLayer_onlyrecurrent"
]
 36 | 
 37 | 
 38 | 
 39 | class MulLayer(lasagne.layers.Layer):
 40 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 41 |         super(MulLayer, self).__init__(incoming, **kwargs)
 42 |         num_inputs = self.input_shape[1]
 43 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 44 | 
 45 |     def get_output_for(self, input, **kwargs):
 46 |         return input * self.W
 47 | 
 48 |     def get_output_shape_for(self, input_shape):
 49 |         return input_shape#(input_shape[0], self.num_units)
 50 | 
 51 | 
 52 | 
 53 | 
 54 | class onlyRecurrentLayer(MergeLayer):
 55 |     """
 56 |     This is slightly different from the CustomRecurrentLayer of Lasagne by removing the computation of input.
 57 |     """
 58 |     def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
 59 |                  nonlinearity=nonlinearities.rectify,
 60 |                  hid_init=init.Constant(0.),
 61 |                  backwards=False,
 62 |                  learn_init=False,
 63 |                  gradient_steps=-1,
 64 |                  grad_clipping=0,
 65 |                  unroll_scan=False,
 66 |                  precompute_input=True,
 67 |                  mask_input=None,
 68 |                  only_return_final=False,
 69 |                  **kwargs):
 70 | 
 71 |         # This layer inherits from a MergeLayer, because it can have three
 72 |         # inputs - the layer input, the mask and the initial hidden state.  We
 73 |         # will just provide the layer input as incomings, unless a mask input
 74 |         # or initial hidden state was provided.
 75 |         incomings = [incoming]
 76 |         self.mask_incoming_index = -1
 77 |         self.hid_init_incoming_index = -1
 78 |         if mask_input is not None:
 79 |             incomings.append(mask_input)
 80 |             self.mask_incoming_index = len(incomings)-1
 81 |         if isinstance(hid_init, Layer):
 82 |             incomings.append(hid_init)
 83 |             self.hid_init_incoming_index = len(incomings)-1
 84 | 
 85 |         super(onlyRecurrentLayer, self).__init__(incomings, **kwargs)
 86 | 
 87 |         input_to_hidden_in_layers = \
 88 |             [layer for layer in helper.get_all_layers(input_to_hidden)
 89 |              if isinstance(layer, InputLayer)]
 90 |         if len(input_to_hidden_in_layers) != 1:
 91 |             raise ValueError(
 92 |                 '`input_to_hidden` must have exactly one InputLayer, but it '
 93 |                 'has {}'.format(len(input_to_hidden_in_layers)))
 94 | 
 95 |         hidden_to_hidden_in_lyrs = \
 96 |             [layer for layer in helper.get_all_layers(hidden_to_hidden)
 97 |              if isinstance(layer, InputLayer)]
 98 |         if len(hidden_to_hidden_in_lyrs) != 1:
 99 |             raise ValueError(
100 |                 '`hidden_to_hidden` must have exactly one InputLayer, but it '
101 |                 'has {}'.format(len(hidden_to_hidden_in_lyrs)))
102 |         hidden_to_hidden_in_layer = hidden_to_hidden_in_lyrs[0]
103 | 
104 |         self.input_to_hidden = input_to_hidden
105 |         self.hidden_to_hidden = hidden_to_hidden
106 |         self.learn_init = learn_init
107 |         self.backwards = backwards
108 |         self.gradient_steps = gradient_steps
109 |         self.grad_clipping = grad_clipping
110 |         self.unroll_scan = unroll_scan
111 |         self.precompute_input = precompute_input
112 |         self.only_return_final = only_return_final
113 |         
114 | 
115 |         if unroll_scan and gradient_steps != -1:
116 |             raise ValueError(
117 |                 "Gradient steps must be -1 when unroll_scan is true.")
118 | 
119 |         # Retrieve the dimensionality of the incoming layer
120 |         input_shape = self.input_shapes[0]
121 | 
122 |         if nonlinearity is None:
123 |             self.nonlinearity = nonlinearities.identity
124 |         else:
125 |             self.nonlinearity = nonlinearity
126 | 
127 |         # Initialize hidden state
128 |         if isinstance(hid_init, Layer):
129 |             self.hid_init = hid_init
130 |         else:
131 |             self.hid_init = self.add_param(
132 |                 hid_init, (1,) + hidden_to_hidden.output_shape[1:],
133 |                 name="hid_init", trainable=learn_init, regularizable=False)
134 | 
135 |     def get_params(self, **tags):
136 |         # Get all parameters from this layer, the master layer
137 |         params = super(onlyRecurrentLayer, self).get_params(**tags)
138 |         # Combine with all parameters from the child layers
139 |         params += helper.get_all_params(self.input_to_hidden, **tags)
140 |         params += helper.get_all_params(self.hidden_to_hidden, **tags)
141 |         return params
142 | 
143 |     def get_output_shape_for(self, input_shapes):
144 |         # The shape of the input to this layer will be the first element
145 |         # of input_shapes, whether or not a mask input is being used.
146 |         input_shape = input_shapes[0]
147 |         # When only_return_final is true, the second (sequence step) dimension
148 |         # will be flattened
149 |         if self.only_return_final:
150 |             return (input_shape[0],) + self.hidden_to_hidden.output_shape[1:]
151 |         # Otherwise, the shape will be (n_batch, n_steps, trailing_dims...)
152 |         else:
153 |             return ((input_shape[0], input_shape[1]) +
154 |                     self.hidden_to_hidden.output_shape[1:])
155 | 
156 |     def get_output_for(self, inputs, **kwargs):
157 |         # Retrieve the layer input
158 |         input = inputs[0]
159 |         # Retrieve the mask when it is supplied
160 |         mask = None
161 |         hid_init = None
162 |         if self.mask_incoming_index > 0:
163 |             mask = inputs[self.mask_incoming_index]
164 |         if self.hid_init_incoming_index > 0:
165 |             hid_init = inputs[self.hid_init_incoming_index]
166 | 
167 |         # Input should be provided as (n_batch, n_time_steps, n_features)
168 |         # but scan requires the iterable dimension to be first
169 |         # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
170 |         #input = input.dimshuffle(1, 0, *range(2, input.ndim))
171 |         seq_len, num_batch = input.shape[0], input.shape[1]
172 | 
173 |         # We will always pass the hidden-to-hidden layer params to step
174 |         non_seqs = helper.get_all_params(self.hidden_to_hidden)
175 | 
176 |         # Create single recurrent computation step function
177 |         def step(input_n, hid_previous, *args):
178 |             # Compute the hidden-to-hidden activation
179 |             hid_pre = helper.get_output(
180 |                 self.hidden_to_hidden, hid_previous, **kwargs)
181 | 
182 |             hid_pre += input_n
183 | 
184 |             # Clip gradients
185 |             if self.grad_clipping:
186 |                 hid_pre = theano.gradient.grad_clip(
187 |                     hid_pre, -self.grad_clipping, self.grad_clipping)
188 | 
189 |             return self.nonlinearity(hid_pre)
190 | 
191 |         def step_masked(input_n, mask_n, hid_previous, *args):
192 |             # Skip over any input with mask 0 by copying the previous
193 |             # hidden state; proceed normally for any input with mask 1.
194 |             hid = step(input_n, hid_previous, *args)
195 |             hid_out = T.switch(mask_n, hid, hid_previous)
196 |             return [hid_out]
197 | 
198 |         if mask is not None:
199 |             mask = mask.dimshuffle(1, 0, 'x')
200 |             sequences = [input, mask]
201 |             step_fun = step_masked
202 |         else:
203 |             sequences = input
204 |             step_fun = step
205 | 
206 |         if not isinstance(self.hid_init, Layer):
207 |             # The code below simply repeats self.hid_init num_batch times in
208 |             # its first dimension.  Turns out using a dot product and a
209 |             # dimshuffle is faster than T.repeat.
210 |             dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
211 |                         [0, self.hid_init.ndim - 1])
212 |             hid_init = T.dot(T.ones((num_batch, 1)),
213 |                              self.hid_init.dimshuffle(dot_dims))
214 | 
215 |         if self.unroll_scan:
216 |             # Retrieve the dimensionality of the incoming layer
217 |             input_shape = self.input_shapes[0]
218 |             # Explicitly unroll the recurrence instead of using scan
219 |             hid_out = unroll_scan(
220 |                 fn=step_fun,
221 |                 sequences=sequences,
222 |                 outputs_info=[hid_init],
223 |                 go_backwards=self.backwards,
224 |                 non_sequences=non_seqs,
225 |                 n_steps=input_shape[1])[0]
226 |         else:
227 |             # Scan op iterates over first dimension of input and repeatedly
228 |             # applies the step function
229 |             hid_out = theano.scan(
230 |                 fn=step_fun,
231 |                 sequences=sequences,
232 |                 go_backwards=self.backwards,
233 |                 outputs_info=[hid_init],
234 |                 non_sequences=non_seqs,
235 |                 truncate_gradient=self.gradient_steps,
236 |                 strict=True)[0]
237 | 
238 |         # When it is requested that we only return the final sequence step,
239 |         # we need to slice it out immediately after scan is applied
240 |         if self.only_return_final:
241 |             hid_out = hid_out[-1]
242 |         else:
243 |             # dimshuffle back to (n_batch, n_time_steps, n_features))
244 |             #hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))
245 | 
246 |             # if scan is backward reverse the output
247 |             if self.backwards:
248 |                 hid_out = hid_out[::-1,:]
249 | 
250 |         return hid_out
251 | 
252 | 
class IndRNNLayer_onlyrecurrent(onlyRecurrentLayer):
    """
    Recurrent part of an IndRNN layer.

    Builds an ``onlyRecurrentLayer`` whose hidden-to-hidden connection is a
    ``MulLayer`` over ``num_units`` units and whose input-to-hidden network
    is a bare ``InputLayer``: no weight or bias is applied to the input
    here.

    Parameters
    ----------
    incoming : Layer or tuple
        The layer feeding into this layer, or its expected shape.
    num_units : int
        Number of hidden units; sets the shape of the recurrent weight.
    W_hid_to_hid : initializer
        Initializer for the ``MulLayer`` weight, exposed afterwards as
        ``self.W_hid_to_hid``.

    All remaining arguments are forwarded unchanged to
    ``onlyRecurrentLayer``.  Keyword arguments other than ``name`` are also
    passed to the ``MulLayer``, which is named ``<name>.hidden_to_hidden``
    when a name is given.
    """

    def __init__(self, incoming, num_units,
                 W_hid_to_hid=init.Uniform(),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # `incoming` may be given either as a shape tuple or as a layer.
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # The contained layer receives every keyword argument except 'name',
        # which is turned into a prefix for the child layer's own name.
        if 'name' not in kwargs:
            basename = ''
            layer_kwargs = kwargs
        else:
            basename = kwargs['name'] + '.'
            layer_kwargs = {key: arg for key, arg in kwargs.items()
                            if key != 'name'}

        # The input is used as-is, so the input-to-hidden "network" is only
        # a placeholder InputLayer of the incoming shape.
        in_to_hid = InputLayer(input_shape)

        # The hidden-to-hidden layer expects its inputs to have num_units
        # features because it recycles the previous hidden state.
        recurrent_input = InputLayer((None, num_units))
        hid_to_hid = MulLayer(recurrent_input,
                              W=W_hid_to_hid,
                              name=basename + 'hidden_to_hidden',
                              **layer_kwargs)

        # Make the child layer parameter intuitively accessible
        self.W_hid_to_hid = hid_to_hid.W

        # Delegate the recurrence itself to onlyRecurrentLayer
        super(IndRNNLayer_onlyrecurrent, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
320 | 


--------------------------------------------------------------------------------
/wordPTB/IndRNN_onlyrecurrent.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN (only the recurrent part). The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Since this only contains the recurrent part of IndRNN, fully connected layers or convolutional layers are needed before it.
  6 | 
  7 | Please cite the following paper if you find it useful.
  8 | 
  9 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
 10 | @article{li2018independently,
 11 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 12 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 13 |   booktitle={CVPR2018},
 14 |   year={2018}
 15 | }
 16 | """
 17 | 
 18 | import numpy as np
 19 | import theano
 20 | import theano.tensor as T
 21 | import lasagne.nonlinearities as nonlinearities
 22 | import lasagne.init as init
 23 | from lasagne.utils import unroll_scan
 24 | 
 25 | from lasagne.layers import MergeLayer, Layer
 26 | from lasagne.layers import InputLayer
 27 | from lasagne.layers import DenseLayer
 28 | from lasagne.layers import helper
 29 | import lasagne
 30 | 
# Names exported by a star-import of this module.
__all__ = [
    "onlyRecurrentLayer",
    "MulLayer",
    "IndRNNLayer_onlyrecurrent"
]
 36 | 
 37 | 
 38 | 
 39 | class MulLayer(lasagne.layers.Layer):
 40 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 41 |         super(MulLayer, self).__init__(incoming, **kwargs)
 42 |         num_inputs = self.input_shape[1]
 43 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 44 | 
 45 |     def get_output_for(self, input, **kwargs):
 46 |         return input * self.W
 47 | 
 48 |     def get_output_shape_for(self, input_shape):
 49 |         return input_shape#(input_shape[0], self.num_units)
 50 | 
 51 | 
 52 | 
 53 | 
 54 | class onlyRecurrentLayer(MergeLayer):
 55 |     """
 56 |     This is slightly different from the CustomRecurrentLayer of Lasagne by removing the computation of input.
 57 |     """
 58 |     def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
 59 |                  nonlinearity=nonlinearities.rectify,
 60 |                  hid_init=init.Constant(0.),
 61 |                  backwards=False,
 62 |                  learn_init=False,
 63 |                  gradient_steps=-1,
 64 |                  grad_clipping=0,
 65 |                  unroll_scan=False,
 66 |                  precompute_input=True,
 67 |                  mask_input=None,
 68 |                  only_return_final=False,
 69 |                  **kwargs):
 70 | 
 71 |         # This layer inherits from a MergeLayer, because it can have three
 72 |         # inputs - the layer input, the mask and the initial hidden state.  We
 73 |         # will just provide the layer input as incomings, unless a mask input
 74 |         # or initial hidden state was provided.
 75 |         incomings = [incoming]
 76 |         self.mask_incoming_index = -1
 77 |         self.hid_init_incoming_index = -1
 78 |         if mask_input is not None:
 79 |             incomings.append(mask_input)
 80 |             self.mask_incoming_index = len(incomings)-1
 81 |         if isinstance(hid_init, Layer):
 82 |             incomings.append(hid_init)
 83 |             self.hid_init_incoming_index = len(incomings)-1
 84 | 
 85 |         super(onlyRecurrentLayer, self).__init__(incomings, **kwargs)
 86 | 
 87 |         input_to_hidden_in_layers = \
 88 |             [layer for layer in helper.get_all_layers(input_to_hidden)
 89 |              if isinstance(layer, InputLayer)]
 90 |         if len(input_to_hidden_in_layers) != 1:
 91 |             raise ValueError(
 92 |                 '`input_to_hidden` must have exactly one InputLayer, but it '
 93 |                 'has {}'.format(len(input_to_hidden_in_layers)))
 94 | 
 95 |         hidden_to_hidden_in_lyrs = \
 96 |             [layer for layer in helper.get_all_layers(hidden_to_hidden)
 97 |              if isinstance(layer, InputLayer)]
 98 |         if len(hidden_to_hidden_in_lyrs) != 1:
 99 |             raise ValueError(
100 |                 '`hidden_to_hidden` must have exactly one InputLayer, but it '
101 |                 'has {}'.format(len(hidden_to_hidden_in_lyrs)))
102 |         hidden_to_hidden_in_layer = hidden_to_hidden_in_lyrs[0]
103 | 
104 |         self.input_to_hidden = input_to_hidden
105 |         self.hidden_to_hidden = hidden_to_hidden
106 |         self.learn_init = learn_init
107 |         self.backwards = backwards
108 |         self.gradient_steps = gradient_steps
109 |         self.grad_clipping = grad_clipping
110 |         self.unroll_scan = unroll_scan
111 |         self.precompute_input = precompute_input
112 |         self.only_return_final = only_return_final
113 |         
114 | 
115 |         if unroll_scan and gradient_steps != -1:
116 |             raise ValueError(
117 |                 "Gradient steps must be -1 when unroll_scan is true.")
118 | 
119 |         # Retrieve the dimensionality of the incoming layer
120 |         input_shape = self.input_shapes[0]
121 | 
122 |         if nonlinearity is None:
123 |             self.nonlinearity = nonlinearities.identity
124 |         else:
125 |             self.nonlinearity = nonlinearity
126 | 
127 |         # Initialize hidden state
128 |         if isinstance(hid_init, Layer):
129 |             self.hid_init = hid_init
130 |         else:
131 |             self.hid_init = self.add_param(
132 |                 hid_init, (1,) + hidden_to_hidden.output_shape[1:],
133 |                 name="hid_init", trainable=learn_init, regularizable=False)
134 | 
135 |     def get_params(self, **tags):
136 |         # Get all parameters from this layer, the master layer
137 |         params = super(onlyRecurrentLayer, self).get_params(**tags)
138 |         # Combine with all parameters from the child layers
139 |         params += helper.get_all_params(self.input_to_hidden, **tags)
140 |         params += helper.get_all_params(self.hidden_to_hidden, **tags)
141 |         return params
142 | 
143 |     def get_output_shape_for(self, input_shapes):
144 |         # The shape of the input to this layer will be the first element
145 |         # of input_shapes, whether or not a mask input is being used.
146 |         input_shape = input_shapes[0]
147 |         # When only_return_final is true, the second (sequence step) dimension
148 |         # will be flattened
149 |         if self.only_return_final:
150 |             return (input_shape[0],) + self.hidden_to_hidden.output_shape[1:]
151 |         # Otherwise, the shape will be (n_batch, n_steps, trailing_dims...)
152 |         else:
153 |             return ((input_shape[0], input_shape[1]) +
154 |                     self.hidden_to_hidden.output_shape[1:])
155 | 
156 |     def get_output_for(self, inputs, **kwargs):
157 |         # Retrieve the layer input
158 |         input = inputs[0]
159 |         # Retrieve the mask when it is supplied
160 |         mask = None
161 |         hid_init = None
162 |         if self.mask_incoming_index > 0:
163 |             mask = inputs[self.mask_incoming_index]
164 |         if self.hid_init_incoming_index > 0:
165 |             hid_init = inputs[self.hid_init_incoming_index]
166 | 
167 |         # Input should be provided as (n_batch, n_time_steps, n_features)
168 |         # but scan requires the iterable dimension to be first
169 |         # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
170 |         #input = input.dimshuffle(1, 0, *range(2, input.ndim))
171 |         seq_len, num_batch = input.shape[0], input.shape[1]
172 | 
173 |         # We will always pass the hidden-to-hidden layer params to step
174 |         non_seqs = helper.get_all_params(self.hidden_to_hidden)
175 | 
176 |         # Create single recurrent computation step function
177 |         def step(input_n, hid_previous, *args):
178 |             # Compute the hidden-to-hidden activation
179 |             hid_pre = helper.get_output(
180 |                 self.hidden_to_hidden, hid_previous, **kwargs)
181 | 
182 |             hid_pre += input_n
183 | 
184 |             # Clip gradients
185 |             if self.grad_clipping:
186 |                 hid_pre = theano.gradient.grad_clip(
187 |                     hid_pre, -self.grad_clipping, self.grad_clipping)
188 | 
189 |             return self.nonlinearity(hid_pre)
190 | 
191 |         def step_masked(input_n, mask_n, hid_previous, *args):
192 |             # Skip over any input with mask 0 by copying the previous
193 |             # hidden state; proceed normally for any input with mask 1.
194 |             hid = step(input_n, hid_previous, *args)
195 |             hid_out = T.switch(mask_n, hid, hid_previous)
196 |             return [hid_out]
197 | 
198 |         if mask is not None:
199 |             mask = mask.dimshuffle(1, 0, 'x')
200 |             sequences = [input, mask]
201 |             step_fun = step_masked
202 |         else:
203 |             sequences = input
204 |             step_fun = step
205 | 
206 |         if not isinstance(self.hid_init, Layer):
207 |             # The code below simply repeats self.hid_init num_batch times in
208 |             # its first dimension.  Turns out using a dot product and a
209 |             # dimshuffle is faster than T.repeat.
210 |             dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
211 |                         [0, self.hid_init.ndim - 1])
212 |             hid_init = T.dot(T.ones((num_batch, 1)),
213 |                              self.hid_init.dimshuffle(dot_dims))
214 | 
215 |         if self.unroll_scan:
216 |             # Retrieve the dimensionality of the incoming layer
217 |             input_shape = self.input_shapes[0]
218 |             # Explicitly unroll the recurrence instead of using scan
219 |             hid_out = unroll_scan(
220 |                 fn=step_fun,
221 |                 sequences=sequences,
222 |                 outputs_info=[hid_init],
223 |                 go_backwards=self.backwards,
224 |                 non_sequences=non_seqs,
225 |                 n_steps=input_shape[1])[0]
226 |         else:
227 |             # Scan op iterates over first dimension of input and repeatedly
228 |             # applies the step function
229 |             hid_out = theano.scan(
230 |                 fn=step_fun,
231 |                 sequences=sequences,
232 |                 go_backwards=self.backwards,
233 |                 outputs_info=[hid_init],
234 |                 non_sequences=non_seqs,
235 |                 truncate_gradient=self.gradient_steps,
236 |                 strict=True)[0]
237 | 
238 |         # When it is requested that we only return the final sequence step,
239 |         # we need to slice it out immediately after scan is applied
240 |         if self.only_return_final:
241 |             hid_out = hid_out[-1]
242 |         else:
243 |             # dimshuffle back to (n_batch, n_time_steps, n_features))
244 |             #hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))
245 | 
246 |             # if scan is backward reverse the output
247 |             if self.backwards:
248 |                 hid_out = hid_out[::-1,:]
249 | 
250 |         return hid_out
251 | 
252 | 
class IndRNNLayer_onlyrecurrent(onlyRecurrentLayer):
    """Recurrent-only layer whose hidden-to-hidden network is a MulLayer.

    The input-to-hidden network is a bare ``InputLayer`` with the shape of
    ``incoming``, and the hidden-to-hidden network is a ``MulLayer`` built on
    an ``InputLayer`` of shape ``(None, num_units)``. All remaining arguments
    are handed to ``onlyRecurrentLayer.__init__`` unchanged.

    The hidden-to-hidden weight is exposed as ``self.W_hid_to_hid``.
    """

    def __init__(self, incoming, num_units,
                 W_hid_to_hid=init.Uniform(),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):
        # `incoming` may be either a shape tuple or a layer.
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # Child layers get a name derived from this layer's name (if any),
        # so 'name' itself must not be forwarded to them.
        name_prefix = ''
        layer_kwargs = kwargs
        if 'name' in kwargs:
            name_prefix = kwargs['name'] + '.'
            layer_kwargs = {key: value for key, value in kwargs.items()
                            if key != 'name'}

        # No transformation is applied to the input inside this layer.
        in_to_hid = InputLayer(input_shape)

        # The recurrent connection consumes the previous hidden state, which
        # has num_units features.
        recurrent_input = InputLayer((None, num_units))
        hid_to_hid = MulLayer(recurrent_input,
                              W=W_hid_to_hid,
                              name=name_prefix + 'hidden_to_hidden',
                              **layer_kwargs)

        # Make the child layer parameter directly accessible
        self.W_hid_to_hid = hid_to_hid.W

        super(IndRNNLayer_onlyrecurrent, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
320 | 


--------------------------------------------------------------------------------
/action recognition/IndRNN_onlyrecurrent.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | This code is to implement the IndRNN (only the recurrent part). The code is based on the Lasagne implementation of RecurrentLayer.
  4 | 
  5 | Since this only contains the recurrent part of IndRNN, fully connected layers or convolutional layers are needed before it.
  6 | 
  7 | Please cite the following paper if you find it useful.
  8 | 
  9 | Shuai Li, Wanqing Li, Chris Cook, Ce Zhu, and Yanbo Gao. "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN." CVPR 2018.
 10 | @article{li2018independently,
 11 |   title={Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN},
 12 |   author={Li, Shuai and Li, Wanqing and Cook, Chris and Zhu, Ce and Gao, Yanbo},
 13 |   booktitle={CVPR2018},
 14 |   year={2018}
 15 | }
 16 | """
 17 | 
 18 | import numpy as np
 19 | import theano
 20 | import theano.tensor as T
 21 | import lasagne.nonlinearities as nonlinearities
 22 | import lasagne.init as init
 23 | from lasagne.utils import unroll_scan
 24 | 
 25 | from lasagne.layers import MergeLayer, Layer
 26 | from lasagne.layers import InputLayer
 27 | from lasagne.layers import DenseLayer
 28 | from lasagne.layers import helper
 29 | import lasagne
 30 | 
# Names exported by a star-import of this module.
__all__ = [
    "onlyRecurrentLayer",
    "MulLayer",
    "IndRNNLayer_onlyrecurrent"
]
 36 | 
 37 | 
 38 | 
 39 | class MulLayer(lasagne.layers.Layer):
 40 |     def __init__(self, incoming,  W=lasagne.init.Normal(0.01), **kwargs):
 41 |         super(MulLayer, self).__init__(incoming, **kwargs)
 42 |         num_inputs = self.input_shape[1]
 43 |         self.W = self.add_param(W, (num_inputs, ), name='W')
 44 | 
 45 |     def get_output_for(self, input, **kwargs):
 46 |         return input * self.W
 47 | 
 48 |     def get_output_shape_for(self, input_shape):
 49 |         return input_shape#(input_shape[0], self.num_units)
 50 | 
 51 | 
 52 | 
 53 | 
 54 | class onlyRecurrentLayer(MergeLayer):
 55 |     """
 56 |     This is slightly different from the CustomRecurrentLayer of Lasagne by removing the computation of input.
 57 |     """
 58 |     def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
 59 |                  nonlinearity=nonlinearities.rectify,
 60 |                  hid_init=init.Constant(0.),
 61 |                  backwards=False,
 62 |                  learn_init=False,
 63 |                  gradient_steps=-1,
 64 |                  grad_clipping=0,
 65 |                  unroll_scan=False,
 66 |                  precompute_input=True,
 67 |                  mask_input=None,
 68 |                  only_return_final=False,
 69 |                  **kwargs):
 70 | 
 71 |         # This layer inherits from a MergeLayer, because it can have three
 72 |         # inputs - the layer input, the mask and the initial hidden state.  We
 73 |         # will just provide the layer input as incomings, unless a mask input
 74 |         # or initial hidden state was provided.
 75 |         incomings = [incoming]
 76 |         self.mask_incoming_index = -1
 77 |         self.hid_init_incoming_index = -1
 78 |         if mask_input is not None:
 79 |             incomings.append(mask_input)
 80 |             self.mask_incoming_index = len(incomings)-1
 81 |         if isinstance(hid_init, Layer):
 82 |             incomings.append(hid_init)
 83 |             self.hid_init_incoming_index = len(incomings)-1
 84 | 
 85 |         super(onlyRecurrentLayer, self).__init__(incomings, **kwargs)
 86 | 
 87 |         input_to_hidden_in_layers = \
 88 |             [layer for layer in helper.get_all_layers(input_to_hidden)
 89 |              if isinstance(layer, InputLayer)]
 90 |         if len(input_to_hidden_in_layers) != 1:
 91 |             raise ValueError(
 92 |                 '`input_to_hidden` must have exactly one InputLayer, but it '
 93 |                 'has {}'.format(len(input_to_hidden_in_layers)))
 94 | 
 95 |         hidden_to_hidden_in_lyrs = \
 96 |             [layer for layer in helper.get_all_layers(hidden_to_hidden)
 97 |              if isinstance(layer, InputLayer)]
 98 |         if len(hidden_to_hidden_in_lyrs) != 1:
 99 |             raise ValueError(
100 |                 '`hidden_to_hidden` must have exactly one InputLayer, but it '
101 |                 'has {}'.format(len(hidden_to_hidden_in_lyrs)))
102 |         hidden_to_hidden_in_layer = hidden_to_hidden_in_lyrs[0]
103 | 
104 |         self.input_to_hidden = input_to_hidden
105 |         self.hidden_to_hidden = hidden_to_hidden
106 |         self.learn_init = learn_init
107 |         self.backwards = backwards
108 |         self.gradient_steps = gradient_steps
109 |         self.grad_clipping = grad_clipping
110 |         self.unroll_scan = unroll_scan
111 |         self.precompute_input = precompute_input
112 |         self.only_return_final = only_return_final
113 |         
114 | 
115 |         if unroll_scan and gradient_steps != -1:
116 |             raise ValueError(
117 |                 "Gradient steps must be -1 when unroll_scan is true.")
118 | 
119 |         # Retrieve the dimensionality of the incoming layer
120 |         input_shape = self.input_shapes[0]
121 | 
122 |         if nonlinearity is None:
123 |             self.nonlinearity = nonlinearities.identity
124 |         else:
125 |             self.nonlinearity = nonlinearity
126 | 
127 |         # Initialize hidden state
128 |         if isinstance(hid_init, Layer):
129 |             self.hid_init = hid_init
130 |         else:
131 |             self.hid_init = self.add_param(
132 |                 hid_init, (1,) + hidden_to_hidden.output_shape[1:],
133 |                 name="hid_init", trainable=learn_init, regularizable=False)
134 | 
135 |     def get_params(self, **tags):
136 |         # Get all parameters from this layer, the master layer
137 |         params = super(onlyRecurrentLayer, self).get_params(**tags)
138 |         # Combine with all parameters from the child layers
139 |         params += helper.get_all_params(self.input_to_hidden, **tags)
140 |         params += helper.get_all_params(self.hidden_to_hidden, **tags)
141 |         return params
142 | 
143 |     def get_output_shape_for(self, input_shapes):
144 |         # The shape of the input to this layer will be the first element
145 |         # of input_shapes, whether or not a mask input is being used.
146 |         input_shape = input_shapes[0]
147 |         # When only_return_final is true, the second (sequence step) dimension
148 |         # will be flattened
149 |         if self.only_return_final:
150 |             return (input_shape[0],) + self.hidden_to_hidden.output_shape[1:]
151 |         # Otherwise, the shape will be (n_batch, n_steps, trailing_dims...)
152 |         else:
153 |             return ((input_shape[0], input_shape[1]) +
154 |                     self.hidden_to_hidden.output_shape[1:])
155 | 
156 |     def get_output_for(self, inputs, **kwargs):
157 |         # Retrieve the layer input
158 |         input = inputs[0]
159 |         # Retrieve the mask when it is supplied
160 |         mask = None
161 |         hid_init = None
162 |         if self.mask_incoming_index > 0:
163 |             mask = inputs[self.mask_incoming_index]
164 |         if self.hid_init_incoming_index > 0:
165 |             hid_init = inputs[self.hid_init_incoming_index]
166 | 
167 |         # Input should be provided as (n_batch, n_time_steps, n_features)
168 |         # but scan requires the iterable dimension to be first
169 |         # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
170 |         #input = input.dimshuffle(1, 0, *range(2, input.ndim))
171 |         seq_len, num_batch = input.shape[0], input.shape[1]
172 | 
173 |         # We will always pass the hidden-to-hidden layer params to step
174 |         non_seqs = helper.get_all_params(self.hidden_to_hidden)
175 | 
176 |         # Create single recurrent computation step function
177 |         def step(input_n, hid_previous, *args):
178 |             # Compute the hidden-to-hidden activation
179 |             hid_pre = helper.get_output(
180 |                 self.hidden_to_hidden, hid_previous, **kwargs)
181 | 
182 |             hid_pre += input_n
183 | 
184 |             # Clip gradients
185 |             if self.grad_clipping:
186 |                 hid_pre = theano.gradient.grad_clip(
187 |                     hid_pre, -self.grad_clipping, self.grad_clipping)
188 | 
189 |             return self.nonlinearity(hid_pre)
190 | 
191 |         def step_masked(input_n, mask_n, hid_previous, *args):
192 |             # Skip over any input with mask 0 by copying the previous
193 |             # hidden state; proceed normally for any input with mask 1.
194 |             hid = step(input_n, hid_previous, *args)
195 |             hid_out = T.switch(mask_n, hid, hid_previous)
196 |             return [hid_out]
197 | 
198 |         if mask is not None:
199 |             mask = mask.dimshuffle(1, 0, 'x')
200 |             sequences = [input, mask]
201 |             step_fun = step_masked
202 |         else:
203 |             sequences = input
204 |             step_fun = step
205 | 
206 |         if not isinstance(self.hid_init, Layer):
207 |             # The code below simply repeats self.hid_init num_batch times in
208 |             # its first dimension.  Turns out using a dot product and a
209 |             # dimshuffle is faster than T.repeat.
210 |             dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
211 |                         [0, self.hid_init.ndim - 1])
212 |             hid_init = T.dot(T.ones((num_batch, 1)),
213 |                              self.hid_init.dimshuffle(dot_dims))
214 | 
215 |         if self.unroll_scan:
216 |             # Retrieve the dimensionality of the incoming layer
217 |             input_shape = self.input_shapes[0]
218 |             # Explicitly unroll the recurrence instead of using scan
219 |             hid_out = unroll_scan(
220 |                 fn=step_fun,
221 |                 sequences=sequences,
222 |                 outputs_info=[hid_init],
223 |                 go_backwards=self.backwards,
224 |                 non_sequences=non_seqs,
225 |                 n_steps=input_shape[1])[0]
226 |         else:
227 |             # Scan op iterates over first dimension of input and repeatedly
228 |             # applies the step function
229 |             hid_out = theano.scan(
230 |                 fn=step_fun,
231 |                 sequences=sequences,
232 |                 go_backwards=self.backwards,
233 |                 outputs_info=[hid_init],
234 |                 non_sequences=non_seqs,
235 |                 truncate_gradient=self.gradient_steps,
236 |                 strict=True)[0]
237 | 
238 |         # When it is requested that we only return the final sequence step,
239 |         # we need to slice it out immediately after scan is applied
240 |         if self.only_return_final:
241 |             hid_out = hid_out[-1]
242 |         else:
243 |             # dimshuffle back to (n_batch, n_time_steps, n_features))
244 |             #hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))
245 | 
246 |             # if scan is backward reverse the output
247 |             if self.backwards:
248 |                 hid_out = hid_out[::-1,:]
249 | 
250 |         return hid_out
251 | 
252 | 
class IndRNNLayer_onlyrecurrent(onlyRecurrentLayer):
    """Recurrent-only IndRNN layer built on ``onlyRecurrentLayer``.

    The hidden-to-hidden network is a ``MulLayer`` (multiplication by a
    vector of ``num_units`` weights) and the input-to-hidden network is a
    bare ``InputLayer`` with the shape of ``incoming``. Every other argument
    is passed straight through to ``onlyRecurrentLayer.__init__``.

    The recurrent weight vector is exposed as ``self.W_hid_to_hid``.
    """

    def __init__(self, incoming, num_units,
                 W_hid_to_hid=init.Uniform(),
                 nonlinearity=nonlinearities.rectify,
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):
        # `incoming` may be either a shape tuple or a layer.
        input_shape = (incoming if isinstance(incoming, tuple)
                       else incoming.output_shape)

        # Child layers get a name derived from this layer's name (if any),
        # so 'name' itself must not be forwarded to them.
        name_prefix = ''
        layer_kwargs = kwargs
        if 'name' in kwargs:
            name_prefix = kwargs['name'] + '.'
            layer_kwargs = {key: value for key, value in kwargs.items()
                            if key != 'name'}

        # No transformation is applied to the input inside this layer.
        in_to_hid = InputLayer(input_shape)

        # The recurrent connection consumes the previous hidden state, which
        # has num_units features.
        recurrent_input = InputLayer((None, num_units))
        hid_to_hid = MulLayer(recurrent_input,
                              W=W_hid_to_hid,
                              name=name_prefix + 'hidden_to_hidden',
                              **layer_kwargs)

        # Make the child layer parameter directly accessible
        self.W_hid_to_hid = hid_to_hid.W

        super(IndRNNLayer_onlyrecurrent, self).__init__(
            incoming, in_to_hid, hid_to_hid,
            nonlinearity=nonlinearity,
            hid_init=hid_init,
            backwards=backwards,
            learn_init=learn_init,
            gradient_steps=gradient_steps,
            grad_clipping=grad_clipping,
            unroll_scan=unroll_scan,
            precompute_input=precompute_input,
            mask_input=mask_input,
            only_return_final=only_return_final,
            **kwargs)
320 | 


--------------------------------------------------------------------------------