├── README.md ├── tensorflow ├── README.md └── rnn.py ├── theano ├── README.md └── rnn.py └── torch ├── README.md └── rnn.lua /README.md: -------------------------------------------------------------------------------- 1 | # rnn-benchmarks 2 | 3 | All benchmarks are reported for a host with the following specifications: 4 | * NVIDIA GeForce GTX TITAN X GPU 5 | * Intel(R) Xeon(R) CPU E5-2630L v3 @ 1.80GHz 6 | * CUDA 7.5, cudnnv5 7 | 8 | These benchmarks compare the running time of various recurrent neural networks on different deep-learning libraries. 9 | The networks (RNN or LSTM) take as input a 3D tensor of shape `batch_size x seq_length x hidden_size` 10 | and output the last hidden state, compute an MSE loss, backpropagate the errors through the network and do a simple update of the parameters (`params = params - lr*gradParams`). 11 | The sequence length is always set to `30`. 12 | The `hidden_size` parameter specifies the size of the input and output layers of the networks. 13 | 14 | The code for the scripts we ran is available. 15 | For each library, we use the fastest implementation 16 | of each model we were able to find. 17 | If you are aware of faster implementations, please let us know. 18 | We've reported results for Theano, Torch and TensorFlow so far, but we will try to include more libraries in the future (including cuDNN very soon). 19 | 20 | The reported `Train` time is the average time needed to run (forward, backward, and update) a training example (not a training batch), so lower is better. 21 | We also report `Compile` time, which includes symbolic graph optimizations (Theano and TensorFlow compilation), as well as a forward and backward pass (to allocate memory). 22 | While the compilation time isn't really a factor in production, it does increase debugging time, which is why we report it here. 23 | 24 | ## LSTM 25 | 26 | The LSTM implementation used for these benchmarks does not use peephole connections between cell and gates.
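For reference, one step of this peephole-free LSTM computes all four gates from the current input and the previous hidden state only; the previous cell state never feeds into the gates. Below is a minimal NumPy sketch of that recurrence (not one of the benchmarked scripts); the packed parameter layout, with `W`, `U` and `b` holding the input, forget and output gates plus the candidate cell in that order, mirrors the `FastLSTM` class in `theano/rnn.py` further down.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x_t, h_prev, c_prev, W, U, b):
    # x_t: (batch, input_dim); h_prev, c_prev: (batch, n)
    # W: (input_dim, 4n); U: (n, 4n); b: (4n,)
    n = h_prev.shape[1]
    p = x_t.dot(W) + h_prev.dot(U) + b   # pre-activations for all four gates in one matmul
    i = sigmoid(p[:, 0*n:1*n])           # input gate
    f = sigmoid(p[:, 1*n:2*n])           # forget gate
    o = sigmoid(p[:, 2*n:3*n])           # output gate
    g = np.tanh(p[:, 3*n:4*n])           # candidate cell state
    c_t = f * c_prev + i * g             # no peephole terms such as c_prev * w_ci
    h_t = o * np.tanh(c_t)
    return h_t, c_t
```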
27 | 28 | ### Batch Size 32 29 | 30 | #### Hidden Size 128 31 | 32 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 33 | | ------------- | ------------- | ------------- | ------------- | 34 | | Theano | 7.46 | 289.6 | 99.1 | 35 | | Torch | 0.03 | 434.4 | 99.9 | 36 | | TensorFlow | 3.91 | 820.0 | 266.7 | 37 | 38 | 39 | #### Hidden Size 512 40 | 41 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 42 | | ------------- | ------------- | ------------- | ------------- | 43 | | Theano | 7.59 | 619.4 | 200.9 | 44 | | Torch | 0.19 | 610.7 | 201.7 | 45 | | TensorFlow | 3.97 | 886.9 | 324.9 | 46 | 47 | 48 | #### Hidden Size 1024 49 | 50 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 51 | | ------------- | ------------- | ------------- | ------------- | 52 | | Theano | 9.62 | 1013.5 | 324.1 | 53 | | Torch | 0.69 | 1139.8 | 346.3 | 54 | | TensorFlow | 3.81 | 1329.2 | 562.7 | 55 | 56 | 57 | ### Batch Size 128 58 | 59 | #### Hidden Size 128 60 | 61 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 62 | | ------------- | ------------- | ------------- | ------------- | 63 | | Theano | 7.38 | 102.9 | 25.6 | 64 | | Torch | 0.03 | 109.8 | 25.2 | 65 | | TensorFlow | 3.68 | 188.6 | 65.0 | 66 | 67 | 68 | #### Hidden Size 512 69 | 70 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 71 | | ------------- | ------------- | ------------- | ------------- | 72 | | Theano | 7.50 | 256.0 | 62.8 | 73 | | Torch | 0.20 | 214.3 | 51.4 | 74 | | TensorFlow | 3.73 | 255.2 | 114.2 | 75 | 76 | #### Hidden Size 1024 77 | 78 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 79 | | ------------- | ------------- | ------------- | ------------- | 80 | | Theano | 7.45 | 583.4 | 160.2 | 81 | | Torch | 0.75 | 558.1 | 112.4 | 82 | | TensorFlow | 3.84 | 592.2 | 238.1 | 83 | 84 | 85 | ## RNN 86 | 87 | This section benchmarks a simple RNN implementation. 
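The recurrence being benchmarked here is the plain Elman-style update `h_t = sigmoid(x_t.W_x + h_(t-1).W_h + b_h)`, with the MSE loss taken on the last hidden state. A minimal NumPy sketch (shapes follow the `RNN` class in `theano/rnn.py` further down):

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def rnn_forward(x, w_x, w_h, b_h, h_0):
    # x: (seq_length, batch, input_dim); w_x: (input_dim, n); w_h: (n, n); h_0: (n,)
    h = np.tile(h_0, (x.shape[1], 1))    # initial state, one row per batch element
    for x_t in x:                        # one update per timestep
        h = sigmoid(x_t.dot(w_x) + h.dot(w_h) + b_h)
    return h                             # last hidden state, (batch, n)
```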
88 | 89 | ### Batch Size 32 90 | 91 | #### Hidden Size 128 92 | 93 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 94 | | ------------- | ------------- | ------------- | ------------- | 95 | | Theano | 4.31 | 104.6 | 30.9 | 96 | | Torch | 0.05 | 259.5 | 103.1 | 97 | | TensorFlow | 1.64 | 278.4 | 111.5 | 98 | 99 | #### Hidden Size 512 100 | 101 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 102 | | ------------- | ------------- | ------------- | ------------- | 103 | | Theano | 4.36 | 275.2 | 102.2 | 104 | | Torch | 0.05 | 288.2 | 114.6 | 105 | | TensorFlow | 1.62 | 349.7 | 218.4 | 106 | 107 | #### Hidden Size 1024 108 | 109 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 110 | | ------------- | ------------- | ------------- | ------------- | 111 | | Theano | 4.44 | 443.8 | 179.5 | 112 | | Torch | 0.09 | 381.4 | 118.8 | 113 | | TensorFlow | 1.72 | 530.0 | 241.7 | 114 | 115 | ### Batch Size 128 116 | 117 | #### Hidden Size 128 118 | 119 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 120 | | ------------- | ------------- | ------------- | ------------- | 121 | | Theano | 4.48 | 45.4 | 13.7 | 122 | | Torch | 0.08 | 67.7 | 32.7 | 123 | | TensorFlow | 1.70 | 75.5 | 33.6 | 124 | 125 | #### Hidden Size 512 126 | 127 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 128 | | ------------- | ------------- | ------------- | ------------- | 129 | | Theano | 4.40 | 79.0 | 23.8 | 130 | | Torch | 0.09 | 73.5 | 34.2 | 131 | | TensorFlow | 1.63 | 125.6 | 86.8 | 132 | 133 | #### Hidden Size 1024 134 | 135 | | Library | Compile (s) | Train (µs) | Forward only (µs) | 136 | | ------------- | ------------- | ------------- | ------------- | 137 | | Theano | 4.38 | 147.8 | 50.3 | 138 | | Torch | 0.13 | 150.2 | 64.7 | 139 | | TensorFlow | 1.70 | 222.5 | 137.8 | 140 | -------------------------------------------------------------------------------- /tensorflow/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow benchmarks 2 | 3 | Provided by Maarten Bosma. 4 | 5 | I used the built-in rnn library. ``basic_lstm`` is the TensorFlow equivalent of FastLSTM.
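For orientation, this is how the benchmark builds its graph, condensed from `tensorflow/rnn.py` below (TensorFlow 0.8 API; `batch_size`, `hidden_size` and `seq_length` come from the command-line flags): the sequence is fed as a list of per-timestep placeholders and statically unrolled with `rnn.rnn()`.

```python
import tensorflow as tf
from tensorflow.python.ops import rnn

# One placeholder per timestep; rnn.rnn() unrolls the cell over the list.
x = [tf.placeholder(tf.float32, [batch_size, hidden_size]) for _ in range(seq_length)]
y = tf.placeholder(tf.float32, [batch_size, hidden_size])

cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)    # the 'basic_lstm' network
output, _ = rnn.rnn(cell, x, dtype=tf.float32)
cost = tf.reduce_sum((output[-1] - y) ** 2)         # MSE on the last hidden state
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(cost)
```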
6 | 7 | These results were produced using TensorFlow 0.8, CUDA 7.5, cudnnv5, with the ondemand CPU governor turned off [1], on an Intel(R) Xeon(R) CPU E5-2630L v3 @ 1.80GHz with a Titan X: 8 | 9 | To install TensorFlow from source: 10 | * https://www.tensorflow.org/versions/r0.8/get_started/os_setup.html#installing-from-sources 11 | * http://stackoverflow.com/questions/34239537/how-to-update-tensorflow-from-source 12 | 13 | ## Fast LSTM 14 | 15 | 16 | 17 | ### 30 x 32 x 128 18 | 19 | ``` 20 | $ python rnn.py -n basic_lstm -b 32 -l 128 -s 30 21 | Setup : compile + forward/backward x 1 22 | --- 3.91482686996 seconds 23 | Forward: 24 | --- 32000 samples in 8.53500294685 seconds (3749.266427 samples/s, 0.0002667 s/sample) --- 25 | Forward + Backward: 26 | --- 32000 samples in 26.2391839027 seconds (1219.550125 samples/s, 0.0008200 s/sample) --- 27 | ``` 28 | 29 | ### 30 x 32 x 512 30 | 31 | ``` 32 | $ python rnn.py -n basic_lstm -b 32 -l 512 -s 30 33 | Setup : compile + forward/backward x 1 34 | --- 3.97159981728 seconds 35 | Forward: 36 | --- 32000 samples in 10.3965659142 seconds (3077.939414 samples/s, 0.0003249 s/sample) --- 37 | Forward + Backward: 38 | --- 32000 samples in 28.3808200359 seconds (1127.522036 samples/s, 0.0008869 s/sample) --- 39 | ``` 40 | 41 | ### 30 x 32 x 1024 42 | 43 | 44 | ``` 45 | $ python rnn.py -n basic_lstm -b 32 -l 1024 -s 30 46 | Setup : compile + forward/backward x 1 47 | --- 3.81890392303 seconds 48 | Forward: 49 | --- 32000 samples in 18.0062820911 seconds (1777.157541 samples/s, 0.0005627 s/sample) --- 50 | Forward + Backward: 51 | --- 32000 samples in 42.533454895 seconds (752.348947 samples/s, 0.0013292 s/sample) --- 52 | ``` 53 | 54 | 55 | ### 30 x 128 x 128 56 | 57 | ``` 58 | $ python rnn.py -n basic_lstm -b 128 -l 128 -s 30 59 | Setup : compile + forward/backward x 1 60 | --- 3.68258690834 seconds 61 | Forward: 62 | --- 128000 samples in 8.3175599575 seconds (15389.128621 samples/s, 0.0000650 s/sample) --- 63 | Forward + Backward: 64 | --- 128000 samples in 24.1425020695 seconds (5301.853123 samples/s, 0.0001886 s/sample) --- 65 | 66 | ``` 67 | 68 | ### 30 x 128 x 512 69 | 70 | ``` 71 | $ python rnn.py -n basic_lstm -b 128 -l 512 -s 30 72 | Setup : compile + forward/backward x 1 73 | --- 3.72586607933 seconds 74 | Forward: 75 | --- 128000 samples in 14.6179850101 seconds (8756.336794 samples/s, 0.0001142 s/sample) --- 76 | Forward + Backward: 77 | --- 128000 samples in 32.6627261639 seconds (3918.840067 samples/s, 0.0002552 s/sample) --- 78 | 79 | ``` 80 | 81 | ### 30 x 128 x 1024 82 | 83 | ``` 84 | $ python rnn.py -n basic_lstm -b 128 -l 1024 -s 30 85 | Setup : compile + forward/backward x 1 86 | --- 3.84206986427 seconds 87 | Forward: 88 | --- 128000 samples in 30.4814198017 seconds (4199.279457 samples/s, 0.0002381 s/sample) --- 89 | Forward + Backward: 90 | --- 128000 samples in 75.8014390469 seconds (1688.622295 samples/s, 0.0005922 s/sample) --- 91 | 92 | ``` 93 | 94 | ## RNN 95 | 96 | ### 30 x 32 x 128 97 | 98 | ``` 99 | $ python rnn.py -n rnn -b 32 -l 128 -s 30 100 | Setup : compile + forward/backward x 1 101 | --- 1.6487121582 seconds 102 | Forward: 103 | --- 32000 samples in 3.56794595718 seconds (8968.745711 samples/s, 0.0001115 s/sample) --- 104 | Forward + Backward: 105 | --- 32000 samples in 8.91037988663 seconds (3591.317139 samples/s, 0.0002784 s/sample) --- 106 | ``` 107 | 108 | ### 30 x 32 x 512 109 | 110 | ``` 111 | $ python rnn.py -n rnn -b 32 -l 512 -s 30 112 | Setup : compile + forward/backward x 1 113 | --- 1.62368106842 seconds 114 | Forward:
115 | --- 32000 samples in 6.98823904991 seconds (4579.122118 samples/s, 0.0002184 s/sample) --- 116 | Forward + Backward: 117 | --- 32000 samples in 11.1912858486 seconds (2859.367586 samples/s, 0.0003497 s/sample) --- 118 | ``` 119 | 120 | ### 30 x 32 x 1024 121 | 122 | ``` 123 | $ python rnn.py -n rnn -b 32 -l 1024 -s 30 124 | Setup : compile + forward/backward x 1 125 | --- 1.72744393349 seconds 126 | Forward: 127 | --- 32000 samples in 7.73560094833 seconds (4136.718041 samples/s, 0.0002417 s/sample) --- 128 | Forward + Backward: 129 | --- 32000 samples in 16.9597899914 seconds (1886.815816 samples/s, 0.0005300 s/sample) --- 130 | ``` 131 | 132 | ### 30 x 128 x 128 133 | 134 | ``` 135 | $ python rnn.py -n rnn -b 128 -l 128 -s 30 136 | Setup : compile + forward/backward x 1 137 | --- 1.698335886 seconds 138 | Forward: 139 | --- 128000 samples in 4.29631710052 seconds (29792.959180 samples/s, 0.0000336 s/sample) --- 140 | Forward + Backward: 141 | --- 128000 samples in 9.66468191147 seconds (13244.098582 samples/s, 0.0000755 s/sample) --- 142 | ``` 143 | 144 | ### 30 x 128 x 512 145 | 146 | ``` 147 | $ python rnn.py -n rnn -b 128 -l 512 -s 30 148 | Setup : compile + forward/backward x 1 149 | --- 1.63733696938 seconds 150 | Forward: 151 | --- 128000 samples in 11.1102721691 seconds (11520.869881 samples/s, 0.0000868 s/sample) --- 152 | Forward + Backward: 153 | --- 128000 samples in 16.0786859989 seconds (7960.849538 samples/s, 0.0001256 s/sample) --- 154 | ``` 155 | 156 | ### 30 x 128 x 1024 157 | 158 | ``` 159 | $ python rnn.py -n rnn -b 128 -l 1024 -s 30 160 | Setup : compile + forward/backward x 1 161 | --- 1.7014939785 seconds 162 | Forward: 163 | --- 128000 samples in 17.6321749687 seconds (7259.456092 samples/s, 0.0001378 s/sample) --- 164 | Forward + Backward: 165 | --- 128000 samples in 28.4844169617 seconds (4493.685097 samples/s, 0.0002225 s/sample) --- 166 | 167 | ``` 168 | 169 | 170 | [1] Turning on the performance governor: `sudo bash -c 'for i in /sys/devices/system/cpu/*/cpufreq/scaling_governor; do echo performance > $i; done'` 171 | -------------------------------------------------------------------------------- /tensorflow/rnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import optparse 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.python.ops import rnn 7 | 8 | 9 | def get_feed_dict(x_data, y_data=None): 10 | feed_dict = {} 11 | 12 | if y_data is not None: 13 | feed_dict[y] = y_data 14 | 15 | for i in xrange(x_data.shape[0]): 16 | feed_dict[x[i]] = x_data[i, :, :] 17 | 18 | return feed_dict 19 | 20 | 21 | # Parameters 22 | optparser = optparse.OptionParser() 23 | optparser.add_option("-n", "--network_type", default='rnn', help="Network type (rnn, lstm, basic_lstm)") 24 | optparser.add_option("-l", "--hidden_size", default=100, type='int', help="Hidden layer size") 25 | optparser.add_option("-s", "--seq_length", default=30, type='int', help="Sequence length") 26 | optparser.add_option("-b", "--batch_size", default=20, type='int', help="Batch size") 27 | opts = optparser.parse_args()[0] 28 | 29 | network_type = opts.network_type 30 | print(network_type) 31 | hidden_size = opts.hidden_size 32 | seq_length = opts.seq_length 33 | batch_size = opts.batch_size 34 | 35 | 36 | n_batch = 1000 37 | n_samples = batch_size * n_batch 38 | 39 | # Data 40 | xinput = np.random.rand(seq_length, batch_size, hidden_size).astype(np.float32) 41 | ytarget =
np.random.rand(batch_size, hidden_size).astype(np.float32) 42 | 43 | with tf.device('/gpu:0'): 44 | 45 | x = [tf.placeholder(tf.float32, [batch_size, hidden_size], name="x") for i in range(seq_length)] 46 | y = tf.placeholder(tf.float32, [batch_size, hidden_size], name="y") 47 | 48 | if network_type == 'rnn': 49 | cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size) 50 | elif network_type == 'lstm': 51 | cell = tf.nn.rnn_cell.LSTMCell(hidden_size, hidden_size) 52 | elif network_type == 'basic_lstm': 53 | cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size) 54 | else: 55 | raise Exception('Unknown network! '+network_type) 56 | 57 | print "Compiling..." 58 | start = time.time() 59 | output, _cell_state = rnn.rnn(cell, x, dtype=tf.float32) 60 | cost = tf.reduce_sum((output[-1] - y) ** 2) 61 | 62 | optim = tf.train.GradientDescentOptimizer(0.01) 63 | train_op = optim.minimize(cost) 64 | 65 | session = tf.Session() 66 | session.run(tf.initialize_all_variables()) 67 | session.run(train_op, feed_dict=get_feed_dict(xinput, ytarget)) 68 | print "Setup : compile + forward/backward x 1" 69 | print "--- %s seconds" % (time.time() - start) 70 | 71 | start = time.time() 72 | for i in xrange(0, n_batch): 73 | session.run(output[-1], feed_dict=get_feed_dict(xinput)) 74 | end = time.time() 75 | print "Forward:" 76 | print "--- %i samples in %s seconds (%f samples/s, %.7f s/sample) ---" % (n_samples, end - start, n_samples / (end - start), (end - start) / n_samples) 77 | 78 | start = time.time() 79 | for i in xrange(0, n_batch): 80 | session.run(train_op, feed_dict=get_feed_dict(xinput, ytarget)) 81 | end = time.time() 82 | print "Forward + Backward:" 83 | print "--- %i samples in %s seconds (%f samples/s, %.7f s/sample) ---" % (n_samples, end - start, n_samples / (end - start), (end - start) / n_samples) 84 | -------------------------------------------------------------------------------- /theano/README.md: -------------------------------------------------------------------------------- 1 | # Theano Benchmark Log 2 | 3 | Cuda 7.5, cudnnv5, Intel(R) Xeon(R) CPU E5-2630L v3 @ 1.80GHz, Titan X. 4 | 5 | ## Fast LSTM 6 | 7 | 8 | 9 | ### 30 x 32 x 128 10 | ``` 11 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'fastlstm' -l 128 -s 30 -b 32 12 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 13 | Compiling... 14 | Setup : compile + forward/backward x 1 15 | --- 7.45822191238 seconds 16 | Forward: 17 | --- 32000 samples in 3.17055702209 seconds (10092.863739 samples/s, 0.0000991 s/sample) --- 18 | Forward + Backward: 19 | --- 32000 samples in 9.26702213287 seconds (3453.104950 samples/s, 0.0002896 s/sample) --- 20 | ``` 21 | ### 30 x 32 x 512 22 | 23 | 24 | ``` 25 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'fastlstm' -l 512 -s 30 -b 32 26 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 27 | Compiling... 28 | Setup : compile + forward/backward x 1 29 | --- 7.58512711525 seconds 30 | Forward: 31 | --- 32000 samples in 6.42896199226 seconds (4977.475374 samples/s, 0.0002009 s/sample) --- 32 | Forward + Backward: 33 | --- 32000 samples in 19.8206739426 seconds (1614.475880 samples/s, 0.0006194 s/sample) --- 34 | ``` 35 | 36 | ### 30 x 32 x 1024 37 | 38 | ``` 39 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'fastlstm' -l 1024 -s 30 -b 32 40 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 41 | Compiling... 
42 | Setup : compile + forward/backward x 1 43 | --- 9.6281080246 seconds 44 | Forward: 45 | --- 32000 samples in 10.3716170788 seconds (3085.343371 samples/s, 0.0003241 s/sample) --- 46 | Forward + Backward: 47 | --- 32000 samples in 32.4317178726 seconds (986.688406 samples/s, 0.0010135 s/sample) --- 48 | ``` 49 | 50 | ### 30 x 128 x 128 51 | 52 | 53 | ``` 54 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'fastlstm' -l 128 -s 30 -b 128 55 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 56 | Compiling... 57 | Setup : compile + forward/backward x 1 58 | --- 7.37970685959 seconds 59 | Forward: 60 | --- 128000 samples in 3.27810716629 seconds (39046.923577 samples/s, 0.0000256 s/sample) --- 61 | Forward + Backward: 62 | --- 128000 samples in 13.1759991646 seconds (9714.633281 samples/s, 0.0001029 s/sample) -- 63 | ``` 64 | 65 | ### 30 x 128 x 512 66 | 67 | ``` 68 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'fastlstm' -l 512 -s 30 -b 128 69 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 70 | Compiling... 71 | Setup : compile + forward/backward x 1 72 | --- 7.49780893326 seconds 73 | Forward: 74 | --- 128000 samples in 8.03891611099 seconds (15922.544561 samples/s, 0.0000628 s/sample) --- 75 | Forward + Backward: 76 | --- 128000 samples in 32.7736029625 seconds (3905.582189 samples/s, 0.0002560 s/sample) --- 77 | 78 | ``` 79 | 80 | ### 30 x 128 x 1024 81 | 82 | 83 | ``` 84 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'fastlstm' -l 1024 -s 30 -b 128 85 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 86 | Compiling... 87 | Setup : compile + forward/backward x 1 88 | --- 7.44703698158 seconds 89 | Forward: 90 | --- 128000 samples in 20.5059478283 seconds (6242.091371 samples/s, 0.0001602 s/sample) --- 91 | Forward + Backward: 92 | --- 128000 samples in 74.6807880402 seconds (1713.961560 samples/s, 0.0005834 s/sample) --- 93 | ``` 94 | 95 | ## RNN 96 | 97 | 98 | ### 30 x 32 x 128 99 | 100 | ``` 101 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'rnn' -l 128 -s 30 -b 32 102 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 103 | Compiling... 104 | Setup : compile + forward/backward x 1 105 | --- 4.309237957 seconds 106 | Forward: 107 | --- 32000 samples in 0.989920139313 seconds (32325.839963 samples/s, 0.0000309 s/sample) --- 108 | Forward + Backward: 109 | --- 32000 samples in 3.34791088104 seconds (9558.199467 samples/s, 0.0001046 s/sample) --- 110 | ``` 111 | 112 | ### 30 x 32 x 512 113 | 114 | 115 | ``` 116 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'rnn' -l 512 -s 30 -b 32 117 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 118 | Compiling... 119 | Setup : compile + forward/backward x 1 120 | --- 4.36186599731 seconds 121 | Forward: 122 | --- 32000 samples in 3.27020597458 seconds (9785.316353 samples/s, 0.0001022 s/sample) --- 123 | Forward + Backward: 124 | --- 32000 samples in 8.80706095695 seconds (3633.448225 samples/s, 0.0002752 s/sample) --- 125 | ``` 126 | 127 | ### 30 x 32 x 1024 128 | 129 | ``` 130 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'rnn' -l 1024 -s 30 -b 32 131 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 132 | Compiling... 
133 | Setup : compile + forward/backward x 1 134 | --- 4.44132804871 seconds 135 | Forward: 136 | --- 32000 samples in 5.74468803406 seconds (5570.363405 samples/s, 0.0001795 s/sample) --- 137 | Forward + Backward: 138 | --- 32000 samples in 14.2010200024 seconds (2253.359265 samples/s, 0.0004438 s/sample) --- 139 | 140 | ``` 141 | 142 | ### 30 x 128 x 128 143 | 144 | ``` 145 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'rnn' -l 128 -s 30 -b 128 146 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 147 | Compiling... 148 | Setup : compile + forward/backward x 1 149 | --- 4.48347306252 seconds 150 | Forward: 151 | --- 128000 samples in 1.74959516525 seconds (73159.781498 samples/s, 0.0000137 s/sample) --- 152 | Forward + Backward: 153 | --- 128000 samples in 5.81079101562 seconds (22027.982018 samples/s, 0.0000454 s/sample) --- 154 | 155 | ``` 156 | 157 | ### 30 x 128 x 512 158 | 159 | ``` 160 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'rnn' -l 512 -s 30 -b 128 161 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 162 | Compiling... 163 | Setup : compile + forward/backward x 1 164 | --- 4.40771007538 seconds 165 | Forward: 166 | --- 128000 samples in 3.04104089737 seconds (42090.851231 samples/s, 0.0000238 s/sample) --- 167 | Forward + Backward: 168 | --- 128000 samples in 10.1157169342 seconds (12653.576690 samples/s, 0.0000790 s/sample) --- 169 | ``` 170 | 171 | ### 30 x 128 x 1024 172 | 173 | ``` 174 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python rnn.py -n 'rnn' -l 1024 -s 30 -b 128 175 | Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN not available) 176 | Compiling... 177 | Setup : compile + forward/backward x 1 178 | --- 4.38037991524 seconds 179 | Forward: 180 | --- 128000 samples in 6.43677687645 seconds (19885.728907 samples/s, 0.0000503 s/sample) --- 181 | Forward + Backward: 182 | --- 128000 samples in 18.919303894 seconds (6765.576615 samples/s, 0.0001478 s/sample) --- 183 | ``` 184 | -------------------------------------------------------------------------------- /theano/rnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import optparse 4 | import numpy as np 5 | import theano 6 | import theano.tensor as T 7 | 8 | 9 | def random_weights(shape): 10 | drange = np.sqrt(6. / (np.sum(shape))) 11 | return drange * np.random.uniform(low=-1.0, high=1.0, size=shape) 12 | 13 | 14 | def create_shared(value, name): 15 | return theano.shared(value=np.array(value, dtype=np.float32), name=name) 16 | 17 | 18 | class RNN(object): 19 | """ 20 | Recurrent neural network. Can be used with or without batches. 
21 | Without batches: 22 | Input: matrix of dimension (sequence_length, input_dim) 23 | Output: vector of dimension (hidden_dim) 24 | With batches: 25 | Input: tensor3 of dimension (batch_size, sequence_length, input_dim) 26 | Output: matrix of dimension (batch_size, hidden_dim) 27 | """ 28 | def __init__(self, input_dim, hidden_dim, activation=T.nnet.sigmoid, 29 | with_batch=True, name='RNN'): 30 | self.input_dim = input_dim 31 | self.hidden_dim = hidden_dim 32 | self.activation = activation 33 | self.with_batch = with_batch 34 | self.name = name 35 | 36 | self.w_x = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_x') 37 | self.w_h = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_h') 38 | 39 | self.b_h = create_shared(np.zeros((hidden_dim,)), name + '__b_h') 40 | self.h_0 = create_shared(np.zeros((hidden_dim,)), name + '__h_0') 41 | 42 | self.params = [self.w_x, self.w_h, self.b_h, self.h_0] 43 | 44 | def link(self, input): 45 | """ 46 | Propagate the input through the network and return the last hidden vector. 47 | The whole sequence is also accessible through self.h 48 | """ 49 | def recurrence(x_t, h_tm1): 50 | return self.activation(x_t + T.dot(h_tm1, self.w_h) + self.b_h)  # x_t already contains T.dot(x, self.w_x), computed once below 51 | 52 | # If we used batches, we have to permute the first and second dimension. 53 | if self.with_batch: 54 | self.input = input.dimshuffle(1, 0, 2) 55 | outputs_info = T.alloc(self.h_0, self.input.shape[1], self.hidden_dim) 56 | else: 57 | self.input = input 58 | outputs_info = self.h_0 59 | 60 | h, _ = theano.scan( 61 | fn=recurrence, 62 | sequences=T.dot(self.input, self.w_x), 63 | outputs_info=outputs_info, 64 | n_steps=self.input.shape[0] 65 | ) 66 | self.h = h 67 | self.output = h[-1] 68 | 69 | return self.output 70 | 71 | 72 | class LSTM(object): 73 | """ 74 | Long short-term memory (LSTM). Can be used with or without batches. 75 | Without batches: 76 | Input: matrix of dimension (sequence_length, input_dim) 77 | Output: vector of dimension (hidden_dim) 78 | With batches: 79 | Input: tensor3 of dimension (batch_size, sequence_length, input_dim) 80 | Output: matrix of dimension (batch_size, hidden_dim) 81 | """ 82 | 83 | def __init__(self, input_dim, hidden_dim, with_batch=True, name='LSTM'): 84 | """ 85 | Initialize neural network.
86 | """ 87 | self.input_dim = input_dim 88 | self.hidden_dim = hidden_dim 89 | self.with_batch = with_batch 90 | self.name = name 91 | 92 | # Input gate weights 93 | self.w_xi = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xi') 94 | self.w_hi = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_hi') 95 | self.w_ci = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_ci') 96 | 97 | # Forget gate weights 98 | self.w_xf = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xf') 99 | self.w_hf = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_hf') 100 | self.w_cf = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_cf') 101 | 102 | # Output gate weights 103 | self.w_xo = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xo') 104 | self.w_ho = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_ho') 105 | self.w_co = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_co') 106 | 107 | # Cell weights 108 | self.w_xc = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xc') 109 | self.w_hc = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_hc') 110 | 111 | # Initialize the bias vectors, c_0 and h_0 to zero vectors 112 | self.b_i = create_shared(np.zeros((hidden_dim,)), name + '__b_i') 113 | self.b_f = create_shared(np.zeros((hidden_dim,)), name + '__b_f') 114 | self.b_c = create_shared(np.zeros((hidden_dim,)), name + '__b_c') 115 | self.b_o = create_shared(np.zeros((hidden_dim,)), name + '__b_o') 116 | self.c_0 = create_shared(np.zeros((hidden_dim,)), name + '__c_0') 117 | self.h_0 = create_shared(np.zeros((hidden_dim,)), name + '__h_0') 118 | 119 | # Define parameters 120 | self.params = [self.w_xi, self.w_hi, # self.w_ci, 121 | self.w_xf, self.w_hf, # self.w_cf, 122 | self.w_xo, self.w_ho, # self.w_co, 123 | self.w_xc, self.w_hc, 124 | self.b_i, self.b_c, self.b_o, self.b_f, 125 | ] # self.c_0, self.h_0] 126 | 127 | def link(self, input): 128 | """ 129 | Propagate the input through the network and return the last hidden vector. 130 | The whole sequence is also accessible through self.h 131 | """ 132 | 133 | def recurrence(x_t, c_tm1, h_tm1): 134 | i_t = T.nnet.sigmoid(T.dot(x_t, self.w_xi) + T.dot(h_tm1, self.w_hi) + self.b_i) # + T.dot(c_tm1, self.w_ci) 135 | f_t = T.nnet.sigmoid(T.dot(x_t, self.w_xf) + T.dot(h_tm1, self.w_hf) + self.b_f) # + T.dot(c_tm1, self.w_cf) 136 | c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, self.w_xc) + T.dot(h_tm1, self.w_hc) + self.b_c) 137 | o_t = T.nnet.sigmoid(T.dot(x_t, self.w_xo) + T.dot(h_tm1, self.w_ho) + self.b_o) # + T.dot(c_t, self.w_co) 138 | h_t = o_t * T.tanh(c_t) 139 | return [c_t, h_t] 140 | 141 | # If we used batches, we have to permute the first and second dimension. 142 | if self.with_batch: 143 | self.input = input.dimshuffle(1, 0, 2) 144 | outputs_info = [T.alloc(x, self.input.shape[1], self.hidden_dim) for x in [self.c_0, self.h_0]] 145 | else: 146 | self.input = input 147 | outputs_info = [self.c_0, self.h_0] 148 | 149 | [c, h], _ = theano.scan( 150 | fn=recurrence, 151 | sequences=self.input, 152 | outputs_info=outputs_info, 153 | n_steps=self.input.shape[0] 154 | ) 155 | self.c = c 156 | self.h = h 157 | self.output = h[-1] 158 | 159 | return self.output 160 | 161 | 162 | class FastLSTM(object): 163 | """ 164 | LSTM with faster implementation (supposedly). 
165 | Not as expressive as the previous one though, because it doesn't include the peephole connections. 166 | """ 167 | def __init__(self, input_dim, hidden_dim, with_batch=True, name='LSTM'): 168 | """ 169 | Initialize neural network. 170 | """ 171 | self.input_dim = input_dim 172 | self.hidden_dim = hidden_dim 173 | self.with_batch = with_batch 174 | self.name = name 175 | 176 | self.W = create_shared(random_weights((input_dim, hidden_dim * 4)), name + 'W') 177 | self.U = create_shared(random_weights((hidden_dim, hidden_dim * 4)), name + 'U') 178 | self.b = create_shared(random_weights((hidden_dim * 4, )), name + 'b') 179 | 180 | self.c_0 = create_shared(np.zeros((hidden_dim,)), name + '__c_0') 181 | self.h_0 = create_shared(np.zeros((hidden_dim,)), name + '__h_0') 182 | 183 | self.params = [self.W, self.U, self.b] 184 | 185 | def link(self, input): 186 | """ 187 | Propagate the input through the network and return the last hidden vector. 188 | The whole sequence is also accessible through self.h 189 | """ 190 | def split(x, n, dim): 191 | return x[:, n*dim:(n+1)*dim] 192 | 193 | def recurrence(x_t, c_tm1, h_tm1): 194 | p = x_t + T.dot(h_tm1, self.U)  # x_t already contains T.dot(x, self.W) + self.b 195 | i = T.nnet.sigmoid(split(p, 0, self.hidden_dim)) 196 | f = T.nnet.sigmoid(split(p, 1, self.hidden_dim)) 197 | o = T.nnet.sigmoid(split(p, 2, self.hidden_dim)) 198 | c = T.tanh(split(p, 3, self.hidden_dim)) 199 | c = f * c_tm1 + i * c 200 | h = o * T.tanh(c) 201 | return c, h 202 | 203 | preact = T.dot(input.dimshuffle(1, 0, 2), self.W) + self.b  # (seq_length, batch_size, 4 * hidden_dim) 204 | outputs_info = [T.alloc(x, input.shape[0], self.hidden_dim) for x in [self.c_0, self.h_0]] 205 | 206 | [_, h], _ = theano.scan( 207 | fn=recurrence, 208 | sequences=preact, 209 | outputs_info=outputs_info, 210 | n_steps=input.shape[1] 211 | ) 212 | self.h = h 213 | self.output = h[-1] 214 | 215 | return self.output 216 | 217 | 218 | # Parameters 219 | 220 | optparser = optparse.OptionParser() 221 | optparser.add_option("-n", "--network_type", default='rnn', help="Network type (rnn, lstm, fastlstm)") 222 | optparser.add_option("-l", "--hidden_size", default=128, type='int', help="Hidden layer size") 223 | optparser.add_option("-s", "--seq_length", default=30, type='int', help="Sequence length") 224 | optparser.add_option("-b", "--batch_size", default=32, type='int', help="Batch size") 225 | opts = optparser.parse_args()[0] 226 | 227 | network_type = opts.network_type 228 | hidden_size = opts.hidden_size 229 | seq_length = opts.seq_length 230 | batch_size = opts.batch_size 231 | 232 | 233 | # Data 234 | 235 | n_batch = 1000 236 | xinput = theano.shared(np.random.rand(seq_length, batch_size, hidden_size).astype(np.float32)) 237 | ytarget = theano.shared(np.random.rand(batch_size, hidden_size).astype(np.float32)) 238 | 239 | 240 | # Network 241 | 242 | start = time.time() 243 | 244 | x = T.ftensor3() 245 | y = T.fmatrix() 246 | 247 | 248 | if network_type == 'rnn': 249 | rnn = RNN(hidden_size, hidden_size) 250 | elif network_type == 'lstm': 251 | rnn = LSTM(hidden_size, hidden_size) 252 | elif network_type == 'fastlstm': 253 | rnn = FastLSTM(hidden_size, hidden_size) 254 | else: 255 | raise Exception('Unknown network!') 256 | output = rnn.link(x.dimshuffle(1, 0, 2)) 257 | 258 | cost = ((output - y) ** 2).mean() 259 | updates = [(p, p - theano.shared(np.float32(0.01)) * g) for p, g in zip(rnn.params, T.grad(cost, rnn.params))] 260 | 261 | print 'Compiling...'
262 | f_test = theano.function(inputs=[], outputs=output, givens={x: xinput}) 263 | f_train = theano.function(inputs=[], outputs=cost, updates=updates, givens={x: xinput, y: ytarget}) 264 | f_train() 265 | theano.sandbox.cuda.synchronize() 266 | print "Setup : compile + forward/backward x 1" 267 | print "--- %s seconds" % (time.time() - start) 268 | 269 | n_samples = n_batch * batch_size 270 | start = time.time() 271 | for i in xrange(0, n_batch): 272 | f_test() 273 | theano.sandbox.cuda.synchronize() 274 | end = time.time() 275 | print "Forward:" 276 | print "--- %i samples in %s seconds (%f samples/s, %.7f s/sample) ---" % (n_samples, end - start, n_samples / (end - start), (end - start) / n_samples) 277 | 278 | start = time.time() 279 | for i in xrange(0, n_batch): 280 | # if k % 100 == 0: 281 | # print k 282 | f_train() 283 | theano.sandbox.cuda.synchronize() 284 | end = time.time() 285 | print "Forward + Backward:" 286 | print "--- %i samples in %s seconds (%f samples/s, %.7f s/sample) ---" % (n_samples, end - start, n_samples / (end - start), (end - start) / n_samples) 287 | -------------------------------------------------------------------------------- /torch/README.md: -------------------------------------------------------------------------------- 1 | # Torch Benchmark 2 | 3 | Provided by [Nicholas Leonard](https://github.com/nicholas-leonard). 4 | 5 | Benchmark script uses [Element-Research/rnn](https://github.com/Element-Research/rnn). 6 | 7 | Lua 5.2, Cuda 7.5, cudnnv5, Intel(R) Xeon(R) CPU E5-2630L v3 @ 1.80GHz, Titan X: 8 | 9 | 10 | ## Fast LSTM 11 | 12 | 13 | ### 30 x 32 x 128 14 | 15 | ``` 16 | $ th rnn.lua -network 'fastlstm' -batchsize 32 -hiddensize 128 -seqlen 30 17 | Setup : compile + forward/backward x 1 18 | --- 0.024899005889893 seconds --- 19 | Forward: 20 | --- 32000 samples in 3.1959130764008 seconds (10012.885074946 samples/s, 99.871315062046 microsec/samples) --- 21 | Forward + Backward: 22 | --- 32000 samples in 13.899139881134 seconds (2302.3021987976 samples/s, 434.34784561396 microsec/samples) --- 23 | ``` 24 | 25 | ### 30 x 32 x 512 26 | 27 | ``` 28 | $ th rnn.lua -network 'fastlstm' -batchsize 32 -hiddensize 512 -seqlen 30 29 | Setup : compile + forward/backward x 1 30 | --- 0.18875980377197 seconds --- 31 | Forward: 32 | --- 32000 samples in 6.4531669616699 seconds (4958.8108406272 samples/s, 201.66125148535 microsec/samples) --- 33 | Forward + Backward: 34 | --- 32000 samples in 19.541891098022 seconds (1637.5083655011 samples/s, 610.68390309811 microsec/samples) --- 35 | ``` 36 | 37 | ### 30 x 32 x 1024 38 | 39 | ``` 40 | $ th rnn.lua -network 'fastlstm' -batchsize 32 -hiddensize 1024 -seqlen 30 41 | Setup : compile + forward/backward x 1 42 | --- 0.69268393516541 seconds --- 43 | Forward: 44 | --- 32000 samples in 11.082577943802 seconds (2887.4174470646 samples/s, 346.33024781942 microsec/samples) --- 45 | Forward + Backward: 46 | --- 32000 samples in 36.474525928497 seconds (877.32484315331 samples/s, 1139.8286595941 microsec/samples) --- 47 | ``` 48 | 49 | ### 30 x 128 x 128 50 | 51 | 52 | ``` 53 | $ th rnn.lua -network 'fastlstm' -batchsize 128 -hiddensize 128 -seqlen 30 54 | Setup : compile + forward/backward x 1 55 | --- 0.028716802597046 seconds --- 56 | Forward: 57 | --- 128000 samples in 3.2250719070435 seconds (39689.110895787 samples/s, 25.195827707648 microsec/samples) --- 58 | Forward + Backward: 59 | --- 128000 samples in 14.058291912079 seconds (9104.9498912316 samples/s, 109.83036831021 microsec/samples) --- 60 | ``` 61 | 62 | ### 30 
x 128 x 512 63 | 64 | 65 | ``` 66 | $ th rnn.lua -network 'fastlstm' -batchsize 128 -hiddensize 512 -seqlen 30 67 | Setup : compile + forward/backward x 1 68 | --- 0.19667100906372 seconds --- 69 | Forward: 70 | --- 128000 samples in 6.5813970565796 seconds (19448.779340937 samples/s, 51.417108625174 microsec/samples) --- 71 | Forward + Backward: 72 | --- 128000 samples in 27.426359891891 seconds (4667.0445070921 samples/s, 214.26836587489 microsec/samples) --- 73 | ``` 74 | 75 | ### 30 x 128 x 1024 76 | 77 | ``` 78 | $ th rnn.lua -network 'fastlstm' -batchsize 128 -hiddensize 1024 -seqlen 30 79 | Setup : compile + forward/backward x 1 80 | --- 0.74531388282776 seconds --- 81 | Forward: 82 | --- 128000 samples in 14.383507966995 seconds (8899.0845165442 samples/s, 112.37110942602 microsec/samples) --- 83 | Forward + Backward: 84 | --- 128000 samples in 71.433391094208 seconds (1791.8792478834 samples/s, 558.07331949472 microsec/samples) --- 85 | ``` 86 | 87 | 88 | 89 | 90 | ## RNN 91 | 92 | 93 | ### 30 x 32 x 128 94 | 95 | ``` 96 | $ th rnn.lua -network 'rnn' -batchsize 32 -hiddensize 128 -seqlen 30 97 | Setup : compile + forward/backward x 1 98 | --- 0.045458793640137 seconds --- 99 | Forward: 100 | --- 32000 samples in 3.2980129718781 seconds (9702.8295844296 samples/s, 103.06271910667 microsec/samples) --- 101 | Forward + Backward: 102 | --- 32000 samples in 8.305154800415 seconds (3853.0314888602 samples/s, 259.53590124846 microsec/samples) --- 103 | 104 | ``` 105 | 106 | ### 30 x 32 x 512 107 | 108 | ``` 109 | $ th rnn.lua -network 'rnn' -batchsize 32 -hiddensize 512 -seqlen 30 110 | Setup : compile + forward/backward x 1 111 | --- 0.053925037384033 seconds --- 112 | Forward: 113 | --- 32000 samples in 3.6663720607758 seconds (8727.9910213711 samples/s, 114.57390338182 microsec/samples) --- 114 | Forward + Backward: 115 | --- 32000 samples in 9.2218749523163 seconds (3470.0127856443 samples/s, 288.18337619305 microsec/samples) --- 116 | ``` 117 | 118 | ### 30 x 32 x 1024 119 | 120 | ``` 121 | $ th rnn.lua -network 'rnn' -batchsize 32 -hiddensize 1024 -seqlen 30 122 | Setup : compile + forward/backward x 1 123 | --- 0.08701491355896 seconds --- 124 | Forward: 125 | --- 32000 samples in 3.8027799129486 seconds (8414.9119629321 samples/s, 118.83665621281 microsec/samples) --- 126 | Forward + Backward: 127 | --- 32000 samples in 12.205145835876 seconds (2621.8464374057 samples/s, 381.4105913043 microsec/samples) --- 128 | ``` 129 | 130 | ### 30 x 128 x 128 131 | 132 | ``` 133 | $ th rnn.lua -network 'rnn' -batchsize 128 -hiddensize 128 -seqlen 30 134 | Setup : compile + forward/backward x 1 135 | --- 0.078629016876221 seconds --- 136 | Forward: 137 | --- 128000 samples in 4.1859209537506 seconds (30578.752442332 samples/s, 32.702445983887 microsec/samples) --- 138 | Forward + Backward: 139 | --- 128000 samples in 8.6592428684235 seconds (14781.904624814 samples/s, 67.650280892849 microsec/samples) --- 140 | ``` 141 | 142 | ### 30 x 128 x 512 143 | 144 | ``` 145 | $ th rnn.lua -network 'rnn' -batchsize 128 -hiddensize 512 -seqlen 30 146 | Setup : compile + forward/backward x 1 147 | --- 0.088251113891602 seconds --- 148 | Forward: 149 | --- 128000 samples in 4.383120059967 seconds (29203.014867419 samples/s, 34.243039786816 microsec/samples) --- 150 | Forward + Backward: 151 | --- 128000 samples in 9.4049069881439 seconds (13609.928358313 samples/s, 73.475772514939 microsec/samples) --- 152 | ``` 153 | 154 | ### 30 x 128 x 1024 155 | 156 | ``` 157 | $ th rnn.lua -network 'rnn' -batchsize 
128 -hiddensize 1024 -seqlen 30 158 | Setup : compile + forward/backward x 1 159 | --- 0.12880301475525 seconds --- 160 | Forward: 161 | --- 128000 samples in 8.2753868103027 seconds (15467.566064044 samples/s, 64.651412889361 microsec/samples) --- 162 | Forward + Backward: 163 | --- 128000 samples in 19.230028152466 seconds (6656.2610449056 samples/s, 150.23449249566 microsec/samples) --- 164 | ``` 165 | 166 | -------------------------------------------------------------------------------- /torch/rnn.lua: -------------------------------------------------------------------------------- 1 | require('torch') 2 | require('cutorch') 3 | require('nn') 4 | require('cunn') 5 | require('rnn') 6 | require('nngraph') 7 | 8 | -- Should produce a speed increase. 9 | nn.FastLSTM.usenngraph = true 10 | 11 | -- cutorch.setDevice(2) 12 | 13 | cmd = torch.CmdLine() 14 | cmd:text() 15 | cmd:text('Options') 16 | cmd:option('-nbatch', 1000, 'Number of batches') 17 | cmd:option('-network', 'fastlstm', 'Network type') 18 | cmd:option('-hiddensize', 128, 'Neural network input and output size') 19 | cmd:option('-seqlen', 30, 'Sequence length') 20 | cmd:option('-batchsize', 20, 'Batch size') 21 | cmd:text() 22 | for k, v in pairs(cmd:parse(arg)) do _G[k] = v end 23 | 24 | local input = torch.rand(seqlen, batchsize, hiddensize):cuda() 25 | local target = torch.rand(batchsize, hiddensize):cuda() 26 | 27 | local a = torch.Timer() 28 | local rnn 29 | if network == 'rnn' then 30 | rnn = nn.Sequential() 31 | :add(nn.JoinTable(1,1)) 32 | :add(nn.Linear(hiddensize*2, hiddensize)) 33 | :add(nn.Sigmoid()) 34 | rnn = nn.Recurrence(rnn, hiddensize, 1) 35 | rnn = nn.Sequential() 36 | :add(nn.Sequencer(rnn)) 37 | :add(nn.Select(1,-1)) 38 | elseif network == 'lstm' then -- (no peephole connections) 39 | rnn = nn.LSTM(hiddensize, hiddensize) 40 | rnn = nn.Sequential() 41 | :add(nn.Sequencer(rnn)) 42 | :add(nn.Select(1,-1)) 43 | elseif network == 'oldfastlstm' then -- (no peephole connections) 44 | rnn = nn.FastLSTM(hiddensize, hiddensize) 45 | rnn = nn.Sequential() 46 | :add(nn.Sequencer(rnn)) 47 | :add(nn.Select(1,-1)) 48 | elseif network == 'fastlstm' then -- like fastlstm but faster (no peephole connections) 49 | rnn = nn.SeqLSTM(hiddensize, hiddensize) 50 | rnn = nn.Sequential() 51 | :add(rnn) 52 | :add(nn.Select(1,-1)) 53 | else 54 | error('Unknown network type!') 55 | end 56 | 57 | local criterion = nn.MSECriterion() 58 | if cpu ~= true then -- 'cpu' is never set by the options above, so this always runs on the GPU 59 | rnn:cuda() 60 | criterion:cuda() 61 | end 62 | 63 | criterion:forward(rnn:forward(input), target) 64 | rnn:backward(input, criterion:backward(rnn.output, target)) 65 | cutorch.synchronize() 66 | print("Setup : compile + forward/backward x 1") 67 | print("--- " .. a:time().real .. " seconds ---") 68 | 69 | a:reset() 70 | for i = 1, nbatch do 71 | rnn:forward(input) 72 | end 73 | cutorch.synchronize() 74 | print("Forward:") 75 | local nSamples = nbatch * batchsize 76 | local speed = nSamples / a:time().real 77 | print("--- " .. nSamples .. " samples in " .. a:time().real .. " seconds (" .. speed .. " samples/s, " .. 1000000/speed .. " microsec/samples) ---") 78 | 79 | a:reset() 80 | for i = 1, nbatch do 81 | criterion:forward(rnn:forward(input), target) 82 | rnn:zeroGradParameters() 83 | rnn:backward(input, criterion:backward(rnn.output, target)) 84 | rnn:updateParameters(0.01) 85 | end 86 | cutorch.synchronize() 87 | print("Forward + Backward:") 88 | local speed = nSamples / a:time().real 89 | print("--- " .. nSamples .. " samples in " .. a:time().real .. " seconds (" .. speed ..
" samples/s, " .. 1000000/speed .. " microsec/samples) ---") 90 | --------------------------------------------------------------------------------