├── .gitignore
├── README.txt
├── c
│   ├── Makefile
│   └── aa.cc
├── eblearn
│   ├── Makefile
│   ├── convnet.cc
│   ├── convnet256.cc
│   ├── convnet96.cc
│   ├── mlp.cc
│   ├── mnist_example.cc
│   └── run.sh
├── numpy
│   ├── aa_numpy.py
│   ├── logreg.py
│   ├── mlp.py
│   ├── rbm.py
│   └── run.sh
├── reports
│   ├── ascii.py
│   ├── build_csv.py
│   ├── show_csv.py
│   └── task_pdfs.py
├── theano
│   ├── aa.py
│   ├── control.py
│   ├── convnet.py
│   ├── mlp.py
│   ├── rbm.py
│   └── run.sh
├── torch5
│   ├── MiniBatchGradient.lua
│   ├── mlp.lua
│   ├── mlp_minibatch.lua
│   └── run.sh
└── torch7
    ├── .gitignore
    ├── README.txt
    ├── SpatialConvolutionFast.lua
    ├── add_to_db.py
    ├── benchmark.lua
    ├── cudahacks.lua
    └── run.sh
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
code/mnist.pkl.gz
html
*.pyc
*.swp
*.x
*~
*.bmark
db.pkl
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
Intro
=====

The benchmarking folder contains efforts to benchmark Theano against various
other systems. Each subfolder corresponds to a particular style of
implementation. Since there is a variety of benchmark problems and of software
systems, there is no standard way to run the whole benchmark suite. There is,
however, a standard for how each benchmark should report its results: every
benchmark run should produce one or more files with the results of
benchmarking. These files must end with the extension '.bmark', and must
contain tab-separated 'csv' lines of the form:

task<TAB>implementation name<TAB>examples/second


Current Tasks
=============

Dense
-----

mlp_784_10
 - training on 10K MNIST-sized examples with unregularized logistic regression (crossentropy / NLL error)

mlp_784_500_10
 - training on 10K examples with a single-hidden-layer model with 500 hidden units

mlp_784_1000_1000_1000_10
 - training on 10K examples with multiple hidden layers

cd1 rbm_bernoulli 1024_1024
 - train an RBM on 10K 1024-dimensional inputs

daa_1024_1024
 - train a denoising autoassociator on 10K 1024-dimensional inputs

Convolutional
-------------

ConvSmall
 - train on 10K 32x32 inputs, as in LeNet5
 - convnet_32x32_c5x5_s2x2_c5x5_s2x2_120_10

ConvMed
 - train on 10K 96x96 images

ConvLarge
 - train on 10K 256x256 images


Potential Tasks
===============

Dense
-----

mlp_32_10
 - training on 10K tiny examples with unregularized logistic regression (crossentropy / NLL error)

mlp_784_10 with L1
 - training on 10K examples with L1 regularization

mlp_784_10 with L2
 - training on 10K examples with L2 regularization

aa_64_64
 - train an autoassociator on 10K 64-dimensional inputs

aa_1024_1024
 - train an autoassociator on 10K 1024-dimensional inputs

cd1 rbm_bernoulli 64_64
 - train an RBM on 10K 64-dimensional inputs

Convolutional
-------------

LeNet5_32x32x3
 - train on 10K Tiny-Image-sized inputs (in color)

conv_daa_i32x32_f7x7
 - train a convolutional denoising autoassociator (32x32 inputs, 7x7 filters)

conv_daa_i256x256_f9x9
 - train a convolutional denoising autoassociator (256x256 inputs, 9x9 filters)

Recurrent
---------
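To make the '.bmark' convention above concrete, here is a minimal sketch (not
part of the suite; the file name and entry values are made up) of writing one
result line and parsing it back the same way reports/build_csv.py does:

# Python 2, matching the scripts in this repository
# write one result line: task <TAB> implementation name <TAB> examples/second
f = open('examplehost_mlp.bmark', 'w')
f.write('mlp_784_500_10\ttheano{gpu/float/60}\t%.2f\n' % 1234.5)
f.close()

# read it back as build_csv.py does
for line in open('examplehost_mlp.bmark'):
    task, impl, speed = line[:-1].split('\t')[:3]
    print task, impl, float(speed)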
--------------------------------------------------------------------------------
/c/Makefile:
--------------------------------------------------------------------------------
aa.x : aa.cc
	g++ -O3 -ffast-math aa.cc -o aa.x -L${PUB_PREFIX}/lib -lgsl ${THEANO_BLAS_LDFLAGS}

clean :
	rm aa.x
--------------------------------------------------------------------------------
/c/aa.cc:
--------------------------------------------------------------------------------
/*
 *
 * g++ -O2 -ffast-math -I$PUB_PREFIX/include aa.cc -o aa.x -lgsl -lgslcblas
 *
 * g++ -O2 -ffast-math -I$PUB_PREFIX/include aa.cc -o aa.x -L$PUB_PREFIX/lib -lgsl -lcblas -lgoto -lgfortran
 *
 * ./aa.x 10 5 7 1000
 *
 * */
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include <gsl/gsl_blas.h>
#include <gsl/gsl_rng.h>

double pytime(const struct timeval * tv)
{
    return (double) tv->tv_sec + (double) tv->tv_usec / 1000000.0;
}

int main(int argc, char **argv)
{
    assert(argc == 5);

    int neg = strtol(argv[1], 0, 0);
    int nout = strtol(argv[2], 0, 0);
    int nin = nout;
    int nhid = strtol(argv[3], 0, 0);
    int niter = strtol(argv[4], 0, 0);
    double lr = 0.01;
    gsl_rng * rng = gsl_rng_alloc(gsl_rng_taus);
    gsl_rng_set(rng, 234);


    gsl_matrix * x = gsl_matrix_alloc(neg, nin);
    gsl_matrix * w = gsl_matrix_alloc(nin, nhid);
    gsl_vector * a = gsl_vector_alloc(nhid);
    gsl_vector * b = gsl_vector_alloc(nout);
    gsl_matrix * xw = gsl_matrix_alloc(neg, nhid);
    gsl_matrix * hid = gsl_matrix_alloc(neg, nhid);
    gsl_matrix * hidwt = gsl_matrix_alloc(neg, nout);
    gsl_matrix * g_hidwt = gsl_matrix_alloc(neg, nout);
    gsl_matrix * g_hid = gsl_matrix_alloc(neg, nhid);
    gsl_matrix * g_w = gsl_matrix_alloc(nout, nhid);
    gsl_vector * g_b = gsl_vector_alloc(nout);

    for (int i = 0; i < neg*nout; ++i) x->data[i] = (gsl_rng_uniform(rng) - 0.5) * 1.5;
    for (int i = 0; i < nout*nhid; ++i) w->data[i] = gsl_rng_uniform(rng);
    for (int i = 0; i < nhid; ++i) a->data[i] = 0.0;
    for (int i = 0; i < nout; ++i) b->data[i] = 0.0;

    struct timeval tv0, tv1;

    struct timeval tdot0, tdot1;
    double time_of_dot = 0.0;

    gettimeofday(&tv0, 0);
    double err = 0.0;
    for (int iter = 0; iter < niter; ++iter)
    {
        gettimeofday(&tdot0, 0);
        gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, x, w, 0.0, xw);
        gettimeofday(&tdot1, 0);
        time_of_dot += pytime(&tdot1) - pytime(&tdot0);

        for (int i = 0; i < neg; ++i)
            for (int j = 0; j < nhid; ++j)
            {
                double act = xw->data[i*nhid+j] + a->data[j];
                hid->data[i*nhid+j] = tanh(act);
            }

        gettimeofday(&tdot0, 0);
        gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, hid, w, 0.0, hidwt);
        gettimeofday(&tdot1, 0);
        time_of_dot += pytime(&tdot1) - pytime(&tdot0);

        for (int i = 0; i < nout; ++i) g_b->data[i] = 0.0;
        err = 0.0;
        for (int i = 0; i < neg; ++i)
            for (int j = 0; j < nout; ++j)
            {
                double act = hidwt->data[i*nout+j] + b->data[j];
                double out = tanh(act);
                double g_out = out - x->data[i*nout+j];
                err += g_out * g_out;
                g_hidwt->data[i*nout+j] = g_out * (1.0 - out*out);
                g_b->data[j] += g_hidwt->data[i*nout+j];
            }
        for (int i = 0; i < nout; ++i) b->data[i] -= lr * g_b->data[i];

        if (1)
        {
            gettimeofday(&tdot0, 0);
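            /*
             * The two dgemm calls below are the backward pass through the
             * tied weight matrix: since out = tanh(hid * W^T + b), the
             * gradient w.r.t. the hidden layer is g_hid = g_hidwt * W, and
             * the output-side weight gradient is g_w = g_hidwt^T * hid.
             */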
gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, g_hidwt, w, 0.0, g_hid); 105 | gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, g_hidwt, hid, 0.0, g_w); 106 | gettimeofday(&tdot1, 0); 107 | time_of_dot += pytime(&tdot1) - pytime(&tdot0); 108 | 109 | 110 | for (int i = 0; i < neg; ++i) 111 | for (int j = 0; j < nhid; ++j) 112 | { 113 | g_hid->data[i*nhid+j] *= (1.0 - hid->data[i*nhid+j] * hid->data[i*nhid+j]); 114 | a->data[j] -= lr * g_hid->data[i*nhid+j]; 115 | } 116 | 117 | gettimeofday(&tdot0, 0); 118 | gsl_blas_dgemm(CblasTrans, CblasNoTrans, -lr, x, g_hid, 1.0, w); 119 | gettimeofday(&tdot1, 0); 120 | time_of_dot += pytime(&tdot1) - pytime(&tdot0); 121 | for (int i = 0; i < nout*nhid; ++i) w->data[i] -= lr * g_w->data[i]; 122 | } 123 | 124 | } 125 | gettimeofday(&tv1, 0); 126 | 127 | double total_time = pytime(&tv1) - pytime(&tv0); 128 | fprintf(stdout, "took = %lfs to get err %lf\n", total_time, 0.5 * err); 129 | fprintf(stdout, "... of which %.2lfs was spent in dgemm (fraction: %.2lf)\n", time_of_dot, time_of_dot / total_time); 130 | //skip freeing 131 | return 0; 132 | } 133 | 134 | -------------------------------------------------------------------------------- /eblearn/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: mnist_example_ipp.x mnist_example_noipp.x convnet_noipp.x convnet_ipp.x convnet96_ipp.x convnet96_noipp.x convnet256_ipp.x convnet256_noipp.x 3 | 4 | clean: 5 | rm *.x 6 | 7 | mnist_example_ipp.x : mnist_example.cc 8 | g++ -I${PUB_PREFIX}/eblearn_ipp -o mnist_example_ipp.x mnist_example.cc\ 9 | -L/u/bergstrj/pub/intel/ipp/6.1.2.051/em64t/sharedlib\ 10 | -L${PUB_PREFIX}/eblearn_ipp -leblearn -lippiem64t -pthread 11 | 12 | 13 | mnist_example_noipp.x : mnist_example.cc 14 | g++ -O2 -I${PUB_PREFIX}/eblearn_noipp -o mnist_example_noipp.x mnist_example.cc\ 15 | -L${PUB_PREFIX}/eblearn_noipp -leblearn 16 | 17 | convnet_noipp.x : convnet.cc 18 | g++ -O2 -I${PUB_PREFIX}/eblearn_noipp -o convnet_noipp.x convnet.cc\ 19 | -L${PUB_PREFIX}/eblearn_noipp -leblearn 20 | 21 | convnet96_noipp.x : convnet96.cc 22 | g++ -O2 -I${PUB_PREFIX}/eblearn_noipp -o convnet96_noipp.x convnet96.cc\ 23 | -L${PUB_PREFIX}/eblearn_noipp -leblearn 24 | 25 | convnet256_noipp.x : convnet256.cc 26 | g++ -O2 -I${PUB_PREFIX}/eblearn_noipp -o convnet256_noipp.x convnet256.cc\ 27 | -L${PUB_PREFIX}/eblearn_noipp -leblearn 28 | 29 | convnet_ipp.x : convnet.cc 30 | g++ -DUSED_IPP -O2 -I${PUB_PREFIX}/eblearn_ipp -o convnet_ipp.x convnet.cc\ 31 | -L/u/bergstrj/pub/intel/ipp/6.1.2.051/em64t/sharedlib\ 32 | -L${PUB_PREFIX}/eblearn_ipp -leblearn -lippiem64t -pthread 33 | 34 | convnet96_ipp.x : convnet96.cc 35 | g++ -DUSED_IPP -O2 -I${PUB_PREFIX}/eblearn_ipp -o convnet96_ipp.x convnet96.cc\ 36 | -L/u/bergstrj/pub/intel/ipp/6.1.2.051/em64t/sharedlib\ 37 | -L${PUB_PREFIX}/eblearn_ipp -leblearn -lippiem64t -pthread 38 | 39 | convnet256_ipp.x : convnet256.cc 40 | g++ -DUSED_IPP -O2 -I${PUB_PREFIX}/eblearn_ipp -o convnet256_ipp.x convnet256.cc\ 41 | -L/u/bergstrj/pub/intel/ipp/6.1.2.051/em64t/sharedlib\ 42 | -L${PUB_PREFIX}/eblearn_ipp -leblearn -lippiem64t -pthread 43 | 44 | -------------------------------------------------------------------------------- /eblearn/convnet.cc: -------------------------------------------------------------------------------- 1 | #include "libeblearn.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | using namespace ebl; // all eblearn objects are under the ebl namespace 7 | 8 | static double time_time() // a time function like 
time.time() 9 | { 10 | struct timeval tv; 11 | gettimeofday(&tv, 0); 12 | return (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0; 13 | } 14 | 15 | typedef double t_net; 16 | 17 | int main(int argc, char **argv) { // regular main without gui 18 | init_drand(92394); // initialize random seed 19 | 20 | intg n_examples = 1000; // maximum training set size: 60000 21 | idxdim dims(1,32,32); // get order and dimensions of sample 22 | 23 | //! create 1-of-n targets with target 1.0 for shown class, -1.0 for the rest 24 | idx targets = create_target_matrix(10, 1.0); 25 | idx inputs(n_examples, 32, 32); 26 | 27 | parameter theparam(60000); // create trainable parameter 28 | lenet5 l5(theparam, 32, 32, 5, 5, 2, 2, 5, 5, 2, 2, 120, 10); 29 | // TODO: use an all-to-all connection table in second layer convolution 30 | // Because that's what the other packages implement. 31 | supervised_euclidean_machine thenet( 32 | (module_1_1&)l5, 33 | targets, 34 | dims); 35 | supervised_trainer thetrainer(thenet, theparam); 36 | classifier_meter trainmeter, testmeter; 37 | forget_param_linear fgp(1, 0.5); 38 | thenet.forget(fgp); 39 | 40 | // learning parameters 41 | gd_param gdp(/* double leta*/ 0.0001, 42 | /* double ln */ 0.0, 43 | /* double l1 */ 0.0, 44 | /* double l2 */ 0.0, 45 | /* int dtime */ 0, 46 | /* double iner */0.0, 47 | /* double a_v */ 0.0, 48 | /* double a_t */ 0.0, 49 | /* double g_t*/ 0.0); 50 | infer_param infp; 51 | 52 | state_idx dummy_input(1, 32, 32); 53 | int J = 2000; 54 | double t = time_time(); 55 | for (intg j = 0; j < J; ++j) 56 | { 57 | thetrainer.learn_sample(dummy_input, j%10, gdp); 58 | // TODO: iterate over mock dataset to simulate more realistic 59 | // memaccess pattern 60 | } 61 | #ifdef USED_IPP 62 | cout << "ConvSmall\teblearn{ipp}\t" << J / (time_time() - t) << endl; 63 | #else 64 | cout << "ConvSmall\teblearn\t" << J / (time_time() - t) << endl; 65 | #endif 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /eblearn/convnet256.cc: -------------------------------------------------------------------------------- 1 | #include "libeblearn.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | using namespace ebl; // all eblearn objects are under the ebl namespace 7 | 8 | static double time_time() // a time function like time.time() 9 | { 10 | struct timeval tv; 11 | gettimeofday(&tv, 0); 12 | return (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0; 13 | } 14 | 15 | typedef double t_net; 16 | 17 | int main(int argc, char **argv) { // regular main without gui 18 | init_drand(92394); // initialize random seed 19 | 20 | intg n_examples = 20; // maximum training set size: 60000 21 | idxdim dims(1,256,256); // get order and dimensions of sample 22 | 23 | //! create 1-of-n targets with target 1.0 for shown class, -1.0 for the rest 24 | idx targets = create_target_matrix(10, 1.0); 25 | idx inputs(n_examples, 256, 256); 26 | 27 | parameter theparam(6000); // create trainable parameter 28 | lenet5 l5(theparam, 256, 256, 7, 7, 5, 5, 7, 7, 4, 4, 120, 10); 29 | // TODO: use an all-to-all connection table in second layer convolution 30 | // Because that's what the other packages implement. 
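// lenet5 arguments, read by analogy with the 32x32 net in convnet.cc
// (input rows/cols, conv kernel, subsampling, conv kernel, subsampling,
// hidden units, outputs): 256x256 input, 7x7 convolutions, 5x5 then 4x4
// subsampling, 120 hidden units, 10 outputs -- the ConvLarge task.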
31 | supervised_euclidean_machine thenet( 32 | (module_1_1&)l5, 33 | targets, 34 | dims); 35 | supervised_trainer thetrainer(thenet, theparam); 36 | classifier_meter trainmeter, testmeter; 37 | forget_param_linear fgp(1, 0.5); 38 | thenet.forget(fgp); 39 | 40 | // learning parameters 41 | gd_param gdp(/* double leta*/ 0.0001, 42 | /* double ln */ 0.0, 43 | /* double l1 */ 0.0, 44 | /* double l2 */ 0.0, 45 | /* int dtime */ 0, 46 | /* double iner */0.0, 47 | /* double a_v */ 0.0, 48 | /* double a_t */ 0.0, 49 | /* double g_t*/ 0.0); 50 | infer_param infp; 51 | 52 | state_idx dummy_input(1, 256, 256); 53 | double t = time_time(); 54 | for (intg j = 0; j < n_examples; ++j) 55 | { 56 | thetrainer.learn_sample(dummy_input, j%10, gdp); 57 | // TODO: iterate over mock dataset to simulate more realistic 58 | // memaccess pattern 59 | } 60 | #ifdef USED_IPP 61 | cout << "ConvLarge\teblearn{ipp}\t" << n_examples / (time_time() - t) << endl; 62 | #else 63 | cout << "ConvLarge\teblearn\t" << n_examples / (time_time() - t) << endl; 64 | #endif 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /eblearn/convnet96.cc: -------------------------------------------------------------------------------- 1 | #include "libeblearn.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | using namespace ebl; // all eblearn objects are under the ebl namespace 7 | 8 | static double time_time() // a time function like time.time() 9 | { 10 | struct timeval tv; 11 | gettimeofday(&tv, 0); 12 | return (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0; 13 | } 14 | 15 | typedef double t_net; 16 | 17 | int main(int argc, char **argv) { // regular main without gui 18 | init_drand(92394); // initialize random seed 19 | 20 | intg n_examples = 100; // maximum training set size: 60000 21 | idxdim dims(1,96,96); // get order and dimensions of sample 22 | 23 | //! create 1-of-n targets with target 1.0 for shown class, -1.0 for the rest 24 | idx targets = create_target_matrix(10, 1.0); 25 | idx inputs(n_examples, 96, 96); 26 | 27 | parameter theparam(6000); // create trainable parameter 28 | lenet5 l5(theparam, 96, 96, 7, 7, 3, 3, 7, 7, 3, 3, 120, 10); 29 | // TODO: use an all-to-all connection table in second layer convolution 30 | // Because that's what the other packages implement. 
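// lenet5 arguments, read the same way as in convnet.cc: 96x96 input,
// 7x7 convolutions, 3x3 subsampling at both stages, 120 hidden units,
// 10 outputs -- the ConvMed task.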
31 | supervised_euclidean_machine thenet( 32 | (module_1_1&)l5, 33 | targets, 34 | dims); 35 | supervised_trainer thetrainer(thenet, theparam); 36 | classifier_meter trainmeter, testmeter; 37 | forget_param_linear fgp(1, 0.5); 38 | thenet.forget(fgp); 39 | 40 | // learning parameters 41 | gd_param gdp(/* double leta*/ 0.0001, 42 | /* double ln */ 0.0, 43 | /* double l1 */ 0.0, 44 | /* double l2 */ 0.0, 45 | /* int dtime */ 0, 46 | /* double iner */0.0, 47 | /* double a_v */ 0.0, 48 | /* double a_t */ 0.0, 49 | /* double g_t*/ 0.0); 50 | infer_param infp; 51 | 52 | state_idx dummy_input(1, 96, 96); 53 | double t = time_time(); 54 | for (intg j = 0; j < n_examples; ++j) 55 | { 56 | thetrainer.learn_sample(dummy_input, j%10, gdp); 57 | // TODO: iterate over mock dataset to simulate more realistic 58 | // memaccess pattern 59 | } 60 | #ifdef USED_IPP 61 | cout << "ConvMed\teblearn{ipp}\t" << n_examples / (time_time() - t) << endl; 62 | #else 63 | cout << "ConvMed\teblearn\t" << n_examples / (time_time() - t) << endl; 64 | #endif 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /eblearn/mlp.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaberg/DeepLearningBenchmarks/590892c283b768a5b4baec24629bb3647b251434/eblearn/mlp.cc -------------------------------------------------------------------------------- /eblearn/mnist_example.cc: -------------------------------------------------------------------------------- 1 | #include "libeblearn.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | using namespace ebl; // all eblearn objects are under the ebl namespace 7 | 8 | static double time_time() // a time function like time.time() 9 | { 10 | struct timeval tv; 11 | gettimeofday(&tv, 0); 12 | return (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0; 13 | } 14 | 15 | typedef double t_net; 16 | 17 | // argv[1] is expected to contain the directory of the mnist dataset 18 | #ifdef __GUI__ 19 | MAIN_QTHREAD() { // this is the macro replacing main to enable multithreaded gui 20 | #else 21 | int main(int argc, char **argv) { // regular main without gui 22 | #endif 23 | cerr << "* MNIST demo: learning handwritten digits using the eblearn"; 24 | cerr << " C++ library *" << endl; 25 | if (argc != 2) { 26 | cerr << "Usage: ./mnist " << endl; 27 | eblerror("MNIST path not specified"); 28 | } 29 | init_drand(time(NULL)); // initialize random seed 30 | 31 | intg trsize = 10000; // maximum training set size: 60000 32 | intg tesize = 10000; // maximum testing set size: 10000 33 | 34 | //! load MNIST datasets: trize for training set and tesize for testing set 35 | mnist_datasource 36 | train_ds(argv[1], "train", trsize), 37 | test_ds(argv[1], "t10k", tesize); 38 | 39 | //! create 1-of-n targets with target 1.0 for shown class, -1.0 for the rest 40 | idx targets = create_target_matrix(1+idx_max(train_ds.labels), 1.0); 41 | 42 | //! 
create the network weights, network and trainer 43 | cerr << "creating idxdim: " << endl; 44 | idxdim dims(train_ds.sample_dims()); // get order and dimensions of sample 45 | cerr << "creating theparam: " << endl; 46 | parameter theparam(60000); // create trainable parameter 47 | cerr << "creating l5: " << endl; 48 | lenet5 l5(theparam, 32, 32, 5, 5, 2, 2, 5, 5, 2, 2, 120, targets.dim(0)); 49 | //TODO: Consider using net_nn_cscsc directly rather than lenet5 50 | 51 | cerr << "creating thenet: " << endl; 52 | supervised_euclidean_machine thenet((module_1_1&)l5, targets, dims); 53 | cerr << "creating thetrainer: " << endl; 54 | supervised_trainer thetrainer(thenet, theparam); 55 | //supervised_trainer_gui stgui; // the gui to display supervised_trainer 56 | 57 | //! a classifier-meter measures classification errors 58 | classifier_meter trainmeter, testmeter; 59 | 60 | //! initialize the network weights 61 | forget_param_linear fgp(1, 0.5); 62 | thenet.forget(fgp); 63 | 64 | // learning parameters 65 | gd_param gdp(/* double leta*/ 0.0001, 66 | /* double ln */ 0.0, 67 | /* double l1 */ 0.0, 68 | /* double l2 */ 0.0, 69 | /* int dtime */ 0, 70 | /* double iner */0.0, 71 | /* double a_v */ 0.0, 72 | /* double a_t */ 0.0, 73 | /* double g_t*/ 0.0); 74 | infer_param infp; 75 | 76 | int use_hessian = 0; 77 | // estimate second derivative on 100 iterations, using mu=0.02 78 | if (use_hessian) 79 | { 80 | cerr << "Computing second derivatives on MNIST dataset: " << endl; 81 | thetrainer.compute_diaghessian(train_ds, 100, 0.02); 82 | } 83 | 84 | //code borrowd from libeblearn/include/ebl_trainer.hpp 85 | ubyte lab; 86 | thetrainer.init(train_ds, &trainmeter); 87 | // training on lowest size common to all classes (times # classes) 88 | // now do training iterations 89 | //cerr << "... Training network from " << train_ds.get_lowest_common_size() << endl; 90 | double t = time_time(); 91 | train_ds.fprop(*thetrainer.input, thetrainer.label); 92 | lab = thetrainer.label.get(); 93 | //int J = train_ds.get_lowest_common_size(); 94 | int J = 2000; 95 | for (intg j = 0; j < J; ++j) { 96 | //train_ds.fprop(*thetrainer.input, thetrainer.label); 97 | //lab = thetrainer.label.get(); 98 | thetrainer.learn_sample(*thetrainer.input, lab, gdp); 99 | // use energy as distance for samples probabilities to be used 100 | ///// train_ds.set_answer_distance(energy.x.get()); 101 | // log.update(age, output, label.get(), energy); 102 | //train_ds.next_train(); 103 | } 104 | #ifdef __IPP__ 105 | cout << "lenet5\teblearn{ipp}\t" << J / (time_time() - t) << endl; 106 | #else 107 | cout << "lenet5\teblearn\t" << J / (time_time() - t) << endl; 108 | #endif 109 | return 0; 110 | } 111 | 112 | 113 | #if 0 114 | for (int i = 0; i < 100; ++i) { 115 | double t = time_time(); 116 | cerr << "Training... " << endl; 117 | thetrainer.train(train_ds, trainmeter, gdp, 1); // train 118 | cerr << "Training took" << t - time_time() << "seconds" << endl; 119 | cerr << "Testing on train... " << endl; 120 | thetrainer.test(train_ds, trainmeter, infp); // test 121 | cerr << "Testing on test... 
" << endl; 122 | thetrainer.test(test_ds, testmeter, infp); // test 123 | //stgui.display_datasource(thetrainer, test_ds, infp, 10, 10); // display 124 | //stgui.display_internals(thetrainer, test_ds, infp, 2); // display 125 | if (use_hessian) 126 | thetrainer.compute_diaghessian(train_ds, 100, 0.02); // recompute 2nd der 127 | cerr << "Iteration took" << t - time_time() << "seconds" << endl; 128 | } 129 | #endif 130 | -------------------------------------------------------------------------------- /eblearn/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_ipp:$LD_LIBRARY_PATH ./mnist_example_ipp.x /data/lisa/data/mnist 4 | # LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_noipp:$LD_LIBRARY_PATH ./mnist_example_noipp.x /data/lisa/data/mnist 5 | 6 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_ipp:$LD_LIBRARY_PATH ./convnet_ipp.x > ${HOSTNAME}_eblearn_convnet_ipp.bmark 7 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_ipp:$LD_LIBRARY_PATH ./convnet96_ipp.x > ${HOSTNAME}_eblearn_convnet96_ipp.bmark 8 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_ipp:$LD_LIBRARY_PATH ./convnet256_ipp.x > ${HOSTNAME}_eblearn_convnet256_ipp.bmark 9 | 10 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_noipp:$LD_LIBRARY_PATH ./convnet_noipp.x > ${HOSTNAME}_eblearn_convnet.bmark 11 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_noipp:$LD_LIBRARY_PATH ./convnet96_noipp.x > ${HOSTNAME}_eblearn_convnet96.bmark 12 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_noipp:$LD_LIBRARY_PATH ./convnet256_noipp.x > ${HOSTNAME}_eblearn_convnet256.bmark 13 | -------------------------------------------------------------------------------- /numpy/aa_numpy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy as N 4 | import sys 5 | import time 6 | 7 | # c: aa.cc 8 | 9 | neg, nout, nhid, niter = [int(a) for a in sys.argv[1:]] 10 | lr = 0.01 11 | 12 | rng = N.random.RandomState(342) 13 | 14 | w = rng.rand(nout, nhid) 15 | a = rng.randn(nhid) * 0.0 16 | b = rng.randn(nout) * 0.0 17 | x = (rng.rand(neg, nout)-0.5) * 1.5 18 | 19 | dot_time = 0.0 20 | 21 | t = time.time() 22 | for i in xrange(niter): 23 | tt = time.time() 24 | d = N.dot(x, w) 25 | dot_time += time.time() - tt 26 | 27 | hid = N.tanh(d + a) 28 | 29 | tt = time.time() 30 | d = N.dot(hid, w.T) 31 | dot_time += time.time() - tt 32 | out = N.tanh(d + b) 33 | 34 | g_out = out - x 35 | err = 0.5 * N.sum(g_out**2) 36 | 37 | g_hidwt = g_out * (1.0 - out**2) 38 | 39 | b -= lr * N.sum(g_hidwt, axis=0) 40 | 41 | tt = time.time() 42 | g_hid = N.dot(g_hidwt, w) 43 | dot_time += time.time() - tt 44 | 45 | g_hidin = g_hid * (1.0 - hid**2) 46 | 47 | tt = time.time() 48 | d = N.dot(g_hidwt.T, hid) 49 | dd = N.dot(x.T, g_hidin) 50 | dot_time += time.time() - tt 51 | 52 | gw = (d + dd) 53 | w -= lr * gw 54 | 55 | a -= lr * N.sum(g_hidin, axis=0) 56 | 57 | total_time = time.time() - t 58 | print 'time: ',total_time, 'err: ', err 59 | print ' of which', dot_time, 'was spent on dot. 
Fraction:', dot_time / total_time 60 | 61 | -------------------------------------------------------------------------------- /numpy/logreg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy as N 4 | import sys 5 | import time 6 | 7 | # c: aa.cc 8 | 9 | nin, nout, batchsize, niter = [int(a) for a in sys.argv[1:]] 10 | lr = 0.01 11 | 12 | rng = N.random.RandomState(342) 13 | 14 | # declare data 15 | x = (rng.rand(batchsize*niter, nin)-0.5) * 1.5 16 | y = (rng.rand(batchsize*niter, nout)-0.5) * 1.5 17 | 18 | # declare model weights 19 | w = rng.rand(nin, nout) 20 | b = rng.randn(nout) * 0.0 21 | 22 | t = time.time() 23 | for i in xrange(niter): 24 | x_i = x[i*batchsize:(i+1)*batchsize] 25 | y_i = y[i*batchsize:(i+1)*batchsize] 26 | 27 | hidin = N.dot(x_i, w) + b 28 | 29 | hidout = (N.tanh(hidin)+1)/2.0 # sigmoid 30 | 31 | g_hidout = hidout - y_i 32 | err = 0.5 * N.sum(g_hidout**2) 33 | 34 | g_hidin = g_hidout * hidout * (1.0 - hidout) 35 | 36 | b -= lr * N.sum(g_hidin, axis=0) 37 | w -= lr * N.dot(x_i.T, g_hidin) 38 | 39 | total_time = time.time() - t 40 | print 'mlp_%i_%i\tnumpy{%i}\t%.2f' %( 41 | nin, nout, batchsize, niter*batchsize/total_time) 42 | 43 | -------------------------------------------------------------------------------- /numpy/mlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy as N 4 | import sys 5 | import time 6 | 7 | # c: aa.cc 8 | 9 | nin, nhid, nout, batchsize, niter = [int(a) for a in sys.argv[1:]] 10 | lr = 0.01 11 | 12 | rng = N.random.RandomState(342) 13 | 14 | # declare data 15 | x = (rng.rand(batchsize*niter, nin)-0.5) * 1.5 16 | y = (rng.rand(batchsize*niter, nout)-0.5) * 1.5 17 | 18 | # declare model weights 19 | w = rng.rand(nin, nhid) 20 | b = rng.randn(nhid) * 0.0 21 | v = rng.rand(nhid, nout) 22 | c = rng.randn(nout) * 0.0 23 | 24 | t = time.time() 25 | for i in xrange(niter): 26 | x_i = x[i*batchsize:(i+1)*batchsize] 27 | y_i = y[i*batchsize:(i+1)*batchsize] 28 | 29 | hidin = N.dot(x_i, w) + b 30 | 31 | hidout = N.tanh(hidin) 32 | 33 | outin = N.dot(hidout, v) + c 34 | outout = (N.tanh(outin)+1)/2.0 35 | 36 | g_outout = outout - y_i 37 | err = 0.5 * N.sum(g_outout**2) 38 | 39 | g_outin = g_outout * outout * (1.0 - outout) 40 | 41 | g_hidout = N.dot(g_outin, v.T) 42 | g_hidin = g_hidout * (1 - hidout**2) 43 | 44 | b -= lr * N.sum(g_hidin, axis=0) 45 | c -= lr * N.sum(g_outin, axis=0) 46 | w -= lr * N.dot(x_i.T, g_hidin) 47 | v -= lr * N.dot(hidout.T, g_outin) 48 | 49 | total_time = time.time() - t 50 | print 'mlp_%i_%i_%i\tnumpy{%i}\t%.2f' %( 51 | nin, nhid, nout, batchsize, niter*batchsize/total_time) 52 | 53 | -------------------------------------------------------------------------------- /numpy/rbm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy as N 4 | import sys 5 | import time 6 | 7 | # c: aa.cc 8 | 9 | nin, nout, batchsize, niter = [int(a) for a in sys.argv[1:]] 10 | lr = 0.01 11 | 12 | rng = N.random.RandomState(342) 13 | 14 | # declare data 15 | x = (rng.rand(batchsize*niter, nin)-0.5) * 1.5 16 | 17 | # declare model weights 18 | a = rng.randn(nin) * 0.0 19 | w = rng.rand(nin, nout) 20 | b = rng.randn(nout) * 0.0 21 | 22 | def sigm(x): return (N.tanh(x)+1)/2 23 | 24 | def 
bern(x): return N.random.binomial(p=x,n=1) 25 | 26 | t = time.time() 27 | for i in xrange(niter): 28 | pos_vis = x[i*batchsize:(i+1)*batchsize] 29 | 30 | pos_hid = sigm(N.dot(pos_vis, w)+b) 31 | 32 | neg_vis = sigm(N.dot(bern(pos_hid), w.T)+a) 33 | 34 | neg_hid = sigm(N.dot(bern(neg_vis), w) + b) 35 | 36 | a += lr * N.sum(pos_vis - neg_vis, axis=0) 37 | b -= lr * N.sum(pos_hid - neg_hid, axis=0) 38 | w -= lr * (N.dot(pos_vis.T, pos_hid) - N.dot(neg_vis.T, neg_hid)) 39 | 40 | total_time = time.time() - t 41 | print 'cd1 rbm_bernoulli %i_%i\tnumpy{%i}\t%.2f' %( 42 | nin, nout, batchsize, niter*batchsize/total_time) 43 | 44 | 45 | -------------------------------------------------------------------------------- /numpy/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python mlp.py 784 500 10 1 1000 > ${HOSTNAME}_mlp_1.bmark 4 | python mlp.py 784 500 10 60 100 > ${HOSTNAME}_mlp_60.bmark 5 | 6 | python logreg.py 784 10 1 1000 > ${HOSTNAME}_lr_784_1.bmark 7 | python logreg.py 784 10 60 100 > ${HOSTNAME}_lr_784_60.bmark 8 | python logreg.py 32 10 1 1000 > ${HOSTNAME}_lr_32_1.bmark 9 | python logreg.py 32 10 60 100 > ${HOSTNAME}_lr_32_60.bmark 10 | python rbm.py 1024 1024 1 100 > ${HOSTNAME}_rbm_1.bmark 11 | python rbm.py 1024 1024 60 20 > ${HOSTNAME}_rbm_60.bmark 12 | -------------------------------------------------------------------------------- /reports/ascii.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import cPickle 5 | 6 | if __name__ == '__main__': 7 | assert sys.argv[1] == '--db' 8 | db = cPickle.load(open(sys.argv[2])) 9 | for entry in db: 10 | print entry 11 | 12 | -------------------------------------------------------------------------------- /reports/build_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | 6 | def build_results(path='.'): 7 | results = {} # map task -> impl -> time 8 | 9 | for root, dirs, files in os.walk(path): 10 | for bmark in [f for f in files if f.endswith('.bmark')]: 11 | for line in open(os.path.join(root,bmark)): 12 | if not line or line == "\n": 13 | continue 14 | try: 15 | task, impl, t = line[:-1].split('\t')[:3] 16 | except: 17 | print >> sys.stderr, "PARSE ERR:", line 18 | continue 19 | 20 | if task.startswith('#'): 21 | print >> sys.stderr, "Skipping", task, impl, t 22 | else: 23 | results.setdefault(task, {})[impl] = float(t) 24 | return results 25 | 26 | if __name__ == '__main__': 27 | r = build_results(sys.argv[1]) 28 | 29 | for k in r: 30 | for i in r[k]: 31 | print '%s\t%s\t%f' % (k,i,r[k][i]) 32 | 33 | if 0: 34 | 35 | keys = r.keys() 36 | keys.sort() 37 | 38 | for k in keys: 39 | v = r[k] 40 | print k 41 | r_k = [(v[i],i) for i in v] 42 | r_k.sort() 43 | r_k.reverse() 44 | for t, i in r_k: 45 | print " %10.2f - %s" %(t, i) 46 | print '' 47 | -------------------------------------------------------------------------------- /reports/show_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # a bar plot with errorbars 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import sys 6 | from pylab import * 7 | 8 | def rcolor(): 9 | return tuple(np.random.rand(3)) 10 | 11 | from build_csv import build_results 12 | 13 | results = build_results(sys.argv[1]) # dict task -> impl -> time 14 | 15 | n_tasks = len(results) 16 | 17 | 
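# results maps task -> implementation -> examples/second; gather the union of
# implementation names across all tasks so each one gets a bar in the grouped
# plot (task/implementation pairs without a result are drawn with height 0).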
tasks = results.keys() 18 | impls = set() 19 | for k, v in results.items(): 20 | impls.update(v.keys()) 21 | 22 | print tasks 23 | print impls 24 | fig = plt.figure() 25 | ax = fig.add_subplot(111) 26 | ind = np.arange(n_tasks) # the x locations for the groups 27 | width = 0.10 # the width of the bars 28 | 29 | rects = [] 30 | for i, impl in enumerate(impls): 31 | means = [] 32 | std = [] 33 | for t in tasks: 34 | std.append(0) 35 | try: 36 | means.append(1.0/results[t][impl]) 37 | except KeyError: 38 | means.append(0) 39 | rects.append(ax.bar(ind+i*width, means, width, color=rcolor(), log=True)) #, color='r', yerr=menStd) 40 | print "adding rect for", impl, means 41 | 42 | #womenMeans = (25, 32, 34, 20, 25) 43 | #womenStd = (3, 5, 2, 3, 3) 44 | #rects2 = ax.bar(ind+width, womenMeans, width, color='y', yerr=womenStd) 45 | 46 | # add some 47 | ax.set_ylabel('Examples / seconds') 48 | #ax.set_title('Scores by group and gender') 49 | ax.set_xticks(ind+width*len(impls)/2.0) 50 | print 'tasks', tasks 51 | ax.set_xticklabels( [t[:12] for t in tasks] ) 52 | print 'gca', gca().get_xticklabels() 53 | setp(gca().get_xticklabels(), rotation=30, fontsize=10) 54 | 55 | ax.legend( [r[0] for r in rects], impls, 'upper left' ) 56 | 57 | #def autolabel(rects): 58 | # attach some text labels 59 | # for rect in rects: 60 | # height = rect.get_height() 61 | # ax.text(rect.get_x()+rect.get_width()/2., 1.05*height, '%d'%int(height), 62 | # ha='center', va='bottom') 63 | #autolabel(rects1) 64 | #autolabel(rects2) 65 | 66 | subplots_adjust(left=.09, bottom=.14, right=.97, top=.95) 67 | 68 | savefig('blah.pdf') 69 | 70 | -------------------------------------------------------------------------------- /reports/task_pdfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # a bar plot with errorbars 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import sys 6 | from pylab import * 7 | 8 | def rcolor(): 9 | return 'b' 10 | return tuple(np.random.rand(3)) 11 | 12 | from build_csv import build_results 13 | 14 | results = build_results(sys.argv[1]) # dict task -> impl -> time 15 | 16 | 17 | for task in results: 18 | print task 19 | 20 | fig = plt.figure() 21 | ax = fig.add_subplot(111) 22 | width = 0.30 # the width of the bars 23 | 24 | scores = [(s,i) for (i,s) in results[task].items()] 25 | scores.sort() 26 | 27 | scores = scores[-8:] 28 | 29 | rect = ax.barh(-.15+np.arange(len(scores)), 30 | [s for (s,i) in scores], 31 | 0.3, # width 32 | color='b', 33 | log=False) 34 | 35 | # add some 36 | ax.set_title('Preliminary Benchmark Results: %s'% task) 37 | ax.set_yticklabels(['']+[i for (s,i) in scores], minor=True) 38 | #ax.set_ylabel('Training Speed (examples/sec)') 39 | #ax.set_xticks(np.arange(len(scores)), minor=False) 40 | #ax.set_xticklabels([i[:3] for (s,i) in scores])#, minor=True 41 | #setp(ax.get_xmajorticklabels(), rotation=90, fontsize=10) 42 | 43 | subplots_adjust(left=.29, bottom=.14, right=.97, top=.91) 44 | 45 | savefig('%s.pdf'%task) 46 | 47 | -------------------------------------------------------------------------------- /theano/aa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy 4 | import sys 5 | import time 6 | 7 | import theano 8 | import theano.tensor as T 9 | import theano.sandbox 10 | import theano.sandbox.wraplinker 11 | from theano.compile import module, Mode 12 | from theano.sandbox.wraplinker 
import ProfileMode 13 | from theano import gof, Op, Apply 14 | 15 | from theano.tensor import blas, opt 16 | 17 | # numpy: aa_numpy.py 18 | # c : aa.cc 19 | 20 | 21 | if 0: 22 | class Opt(object): 23 | merge = theano.gof.MergeOptimizer() 24 | gemm_opt_1 = theano.gof.TopoOptimizer(theano.tensor_opt.gemm_pattern_1) 25 | 26 | gemm_opt_2 = theano.gof.TopoOptimizer( # d -= a * (dot()+transpose(dot)) 27 | theano.gof.PatternSub( 28 | ( 29 | T.sub_inplace, 30 | 'd', 31 | ( 32 | T.mul, 33 | dict(pattern = (T.DimShuffle((), ['x', 'x'], inplace = True), 'a'), 34 | allow_multiple_clients = True), 35 | ( 36 | T.add, 37 | (T.dot, 'b', 'c'), 38 | (T.transpose_inplace, (T.dot, 'f', 'g')) 39 | ) 40 | ) 41 | ), 42 | ( 43 | T.gemm, 44 | ( 45 | T.gemm, 46 | 'd', 47 | (T.neg, 'a'), 48 | (T.transpose_inplace, 'g'), 49 | (T.transpose_inplace, 'f'), 50 | T.constant(1.0) 51 | ), 52 | (T.neg, 'a'), 53 | 'b', 54 | 'c', 55 | T.constant(1.0) 56 | ), 57 | allow_multiple_clients = False)) 58 | 59 | sqr = [] 60 | sqr.append( theano.gof.TopoOptimizer( 61 | theano.gof.PatternSub( 62 | (T.mul,'x', 'x'), 63 | (T.sqr, 'x'), allow_multiple_clients=True))) 64 | sqr.append(theano.gof.TopoOptimizer( 65 | theano.gof.PatternSub( 66 | (T.pow, 'x', (T.DimShuffle((), ['x', 'x'], inplace=True), T.constant(2))), 67 | (T.sqr, 'x'), allow_multiple_clients=True))) 68 | 69 | ident_opt_list = [] 70 | ident_opt_list.append( # remove explicit copies 71 | theano.gof.TopoOptimizer( 72 | theano.gof.PatternSub( 73 | (T.tensor_copy, 'x'), 74 | 'x', 75 | allow_multiple_clients=True))) 76 | ident_opt_list.append( # remove double-transpose 77 | theano.gof.TopoOptimizer( 78 | theano.gof.PatternSub( 79 | (T.transpose_inplace, (T.transpose_inplace, 'x')), 80 | 'x', 81 | allow_multiple_clients=True))) 82 | 83 | ident_opt_list.append( 84 | theano.gof.TopoOptimizer( 85 | theano.gof.PatternSub( 86 | (T.sqr, (T.sqrt,'x')), 87 | 'x', 88 | allow_multiple_clients=True))) 89 | ident_opt_list.append( 90 | theano.gof.TopoOptimizer( 91 | theano.gof.PatternSub( 92 | (T.sqrt, (T.sqr,'x')), 93 | 'x', 94 | allow_multiple_clients=True))) 95 | ident_opt_list.append( 96 | theano.gof.TopoOptimizer( 97 | theano.gof.PatternSub( 98 | (T.mul, 'x', (T.div,'y', 'x')), 99 | 'y', 100 | allow_multiple_clients=True))) 101 | 102 | ident_opt_list.append( 103 | theano.gof.TopoOptimizer( 104 | theano.gof.PatternSub( 105 | (T.mul, (T.div,'y', 'x'), 'x'), 106 | 'y', 107 | allow_multiple_clients=True))) 108 | 109 | ident_opt_list.append( 110 | theano.gof.TopoOptimizer( 111 | theano.gof.PatternSub( 112 | (T.div, (T.mul,'y', 'x'), 'x'), 113 | 'y', 114 | allow_multiple_clients=True))) 115 | 116 | ident_opt_list.append( 117 | theano.gof.TopoOptimizer( 118 | theano.gof.PatternSub( 119 | (T.div, (T.mul,'y', 'x'), 'y'), 120 | 'x', 121 | allow_multiple_clients=True))) 122 | 123 | def __call__(self, env): 124 | self.merge(env) 125 | #eliminate identities 126 | if 0: 127 | print 'SKIPPING optimizations' 128 | else: 129 | 130 | for opt in self.ident_opt_list: 131 | opt(env) 132 | 133 | for opt in self.sqr: 134 | opt(env) 135 | 136 | self.gemm_opt_1(env) 137 | self.gemm_opt_2(env) 138 | 139 | self.merge(env) 140 | 141 | def print_graph_linker(print_prog=True): 142 | if 1: 143 | imap = {None:'-'} 144 | def blah(i, node, thunk): 145 | imap[node] = str(i) 146 | if print_prog:# and node.op.__class__ is T.DimShuffle: 147 | if False and node.op == T.DimShuffle((), ['x', 'x'], inplace = True): 148 | print node.op == T.DimShuffle((), ['x', 'x'], inplace = True), 149 | print node.inputs[0], 
type(node.inputs[0]), 150 | print node.inputs[0].equals(T.constant(2)), 151 | outputs = node.outputs 152 | inputs = theano.gof.graph.inputs(outputs) 153 | print 'node ', i, node, 154 | print ':'.join([imap[inp.owner] for inp in node.inputs]) 155 | #print theano.sandbox.pprint.pp.process_graph(inputs, outputs) 156 | return theano.sandbox.wraplinker.WrapLinkerMany( 157 | [theano.gof.OpWiseCLinker()], 158 | [theano.sandbox.wraplinker.run_all 159 | ,blah 160 | #,theano.sandbox.wraplinker.numpy_notall_isfinite 161 | ]) 162 | else: 163 | return theano.gof.OpWiseCLinker() 164 | 165 | 166 | class M(module.Module): 167 | def __init__(self): 168 | super(M, self).__init__() 169 | 170 | x = T.matrix('x') # input, target 171 | self.w = module.Member(T.matrix('w')) # weights 172 | self.a = module.Member(T.vector('a')) # hid bias 173 | self.b = module.Member(T.vector('b')) # output bias 174 | 175 | self.hid = T.tanh(T.dot(x, self.w) + self.a) 176 | hid = self.hid 177 | 178 | self.out = T.tanh(T.dot(hid, self.w.T) + self.b) 179 | out = self.out 180 | 181 | self.err = 0.5 * T.sum((out - x)**2) 182 | err = self.err 183 | 184 | params = [self.w, self.a, self.b] 185 | 186 | gparams = T.grad(err, params) 187 | 188 | updates = [(p, p - 0.01 * gp) for p, gp in zip(params, gparams)] 189 | 190 | self.step = module.Method([x], err, updates=dict(updates)) 191 | 192 | mod = M() 193 | mode = 'FAST_RUN' 194 | #mode = ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker()) 195 | mode = Mode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker(nice_errors=True)) 196 | mode = Mode(optimizer='fast_run', linker='c') 197 | mode = Mode(optimizer='fast_run', linker='c|py') 198 | print mod.pretty(mode=mode) 199 | m = mod.make(mode=mode) 200 | 201 | neg, nout, nhid, niter = [int(a) for a in sys.argv[1:]] 202 | rng = numpy.random.RandomState(342) 203 | m.w = rng.rand(nout, nhid) 204 | m.a = rng.randn(nhid) * 0.0 205 | m.b = rng.randn(nout) * 0.0 206 | 207 | x = (rng.rand(neg, nout)-0.5) * 1.5 208 | 209 | t = time.time() 210 | for i in xrange(niter): 211 | err = m.step(x) 212 | print 'time: ',time.time() - t, 'err: ', err 213 | try: 214 | mode.print_summary() 215 | pass 216 | except: 217 | pass 218 | 219 | 220 | -------------------------------------------------------------------------------- /theano/control.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from theano.misc.check_blas import execute 3 | 4 | sizes = [500, 1000, 1500, 2000, 2500] 5 | iters = 10 6 | 7 | for order in ['c', 'f']: 8 | for size in sizes: 9 | t = execute(verbose=False, M=size, N=size, K=size, iters=iters)[0] 10 | print "gemm theano{order_%s/%s/%d/%d}" % ( 11 | order, theano.config.floatX, iters, size), t 12 | -------------------------------------------------------------------------------- /theano/convnet.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import time 3 | 4 | import numpy 5 | from numpy import asarray, random 6 | 7 | from theano.tensor import lscalar, tanh, dot, grad, log, arange 8 | from theano.tensor.nnet import softmax 9 | from theano.tensor.nnet.conv import conv2d 10 | from theano.tensor.signal.downsample import max_pool_2d 11 | from theano import shared, function, config 12 | 13 | random.seed(2344) 14 | 15 | 16 | def rand(*size): 17 | return asarray(random.rand(*size), dtype=config.floatX) 18 | 19 | 20 | def randn(*size): 21 | return asarray(random.randn(*size), dtype=config.floatX) 22 | 23 | 24 | def 
randint(size, high): 25 | return asarray(random.randint(size=size, low=0, high=high), dtype='int32') 26 | 27 | 28 | def zeros(*size): 29 | return numpy.zeros(size, dtype=config.floatX) 30 | 31 | 32 | n_examples = 1000 33 | outputs = 10 34 | lr = numpy.asarray(0.01, dtype=config.floatX) 35 | 36 | data_x = shared(randn(n_examples, 1, 32, 32)) 37 | data_y = shared(randint((n_examples,), outputs)) 38 | 39 | si = lscalar() 40 | nsi = lscalar() 41 | sx = data_x[si:si + nsi] 42 | sy = data_y[si:si + nsi] 43 | 44 | bmark = open("%s_convnet_%s_%s.bmark" % (socket.gethostname(), 45 | config.device, config.floatX), 'w') 46 | 47 | if config.floatX == 'float32': 48 | prec = 'float' 49 | else: 50 | prec = 'double' 51 | 52 | 53 | def reportmodel(model, batchsize, v): 54 | bmark.write("%s\t" % model) 55 | bmark.write("theano{%s/%s/%i}\t" % ( 56 | config.device[0], prec, batchsize)) 57 | bmark.write("%.2f\n" % v) 58 | 59 | 60 | def eval_and_report(train, name, batchsizes, N=n_examples): 61 | for bs in batchsizes: 62 | assert N % bs == 0 # can't be cheatin now... 63 | t = time.time() 64 | for i in xrange(N / bs): 65 | cost = train(i * bs, bs) 66 | if not (i % (1000 / bs)): 67 | print i * bs, cost 68 | reportmodel(name, bs, N / (time.time() - t)) 69 | 70 | 71 | def bench_ConvSmall(batchsize): 72 | data_x.set_value(randn(n_examples, 1, 32, 32)) 73 | w0 = shared(rand(6, 1, 5, 5) * numpy.sqrt(6 / (25.))) 74 | b0 = shared(zeros(6)) 75 | w1 = shared(rand(16, 6, 5, 5) * numpy.sqrt(6 / (25.))) 76 | b1 = shared(zeros(16)) 77 | vv = shared(rand(16 * 5 * 5, 120) * numpy.sqrt(6.0 / 16. / 25)) 78 | cc = shared(zeros(120)) 79 | v = shared(zeros(120, outputs)) 80 | c = shared(zeros(outputs)) 81 | params = [w0, b0, w1, b1, v, c, vv, cc] 82 | 83 | c0 = tanh(conv2d(sx, w0, image_shape=(batchsize, 1, 32, 32), 84 | filter_shape=(6, 1, 5, 5)) + b0.dimshuffle(0, 'x', 'x')) 85 | # this is not the correct leNet5 model, but it's closer to 86 | s0 = tanh(max_pool_2d(c0, (2, 2))) 87 | 88 | c1 = tanh(conv2d(s0, w1, image_shape=(batchsize, 6, 14, 14), 89 | filter_shape=(16, 6, 5, 5)) + 90 | b1.dimshuffle(0, 'x', 'x')) 91 | s1 = tanh(max_pool_2d(c1, (2, 2))) 92 | 93 | p_y_given_x = softmax(dot(tanh(dot(s1.flatten(2), vv) + cc), v) + c) 94 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 95 | cost = nll.mean() 96 | 97 | gparams = grad(cost, params) 98 | 99 | train = function([si, nsi], cost, 100 | updates=[(p, p - lr * gp) for p, gp in zip(params, gparams)]) 101 | 102 | eval_and_report(train, "ConvSmall", [batchsize], N=600) 103 | 104 | 105 | def bench_ConvMed(batchsize): 106 | data_x.set_value(randn(n_examples, 1, 96, 96)) 107 | w0 = shared(rand(6, 1, 7, 7) * numpy.sqrt(6 / (25.))) 108 | b0 = shared(zeros(6)) 109 | w1 = shared(rand(16, 6, 7, 7) * numpy.sqrt(6 / (25.))) 110 | b1 = shared(zeros(16)) 111 | vv = shared(rand(16 * 8 * 8, 120) * numpy.sqrt(6.0 / 16. 
/ 25)) 112 | cc = shared(zeros(120)) 113 | v = shared(zeros(120, outputs)) 114 | c = shared(zeros(outputs)) 115 | params = [w0, b0, w1, b1, v, c, vv, cc] 116 | 117 | c0 = tanh(conv2d(sx, w0, image_shape=(batchsize, 1, 96, 96), 118 | filter_shape=(6, 1, 7, 7)) + b0.dimshuffle(0, 'x', 'x')) 119 | # this is not the correct leNet5 model, but it's closer to 120 | s0 = tanh(max_pool_2d(c0, (3, 3))) 121 | 122 | c1 = tanh(conv2d(s0, w1, image_shape=(batchsize, 6, 30, 30), 123 | filter_shape=(16, 6, 7, 7)) + b1.dimshuffle(0, 'x', 'x')) 124 | s1 = tanh(max_pool_2d(c1, (3, 3))) 125 | 126 | p_y_given_x = softmax(dot(tanh(dot(s1.flatten(2), vv) + cc), v) + c) 127 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 128 | cost = nll.mean() 129 | 130 | gparams = grad(cost, params) 131 | 132 | train = function([si, nsi], cost, 133 | updates=[(p, p - lr * gp) for p, gp in zip(params, gparams)]) 134 | eval_and_report(train, "ConvMed", [batchsize], N=120) 135 | 136 | 137 | def bench_ConvLarge(batchsize): 138 | data_x.set_value(randn(n_examples, 1, 256, 256)) 139 | w0 = shared(rand(6, 1, 7, 7) * numpy.sqrt(6 / (25.))) 140 | b0 = shared(zeros(6)) 141 | w1 = shared(rand(16, 6, 7, 7) * numpy.sqrt(6 / (25.))) 142 | b1 = shared(zeros(16)) 143 | vv = shared(rand(16 * 11 * 11, 120) * numpy.sqrt(6.0 / 16. / 25)) 144 | cc = shared(zeros(120)) 145 | v = shared(zeros(120, outputs)) 146 | c = shared(zeros(outputs)) 147 | params = [w0, b0, w1, b1, v, c, vv, cc] 148 | 149 | c0 = tanh(conv2d(sx, w0, image_shape=(batchsize, 1, 256, 256), 150 | filter_shape=(6, 1, 7, 7)) + b0.dimshuffle(0, 'x', 'x')) 151 | # this is not the correct leNet5 model, but it's closer to 152 | s0 = tanh(max_pool_2d(c0, (5, 5))) 153 | 154 | c1 = tanh(conv2d(s0, w1, image_shape=(batchsize, 6, 50, 50), 155 | filter_shape=(16, 6, 7, 7)) + b1.dimshuffle(0, 'x', 'x')) 156 | s1 = tanh(max_pool_2d(c1, (4, 4))) 157 | 158 | p_y_given_x = softmax(dot(tanh(dot(s1.flatten(2), vv) + cc), v) + c) 159 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 160 | cost = nll.mean() 161 | 162 | gparams = grad(cost, params) 163 | 164 | train = function([si, nsi], cost, 165 | updates=[(p, p - lr * gp) for p, gp in zip(params, gparams)]) 166 | eval_and_report(train, "ConvLarge", [batchsize], N=120) 167 | 168 | if __name__ == '__main__': 169 | bench_ConvSmall(1) 170 | bench_ConvSmall(60) 171 | bench_ConvMed(1) 172 | bench_ConvMed(60) 173 | bench_ConvLarge(1) 174 | bench_ConvLarge(60) 175 | -------------------------------------------------------------------------------- /theano/mlp.py: -------------------------------------------------------------------------------- 1 | import time, socket 2 | from theano.tensor import lscalar, lvector, matrix, tanh, dot, grad, log, arange 3 | from theano.tensor.nnet import softmax, crossentropy_softmax_argmax_1hot_with_bias 4 | from theano import shared, function, config 5 | import numpy, theano 6 | from numpy import asarray, random 7 | random.seed(2344) 8 | 9 | import theano.tensor.blas_c 10 | 11 | def rand(*size): 12 | return asarray(random.rand(*size), dtype=config.floatX) 13 | def randn(*size): 14 | return asarray(random.randn(*size), dtype=config.floatX) 15 | def randint(size, high): 16 | return asarray(random.randint(size=size, low=0, high=high), dtype='int32') 17 | def zeros(*size): 18 | return numpy.zeros(size, dtype=config.floatX) 19 | 20 | n_examples=6000 21 | inputs=784 22 | outputs=10 23 | lr=numpy.asarray(0.01, dtype=config.floatX) 24 | 25 | batchsize=60 26 | 27 | data_x = shared(randn(n_examples, inputs)) 28 | data_y = 
shared(randint((n_examples,), outputs)) 29 | 30 | si = lscalar() 31 | nsi = lscalar() 32 | sx = data_x[si:si + nsi] 33 | sy = data_y[si:si + nsi] 34 | 35 | bmark = open("%smlp_%s_%s.bmark" %( 36 | socket.gethostname(), 37 | config.device, 38 | config.floatX), 39 | 'w') 40 | 41 | def reportmodel(model, batchsize, t): 42 | bmark.write("%s\t" % model) 43 | if config.floatX == 'float32': 44 | prec = 'float' 45 | else: 46 | prec = 'double' 47 | bmark.write("theano{%s/%s/%i}\t" % ( 48 | config.device[0], prec, batchsize)) 49 | bmark.write("%.2f\n"%(n_examples/t)) # report examples / second 50 | 51 | def eval_and_report(train, name): 52 | if 1: 53 | t = time.time() 54 | for i in xrange(n_examples): 55 | train(i, 1) 56 | reportmodel(name, 1, time.time()-t) 57 | 58 | if 0:# repeat w batchsize 59 | t = time.time() 60 | for i in xrange(n_examples/batchsize): 61 | cost = train(i*batchsize, batchsize) 62 | if not (i % 20): 63 | print i*batchsize, cost 64 | reportmodel(name, batchsize, time.time()-t) 65 | 66 | 67 | def online_mlp_784_10(): 68 | v = shared(zeros(outputs, inputs)) 69 | c = shared(zeros(outputs)) 70 | si = shared(0) # current training example index 71 | sx = data_x[si] 72 | sy = data_y[si] 73 | 74 | nll, p_y_given_x, _argmax = crossentropy_softmax_argmax_1hot_with_bias( 75 | dot(sx, v.T).dimshuffle('x', 0), 76 | c, 77 | sy.dimshuffle('x')) 78 | cost = nll.mean() 79 | gv, gc = grad(cost, [v, c]) 80 | train = function([], [], 81 | updates={ 82 | v:v - lr * gv, 83 | c:c - lr * gc, 84 | si: (si + 1) % n_examples}) 85 | theano.printing.debugprint(train, file=open('foo_train', 'wb')) 86 | t = time.time() 87 | train.fn(n_calls=n_examples) 88 | dt = time.time() - t 89 | try: 90 | train.fn.update_profile(train.profile) 91 | except AttributeError: 92 | pass 93 | reportmodel('mlp_784_10_hack', 1, dt) 94 | if 1: 95 | t = time.time() 96 | for i in xrange(n_examples): 97 | train() 98 | dt = time.time() - t 99 | reportmodel('mlp_784_10_hack2', 1, dt) 100 | if 1: 101 | t = time.time() 102 | fn = train.fn 103 | for i in xrange(n_examples): fn() 104 | dt = time.time() - t 105 | reportmodel('mlp_784_10_hack3', 1, dt) 106 | 107 | def online_mlp_784_500_10(): 108 | HUs=500 109 | w = shared(rand(HUs, inputs) * numpy.sqrt(6 / (inputs + HUs))) 110 | b = shared(zeros(HUs)) 111 | v = shared(zeros(outputs,HUs)) 112 | c = shared(zeros(outputs)) 113 | si = shared(0) # current training example index 114 | sx = data_x[si] 115 | sy = data_y[si] 116 | 117 | nll, p_y_given_x, _argmax = crossentropy_softmax_argmax_1hot_with_bias( 118 | dot(tanh(dot(sx, w.T)+b), v.T).dimshuffle('x', 0), 119 | c, 120 | sy.dimshuffle('x')) 121 | cost = nll.mean() 122 | gw, gb, gv, gc = grad(cost, [w, b, v, c]) 123 | train = function([], [], 124 | updates={ 125 | w:w - lr * gw, 126 | b:b - lr * gb, 127 | v:v - lr * gv, 128 | c:c - lr * gc, 129 | si: (si + 1) % n_examples}) 130 | theano.printing.debugprint(train, file=open('foo_train', 'wb')) 131 | t = time.time() 132 | train.fn(n_calls=n_examples) 133 | dt = time.time() - t 134 | try: 135 | train.fn.update_profile(train.profile) 136 | except AttributeError: 137 | pass 138 | reportmodel('mlp_784_500_10_hack', 1, dt) 139 | 140 | def online_mlp_784_1000_1000_1000_10(): 141 | w0 = shared(rand(inputs, 1000) * numpy.sqrt(6 / (inputs + 1000))) 142 | b0 = shared(zeros(1000)) 143 | w1 = shared(rand(1000, 1000) * numpy.sqrt(6 / (1000+1000))) 144 | b1 = shared(zeros(1000)) 145 | w2 = shared(rand(1000, 1000) * numpy.sqrt(6 / (1000+1000))) 146 | b2 = shared(zeros(1000)) 147 | v = shared(zeros(1000, 
outputs)) 148 | c = shared(zeros(outputs)) 149 | params=[w0,b0,w1,b1,w2,b2,v,c] 150 | 151 | si = shared(0) # current training example index 152 | sx = data_x[si] 153 | sy = data_y[si] 154 | h0 = tanh(dot(sx, w0)+b0) 155 | h1 = tanh(dot(h0, w1)+b1) 156 | h2 = tanh(dot(h1, w2)+b2) 157 | 158 | nll, p_y_given_x, _argmax = crossentropy_softmax_argmax_1hot_with_bias( 159 | dot(h2, v).dimshuffle('x', 0), 160 | c, 161 | sy.dimshuffle('x')) 162 | cost = nll.mean() 163 | gparams = grad(cost, params) 164 | updates = [(p,p-lr*gp) for p,gp in zip(params, gparams)] 165 | updates += [(si, (si + 1) % n_examples)] 166 | train = function([], [], updates=updates) 167 | theano.printing.debugprint(train, file=open('foo_train', 'wb')) 168 | t = time.time() 169 | train.fn(n_calls=n_examples) 170 | dt = time.time() - t 171 | try: 172 | train.fn.update_profile(train.profile) 173 | except AttributeError: 174 | pass 175 | reportmodel('mlp_784_1000_1000_1000_10_hack', 1, dt) 176 | 177 | def bench_logreg(): 178 | v = shared(zeros(outputs, inputs)) 179 | c = shared(zeros(outputs)) 180 | # 181 | # Note on the transposed-ness of v for some reason, this data layout is faster than the 182 | # non-transposed orientation. 183 | # The change doesn't make much difference in the deeper models, 184 | # but in this case it was more than twice as fast. 185 | # 186 | 187 | p_y_given_x = softmax(dot(sx, v.T) + c) 188 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 189 | cost = nll.mean() 190 | 191 | gv, gc = grad(cost, [v, c]) 192 | 193 | theano.printing.debugprint(grad(cost, [v, c]), file=open('foo', 'wb')) 194 | train = function([si, nsi], [], 195 | updates={ v:v - lr * gv, c:c - lr * gc }) 196 | theano.printing.debugprint(train, file=open('foo_train', 'wb')) 197 | 198 | eval_and_report(train, "mlp_784_10") 199 | print v.get_value().mean() 200 | print v.get_value()[:5,:5] 201 | 202 | def bench_mlp_500(): 203 | HUs=500 204 | w = shared(rand(HUs, inputs) * numpy.sqrt(6 / (inputs + HUs))) 205 | b = shared(zeros(HUs)) 206 | v = shared(zeros(outputs,HUs)) 207 | c = shared(zeros(outputs)) 208 | 209 | p_y_given_x = softmax(dot(tanh(dot(sx, w.T)+b), v.T)+c) 210 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 211 | cost = nll.mean() 212 | 213 | gw,gb,gv,gc = grad(cost, [w,b,v,c]) 214 | 215 | train = function([si, nsi], cost, 216 | updates={ w:w-lr*gw, 217 | b:b-lr*gb, 218 | v:v-lr*gv, 219 | c:c-lr*gc }) 220 | eval_and_report(train, "mlp_784_500_10") 221 | 222 | def bench_deep1000(): 223 | w0 = shared(rand(inputs, 1000) * numpy.sqrt(6 / (inputs + 1000))) 224 | b0 = shared(zeros(1000)) 225 | w1 = shared(rand(1000, 1000) * numpy.sqrt(6 / (1000+1000))) 226 | b1 = shared(zeros(1000)) 227 | w2 = shared(rand(1000, 1000) * numpy.sqrt(6 / (1000+1000))) 228 | b2 = shared(zeros(1000)) 229 | v = shared(zeros(1000, outputs)) 230 | c = shared(zeros(outputs)) 231 | params=[w0,b0,w1,b1,w2,b2,v,c] 232 | 233 | h0 = tanh(dot(sx, w0)+b0) 234 | h1 = tanh(dot(h0, w1)+b1) 235 | h2 = tanh(dot(h1, w2)+b2) 236 | 237 | p_y_given_x = softmax(dot(h2, v)+c) 238 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 239 | cost = nll.mean() 240 | 241 | gparams = grad(cost, params) 242 | 243 | train = function([si, nsi], cost, 244 | updates=[(p,p-lr*gp) for p,gp in zip(params, gparams)]) 245 | eval_and_report(train, "mlp_784_1000_1000_1000_10") 246 | 247 | if __name__ == '__main__': 248 | online_mlp_784_10() 249 | online_mlp_784_500_10() 250 | bench_logreg() 251 | bench_mlp_500() 252 | #online_mlp_784_1000_1000_1000_10() 253 | #bench_deep1000() 254 | 
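The online_* variants above avoid the per-call Python overhead by keeping the
example index in a shared variable and then driving the compiled function's
underlying fn directly, as in train.fn(n_calls=n_examples). A minimal sketch
of that pattern (toy sizes and made-up names, using the same legacy Theano API
as this file; the weight update is a stand-in, not a real gradient step):

import time
import numpy
import theano

# a shared cursor walks over a shared dataset, so the step takes no arguments
data = theano.shared(numpy.random.rand(100, 5))
w = theano.shared(numpy.zeros(5))
si = theano.shared(0)                      # current example index
step = theano.function([], [],
        updates={w: w + 0.01 * data[si],   # stand-in for a gradient update
                 si: (si + 1) % 100})

t = time.time()
for i in xrange(10000):                    # one Python round-trip per example
    step()
print 'python loop:', time.time() - t

t = time.time()
step.fn(n_calls=10000)                     # drive the compiled thunk directly
print 'fn(n_calls=...):', time.time() - t  # the *_hack timings use this path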
-------------------------------------------------------------------------------- /theano/rbm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy as np 4 | import sys 5 | import time 6 | from theano.tensor import lscalar, dot, sum as tsum 7 | from theano.tensor.nnet import sigmoid 8 | from theano import shared, function, config 9 | 10 | rng = np.random.RandomState(342) 11 | 12 | def rand(*size): 13 | return np.asarray(rng.rand(*size), dtype=config.floatX) 14 | def randn(*size): 15 | return np.asarray(rng.randn(*size), dtype=config.floatX) 16 | def randint(size, high): 17 | return np.asarray(rng.randint(size=size, low=0, high=high), dtype='int32') 18 | def zeros(*size): 19 | return np.zeros(size, dtype=config.floatX) 20 | 21 | # c: aa.cc 22 | 23 | nin, nout, batchsize, niter = [int(a) for a in sys.argv[1:]] 24 | lr = 0.01 25 | 26 | # declare data 27 | data_x = shared(rand(batchsize*niter, nin)) 28 | si = lscalar() 29 | nsi = lscalar() 30 | 31 | # declare model weights 32 | a = shared(zeros(nin)) 33 | w = shared(zeros(nin, nout)) 34 | b = shared(zeros(nout)) 35 | 36 | import theano.sandbox.rng_mrg 37 | R = theano.sandbox.rng_mrg.MRG_RandomStreams() 38 | 39 | def bern(x, size): 40 | return R.binomial(size=size, p=x, n=1, dtype=config.floatX) 41 | 42 | pos_vis = data_x[si:si+nsi] 43 | 44 | pos_hid = sigmoid(dot(pos_vis, w)+b) 45 | 46 | neg_vis = sigmoid(dot(bern(pos_hid, (batchsize, nout)), w.T)+a) 47 | 48 | neg_hid = sigmoid(dot(bern(neg_vis, (batchsize, nin)), w) + b) 49 | 50 | new_a = a - lr * tsum(pos_vis - neg_vis, axis=0) 51 | new_b = b - lr * tsum(pos_hid - neg_hid, axis=0) 52 | new_w = w - lr * (dot(pos_vis.T, pos_hid) - dot(neg_vis.T, neg_hid)) 53 | 54 | f = function([si, nsi], [], updates={a:new_a, b:new_b, w:new_w}) 55 | 56 | t = time.time() 57 | for i in xrange(niter): 58 | f(i*batchsize, batchsize) 59 | 60 | print 'cd1 rbm_bernoulli %i_%i\ttheano{%s/%s/%i}\t%.2f' %( 61 | nin, nout, 62 | config.device[0], 63 | ('float' if config.floatX == 'float32' else 'double'), 64 | batchsize, 65 | niter*batchsize/(time.time() - t)) 66 | 67 | 68 | -------------------------------------------------------------------------------- /theano/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # FOR MAGGIE I INSTALLED MKL SO DO LIKE THIS: 5 | # LD_LIBRARY_PATH to include /u/bergstrj/pub/intel/mkl/10.2.4.032/lib/em64t 6 | # LIBRARY_PATH to include /u/bergstrj/pub/intel/mkl/10.2.4.032/lib/em64t 7 | # THEANO_FLAGS="device=cpu,floatX=float64,blas.ldflags=-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_def -lpthread" python mlp.py 8 | 9 | 10 | MKL32='linker=c|py_nogc,device=cpu,floatX=float32,blas.ldflags=-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_def' 11 | MKL64='linker=c|py_nogc,device=cpu,floatX=float64,blas.ldflags=-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_def' 12 | GPU32='linker=c|py_nogc,device=gpu0,floatX=float32' 13 | 14 | 15 | #THEANO_FLAGS="$MKL32" python mlp.py 16 | #THEANO_FLAGS="$MKL64" python mlp.py 17 | #THEANO_FLAGS="$GPU32" python mlp.py 18 | 19 | #THEANO_FLAGS="$MKL32" python convnet.py 20 | #THEANO_FLAGS="$MKL64" python convnet.py 21 | #THEANO_FLAGS="$GPU32" python convnet.py 22 | 23 | 24 | cat /proc/cpuinfo |grep "model name"|uniq > ${HOSTNAME}_config.conf 25 | free >> ${HOSTNAME}_config.conf 26 | uname -a >> ${HOSTNAME}_config.conf 27 | 28 | THEANO_FLAGS="$MKL32" python rbm.py 1024 
1024 1 100 > ${HOSTNAME}_rbm_cpu32_b1.bmark 29 | THEANO_FLAGS="$MKL32" python rbm.py 1024 1024 60 20 > ${HOSTNAME}_rbm_cpu32_b60.bmark 30 | 31 | THEANO_FLAGS="$MKL64" python rbm.py 1024 1024 1 100 > ${HOSTNAME}_rbm_cpu64_b1.bmark 32 | THEANO_FLAGS="$MKL64" python rbm.py 1024 1024 60 20 > ${HOSTNAME}_rbm_cpu64_b60.bmark 33 | 34 | #THEANO_FLAGS="$GPU32" python rbm.py 1024 1024 1 100 > ${HOSTNAME}_rbm_gpu32_b1.bmark 35 | #THEANO_FLAGS="$GPU32" python rbm.py 1024 1024 60 20 > ${HOSTNAME}_rbm_gpu32_b60.bmark 36 | 37 | -------------------------------------------------------------------------------- /torch5/MiniBatchGradient.lua: -------------------------------------------------------------------------------- 1 | require "lab" 2 | 3 | local MiniBatchGradient = torch.class('nn.MiniBatchGradient') 4 | 5 | function MiniBatchGradient:__init(module, criterion, batchSize) 6 | self.learningRate = 0.01 7 | self.learningRateDecay = 0 8 | self.maxIteration = 25 9 | self.shuffleIndices = true 10 | self.module = module 11 | self.criterion = criterion 12 | self.batchSize = batchSize or -1 13 | end 14 | 15 | function MiniBatchGradient:train(dataset) 16 | local iteration = 1 17 | local currentLearningRate = self.learningRate 18 | local module = self.module 19 | local criterion = self.criterion 20 | 21 | local shuffledIndices = lab.randperm(dataset:size()) 22 | if not self.shuffleIndices then 23 | for t = 1,dataset:size() do 24 | shuffledIndices[t] = t 25 | end 26 | end 27 | 28 | -- batchSize < 1 means full batch 29 | if self.batchSize < 1 then 30 | self.batchSize = dataset:size() 31 | end 32 | 33 | print("# MiniBatchGradient: training with batch size: " .. self.batchSize) 34 | 35 | while true do 36 | local currentError = 0 37 | module:zeroGradParameters() 38 | for t = 1,dataset:size() do 39 | local example = dataset[shuffledIndices[t]] 40 | local input = example[1] 41 | local target = example[2] 42 | 43 | currentError = currentError + criterion:forward(module:forward(input), target) 44 | 45 | module:backward(input, criterion:backward(module.output, target)) 46 | 47 | if t % self.batchSize == 0 then 48 | module:updateParameters(currentLearningRate) 49 | module:zeroGradParameters() 50 | end 51 | 52 | if self.hookExample then 53 | self.hookExample(self, example) 54 | end 55 | end 56 | 57 | if self.hookIteration then 58 | self.hookIteration(self, iteration) 59 | end 60 | 61 | currentError = currentError / dataset:size() 62 | print("# current error = " ..
currentError) 63 | iteration = iteration + 1 64 | currentLearningRate = self.learningRate/(1+iteration*self.learningRateDecay) 65 | if self.maxIteration > 0 and iteration > self.maxIteration then 66 | print("# MiniBatchGradient: you have reached the maximum number of iterations") 67 | break 68 | end 69 | end 70 | end 71 | 72 | function MiniBatchGradient:write(file) 73 | file:writeDouble(self.learningRate) 74 | file:writeDouble(self.learningRateDecay) 75 | file:writeInt(self.maxIteration) 76 | file:writeBool(self.shuffleIndices) 77 | file:writeObject(self.module) 78 | file:writeObject(self.criterion) 79 | file:writeLong(self.batchSize) 80 | end 81 | 82 | function MiniBatchGradient:read(file) 83 | self.learningRate = file:readDouble() 84 | self.learningRateDecay = file:readDouble() 85 | self.maxIteration = file:readInt() 86 | self.shuffleIndices = file:readBool() 87 | self.module = file:readObject() 88 | self.criterion = file:readObject() 89 | self.batchSize = file:readLong() 90 | end 91 | -------------------------------------------------------------------------------- /torch5/mlp.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | 3 | require "lab" 4 | require "os" 5 | require "nn" 6 | 7 | 8 | n_examples=12000; 9 | outputs=10; 10 | 11 | io.output("torch5.bmark") 12 | 13 | if true then -- MLP 32/10 14 | dataset={}; 15 | function dataset:size() return n_examples end 16 | inputs=32; 17 | for i=1,dataset:size() do 18 | dataset[i] = {lab.randn(inputs), (i % outputs)+1} 19 | end 20 | mlp = nn.Sequential(); -- make a multi-layer perceptron 21 | mlp:add(nn.Linear(inputs, outputs)) 22 | mlp:add(nn.LogSoftMax()) 23 | 24 | criterion = nn.ClassNLLCriterion() 25 | trainer = nn.StochasticGradient(mlp, criterion) 26 | 27 | trainer.learningRate = 0.01 28 | trainer.shuffleIndices = false 29 | trainer.maxIteration = 1 30 | local x = os.clock() 31 | trainer:train(dataset) 32 | -- we're not using Xent, but using Xent would be even slower 33 | io.write(string.format("mlp_%i_%i", inputs, outputs), "\t", 34 | "torch5", "\t", 35 | string.format("%.2f\n", n_examples/(os.clock() - x))) 36 | end 37 | 38 | 39 | dataset={}; 40 | function dataset:size() return n_examples end 41 | inputs=784; 42 | 43 | 44 | for i=1,dataset:size() do 45 | dataset[i] = {lab.randn(inputs), (i % outputs)+1} 46 | end 47 | 48 | if true -- MLP 784/10 49 | then 50 | mlp = nn.Sequential(); -- make a multi-layer perceptron 51 | mlp:add(nn.Linear(inputs, outputs)) 52 | mlp:add(nn.LogSoftMax()) 53 | 54 | criterion = nn.ClassNLLCriterion() 55 | trainer = nn.StochasticGradient(mlp, criterion) 56 | 57 | trainer.learningRate = 0.01 58 | trainer.shuffleIndices = false 59 | trainer.maxIteration = 1 60 | local x = os.clock() 61 | trainer:train(dataset) 62 | -- we're not using Xent, but using Xent would be even slower 63 | io.write(string.format("mlp_%i_%i", inputs, outputs), "\t", 64 | "torch5", "\t", 65 | string.format("%.2f\n", n_examples/(os.clock() - x))) 66 | else 67 | io.write(string.format("# mlp_%i_%i", inputs, outputs), "\t", 68 | "torch5", "\t", 69 | "0.0", "\n") 70 | end 71 | 72 | 73 | if true -- MLP 784/500/10 74 | then 75 | 76 | mlp = nn.Sequential(); -- make a multi-layer perceptron 77 | mlp:add(nn.Linear(inputs, 500)) 78 | mlp:add(nn.Tanh()) 79 | mlp:add(nn.Linear(500, outputs)) 80 | mlp:add(nn.LogSoftMax()) 81 | 82 | criterion = nn.ClassNLLCriterion() 83 | trainer = nn.StochasticGradient(mlp, criterion) 84 | 85 | trainer.learningRate = 0.01 86 | trainer.shuffleIndices
= false 87 | trainer.maxIteration = 1 88 | local x = os.clock() 89 | trainer:train(dataset) 90 | -- we're not using Xent, but using Xent would be even slower 91 | io.write(string.format("mlp_%i_500_%i", inputs, outputs), "\t", 92 | "torch5", "\t", 93 | string.format("%.2f\n", n_examples/(os.clock() - x))) 94 | else 95 | io.write(string.format("# mlp_%i_500_%i", inputs, outputs), "\t", 96 | "torch5", "\t", 97 | "0.0", "\n") 98 | end 99 | 100 | 101 | if true --MLP 784/1000/1000/1000/10 102 | then 103 | 104 | mlp = nn.Sequential(); -- make a multi-layer perceptron 105 | mlp:add(nn.Linear(inputs, 1000)) 106 | mlp:add(nn.Tanh()) 107 | mlp:add(nn.Linear(1000, 1000)) 108 | mlp:add(nn.Tanh()) 109 | mlp:add(nn.Linear(1000, 1000)) 110 | mlp:add(nn.Tanh()) 111 | mlp:add(nn.Linear(1000, outputs)) 112 | mlp:add(nn.LogSoftMax()) 113 | 114 | criterion = nn.ClassNLLCriterion() 115 | trainer = nn.StochasticGradient(mlp, criterion) 116 | 117 | trainer.learningRate = 0.01 118 | trainer.shuffleIndices = false 119 | trainer.maxIteration = 1 120 | local x = os.clock() 121 | trainer:train(dataset) 122 | -- we're not using Xent, but using Xent would be even slower 123 | io.write("mlp_784_1000_1000_1000_10", "\t", 124 | "torch5", "\t", 125 | string.format("%.2f\n", n_examples/(os.clock() - x))) 126 | 127 | else 128 | io.write("# mlp_784_1000_1000_1000_10", "\t", 129 | "torch5", "\t", 130 | "0.0", "\n") 131 | 132 | end 133 | 134 | dset_32x32={}; 135 | function dset_32x32:size() return n_examples end 136 | for i=1,dset_32x32:size() do 137 | dset_32x32[i] = {lab.randn(32,32,1), (i % outputs)+1} 138 | end 139 | 140 | if true --LeNet5-like 32x32 141 | then 142 | 143 | -- There is no max-pooling implemented, just avg pooling. 144 | -- So I added tanh between every layer to separate the true conv layers from 145 | -- the subsampling (which is just a convolution with 1s) 146 | 147 | mlp = nn.Sequential(); -- make a multi-layer perceptron 148 | mlp:add(nn.SpatialConvolution(1, 6, 5, 5)) -- output 28x28 149 | mlp:add(nn.Tanh()) 150 | mlp:add(nn.SpatialSubSampling(6, 2, 2, 2, 2)) --output 14x14 151 | mlp:add(nn.Tanh()) 152 | mlp:add(nn.SpatialConvolution(6, 16, 5, 5)) -- output 10x10 153 | mlp:add(nn.Tanh()) 154 | mlp:add(nn.SpatialSubSampling(16, 2, 2, 2, 2)) -- output 5x5 155 | mlp:add(nn.Tanh()) 156 | mlp:add(nn.Reshape(16*5*5)) 157 | mlp:add(nn.Linear(16*5*5, 120)) 158 | mlp:add(nn.Linear(120, outputs)) 159 | mlp:add(nn.LogSoftMax()) 160 | 161 | criterion = nn.ClassNLLCriterion() 162 | trainer = nn.StochasticGradient(mlp, criterion) 163 | 164 | trainer.learningRate = 0.01 165 | trainer.shuffleIndices = false 166 | trainer.maxIteration = 1 167 | local x = os.clock() 168 | trainer:train(dset_32x32) 169 | -- we're not using Xent, but using Xent would be even slower 170 | io.write("ConvSmall", "\t", 171 | "torch5", "\t", 172 | string.format("%.2f\n", n_examples/(os.clock() - x))) 173 | end 174 | 175 | dset_96x96={}; 176 | function dset_96x96:size() return 100 end 177 | for i=1,dset_96x96:size() do 178 | dset_96x96[i] = {lab.randn(96,96,1), (i % outputs)+1} 179 | end 180 | 181 | if true --LeNet5-like 96x96 182 | then 183 | 184 | -- There is no max-pooling implemented, just avg pooling.
185 | -- So I added tanh between every layer to separate the true conv layers from 186 | -- the subsampling (which is just a convolution with 1s) 187 | 188 | mlp = nn.Sequential(); -- make a multi-layer perceptron 189 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 90x90 190 | mlp:add(nn.Tanh()) 191 | mlp:add(nn.SpatialSubSampling(6, 3, 3, 3, 3)) --output 30x30 192 | mlp:add(nn.Tanh()) 193 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 24x24 194 | mlp:add(nn.Tanh()) 195 | mlp:add(nn.SpatialSubSampling(16, 3, 3, 3, 3)) -- output 8x8 196 | mlp:add(nn.Tanh()) 197 | mlp:add(nn.Reshape(16*8*8)) 198 | mlp:add(nn.Linear(16*8*8, 120)) 199 | mlp:add(nn.Linear(120, outputs)) 200 | mlp:add(nn.LogSoftMax()) 201 | 202 | criterion = nn.ClassNLLCriterion() 203 | trainer = nn.StochasticGradient(mlp, criterion) 204 | 205 | trainer.learningRate = 0.01 206 | trainer.shuffleIndices = false 207 | trainer.maxIteration = 1 208 | local x = os.clock() 209 | trainer:train(dset_96x96) 210 | -- we're not using Xent, but using Xent would be even slower 211 | io.write("ConvMed", "\t", 212 | "torch5", "\t", 213 | string.format("%.2f\n", dset_96x96:size()/(os.clock() - x))) 214 | end 215 | 216 | 217 | dset_256x256={}; 218 | function dset_256x256:size() return 20 end 219 | for i=1,dset_256x256:size() do 220 | dset_256x256[i] = {lab.randn(256,256,1), (i % outputs)+1} 221 | end 222 | 223 | if true --LeNet5-like 256x256 224 | then 225 | 226 | -- There is no max-pooling implemented, just avg pooling. 227 | -- So I added tanh between every layer to separate the true conv layers from 228 | -- the subsampling (which is just a convolution with 1s) 229 | 230 | mlp = nn.Sequential(); -- make a multi-layer perceptron 231 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 250x250 232 | mlp:add(nn.Tanh()) 233 | mlp:add(nn.SpatialSubSampling(6, 5, 5, 5, 5)) --output 50x50 234 | mlp:add(nn.Tanh()) 235 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 44x44 236 | mlp:add(nn.Tanh()) 237 | mlp:add(nn.SpatialSubSampling(16, 4, 4, 4, 4)) -- output 11x11 238 | mlp:add(nn.Tanh()) 239 | mlp:add(nn.Reshape(16*11*11)) 240 | mlp:add(nn.Linear(16*11*11, 120)) 241 | mlp:add(nn.Linear(120, outputs)) 242 | mlp:add(nn.LogSoftMax()) 243 | 244 | criterion = nn.ClassNLLCriterion() 245 | trainer = nn.StochasticGradient(mlp, criterion) 246 | 247 | trainer.learningRate = 0.01 248 | trainer.shuffleIndices = false 249 | trainer.maxIteration = 1 250 | local x = os.clock() 251 | trainer:train(dset_256x256) 252 | -- we're not using Xent, but using Xent would be even slower 253 | io.write("ConvLarge", "\t", 254 | "torch5", "\t", 255 | string.format("%.2f\n", dset_256x256:size()/(os.clock() - x))) 256 | end 257 | -------------------------------------------------------------------------------- /torch5/mlp_minibatch.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | 3 | require "lab" 4 | require "os" 5 | require "nn" 6 | 7 | -- When discussing Torch's performance by email, 8 | -- Ronan sent me this file for doing mini-batches. 9 | -- It seems to be an unofficial feature; I don't know why it isn't in the 10 | -- main distribution...
11 | dofile('MiniBatchGradient.lua') 12 | 13 | 14 | n_examples=12000; 15 | outputs=10; 16 | 17 | io.output("torch5_minibatch.bmark") 18 | 19 | if true then -- MLP 32/10 20 | dataset={}; 21 | function dataset:size() return n_examples end 22 | inputs=32; 23 | for i=1,dataset:size() do 24 | dataset[i] = {lab.randn(inputs), (i % outputs)+1} 25 | end 26 | mlp = nn.Sequential(); -- make a multi-layer perceptron 27 | mlp:add(nn.Linear(inputs, outputs)) 28 | mlp:add(nn.LogSoftMax()) 29 | 30 | criterion = nn.ClassNLLCriterion() 31 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 32 | 33 | trainer.learningRate = 0.01 34 | trainer.shuffleIndices = false 35 | trainer.maxIteration = 1 36 | local x = os.clock() 37 | trainer:train(dataset) 38 | -- we're not using Xent, but using Xent would be even slower 39 | io.write(string.format("mlp_%i_%i", inputs, outputs), "\t", 40 | "torch5{60}", "\t", 41 | string.format("%.2f\n", n_examples/(os.clock() - x))) 42 | end 43 | 44 | 45 | dataset={}; 46 | function dataset:size() return n_examples end 47 | inputs=784; 48 | 49 | 50 | for i=1,dataset:size() do 51 | dataset[i] = {lab.randn(inputs), (i % outputs)+1} 52 | end 53 | 54 | if true -- MLP 784/10 55 | then 56 | mlp = nn.Sequential(); -- make a multi-layer perceptron 57 | mlp:add(nn.Linear(inputs, outputs)) 58 | mlp:add(nn.LogSoftMax()) 59 | 60 | criterion = nn.ClassNLLCriterion() 61 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 62 | 63 | trainer.learningRate = 0.01 64 | trainer.shuffleIndices = false 65 | trainer.maxIteration = 1 66 | local x = os.clock() 67 | trainer:train(dataset) 68 | -- we're not using Xent, but using Xent would be even slower 69 | io.write(string.format("mlp_%i_%i", inputs, outputs), "\t", 70 | "torch5{60}", "\t", 71 | string.format("%.2f\n", n_examples/(os.clock() - x))) 72 | else 73 | io.write(string.format("# mlp_%i_%i", inputs, outputs), "\t", 74 | "torch5{60}", "\t", 75 | "0.0", "\n") 76 | end 77 | 78 | 79 | if true -- MLP 784/500/10 80 | then 81 | 82 | mlp = nn.Sequential(); -- make a multi-layer perceptron 83 | mlp:add(nn.Linear(inputs, 500)) 84 | mlp:add(nn.Tanh()) 85 | mlp:add(nn.Linear(500, outputs)) 86 | mlp:add(nn.LogSoftMax()) 87 | 88 | criterion = nn.ClassNLLCriterion() 89 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 90 | 91 | trainer.learningRate = 0.01 92 | trainer.shuffleIndices = false 93 | trainer.maxIteration = 1 94 | local x = os.clock() 95 | trainer:train(dataset) 96 | -- we're not using Xent, but using Xent would be even slower 97 | io.write(string.format("mlp_%i_500_%i", inputs, outputs), "\t", 98 | "torch5{60}", "\t", 99 | string.format("%.2f\n", n_examples/(os.clock() - x))) 100 | else 101 | io.write(string.format("# mlp_%i_500_%i", inputs, outputs), "\t", 102 | "torch5{60}", "\t", 103 | "0.0", "\n") 104 | end 105 | 106 | 107 | if true --MLP 784/1000/1000/1000/10 108 | then 109 | 110 | mlp = nn.Sequential(); -- make a multi-layer perceptron 111 | mlp:add(nn.Linear(inputs, 1000)) 112 | mlp:add(nn.Tanh()) 113 | mlp:add(nn.Linear(1000, 1000)) 114 | mlp:add(nn.Tanh()) 115 | mlp:add(nn.Linear(1000, 1000)) 116 | mlp:add(nn.Tanh()) 117 | mlp:add(nn.Linear(1000, outputs)) 118 | mlp:add(nn.LogSoftMax()) 119 | 120 | criterion = nn.ClassNLLCriterion() 121 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 122 | 123 | trainer.learningRate = 0.01 124 | trainer.shuffleIndices = false 125 | trainer.maxIteration = 1 126 | local x = os.clock() 127 | trainer:train(dataset) 128 | -- we're not using Xent, but using Xent would be even slower 129 |
io.write("mlp_784_1000_1000_1000_10", "\t", 130 | "torch5{60}", "\t", 131 | string.format("%.2f\n", n_examples/(os.clock() - x))) 132 | 133 | else 134 | io.write("# mlp_784_1000_1000_1000_10", "\t", 135 | "torch5{60}", "\t", 136 | "0.0", "\n") 137 | 138 | end 139 | 140 | dset_32x32={}; 141 | function dset_32x32:size() return n_examples end 142 | for i=1,dset_32x32:size() do 143 | dset_32x32[i] = {lab.randn(32,32,1), (i % outputs)+1} 144 | end 145 | 146 | if true --LeNet5-like 32x32 147 | then 148 | 149 | -- There is no max-pooling implemented, just avg pooling. 150 | -- So I added tanh between every layer to separate the true conv layers from 151 | -- the subsampling (which is just a convolution with 1s) 152 | 153 | mlp = nn.Sequential(); -- make a multi-layer perceptron 154 | mlp:add(nn.SpatialConvolution(1, 6, 5, 5)) -- output 28x28 155 | mlp:add(nn.Tanh()) 156 | mlp:add(nn.SpatialSubSampling(6, 2, 2, 2, 2)) --output 14x14 157 | mlp:add(nn.Tanh()) 158 | mlp:add(nn.SpatialConvolution(6, 16, 5, 5)) -- output 10x10 159 | mlp:add(nn.Tanh()) 160 | mlp:add(nn.SpatialSubSampling(16, 2, 2, 2, 2)) -- output 5x5 161 | mlp:add(nn.Tanh()) 162 | mlp:add(nn.Reshape(16*5*5)) 163 | mlp:add(nn.Linear(16*5*5, 120)) 164 | mlp:add(nn.Linear(120, outputs)) 165 | mlp:add(nn.LogSoftMax()) 166 | 167 | criterion = nn.ClassNLLCriterion() 168 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 169 | 170 | trainer.learningRate = 0.01 171 | trainer.shuffleIndices = false 172 | trainer.maxIteration = 1 173 | local x = os.clock() 174 | trainer:train(dset_32x32) 175 | -- we're not using Xent, but using Xent would be even slower 176 | io.write("ConvSmall", "\t", 177 | "torch5{60}", "\t", 178 | string.format("%.2f\n", n_examples/(os.clock() - x))) 179 | end 180 | 181 | dset_96x96={}; 182 | function dset_96x96:size() return 100 end 183 | for i=1,dset_96x96:size() do 184 | dset_96x96[i] = {lab.randn(96,96,1), (i % outputs)+1} 185 | end 186 | 187 | if true --LeNet5-like 96x96 188 | then 189 | 190 | -- There is no max-pooling implemented, just avg pooling.
191 | -- So I added tanh between every layer to separate the true conv layers from 192 | -- the subsampling (which is just a convolution with 1s) 193 | 194 | mlp = nn.Sequential(); -- make a multi-layer perceptron 195 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 90x90 196 | mlp:add(nn.Tanh()) 197 | mlp:add(nn.SpatialSubSampling(6, 3, 3, 3, 3)) --output 30x30 198 | mlp:add(nn.Tanh()) 199 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 24x24 200 | mlp:add(nn.Tanh()) 201 | mlp:add(nn.SpatialSubSampling(16, 3, 3, 3, 3)) -- output 8x8 202 | mlp:add(nn.Tanh()) 203 | mlp:add(nn.Reshape(16*8*8)) 204 | mlp:add(nn.Linear(16*8*8, 120)) 205 | mlp:add(nn.Linear(120, outputs)) 206 | mlp:add(nn.LogSoftMax()) 207 | 208 | criterion = nn.ClassNLLCriterion() 209 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 210 | 211 | trainer.learningRate = 0.01 212 | trainer.shuffleIndices = false 213 | trainer.maxIteration = 1 214 | local x = os.clock() 215 | trainer:train(dset_96x96) 216 | -- we're not using Xent, but using Xent would be even slower 217 | io.write("ConvMed", "\t", 218 | "torch5{60}", "\t", 219 | string.format("%.2f\n", dset_96x96:size()/(os.clock() - x))) 220 | end 221 | 222 | 223 | dset_256x256={}; 224 | function dset_256x256:size() return 20 end 225 | for i=1,dset_256x256:size() do 226 | dset_256x256[i] = {lab.randn(256,256,1), (i % outputs)+1} 227 | end 228 | 229 | if true --LeNet5-like 256x256 230 | then 231 | 232 | -- There is no max-pooling implemented, just avg pooling. 233 | -- So I added tanh between every layer to separate the true conv layers from 234 | -- the subsampling (which is just a convolution with 1s) 235 | 236 | mlp = nn.Sequential(); -- make a multi-layer perceptron 237 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 250x250 238 | mlp:add(nn.Tanh()) 239 | mlp:add(nn.SpatialSubSampling(6, 5, 5, 5, 5)) --output 50x50 240 | mlp:add(nn.Tanh()) 241 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 44x44 242 | mlp:add(nn.Tanh()) 243 | mlp:add(nn.SpatialSubSampling(16, 4, 4, 4, 4)) -- output 11x11 244 | mlp:add(nn.Tanh()) 245 | mlp:add(nn.Reshape(16*11*11)) 246 | mlp:add(nn.Linear(16*11*11, 120)) 247 | mlp:add(nn.Linear(120, outputs)) 248 | mlp:add(nn.LogSoftMax()) 249 | 250 | criterion = nn.ClassNLLCriterion() 251 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 252 | 253 | trainer.learningRate = 0.01 254 | trainer.shuffleIndices = false 255 | trainer.maxIteration = 1 256 | local x = os.clock() 257 | trainer:train(dset_256x256) 258 | -- we're not using Xent, but using Xent would be even slower 259 | io.write("ConvLarge", "\t", 260 | "torch5{60}", "\t", 261 | string.format("%.2f\n", dset_256x256:size()/(os.clock() - x))) 262 | end 263 | -------------------------------------------------------------------------------- /torch5/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ./mlp.lua 4 | 5 | mv torch5.bmark ${HOSTNAME}_torch5.bmark 6 | 7 | ./mlp_minibatch.lua 8 | mv torch5_minibatch.bmark ${HOSTNAME}_torch5_minibatch.bmark 9 | -------------------------------------------------------------------------------- /torch7/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | run.sh.results* 3 | lib 4 | -------------------------------------------------------------------------------- /torch7/README.txt: -------------------------------------------------------------------------------- 1 | The code in this directory was forked from 2 | 3 |
https://github.com/andresy/benchmark/commit/cd81345962bc05fe4819a56a675681605ea1587f 4 | 5 | Installing Torch 7 6 | ------------------ 7 | 8 | Torch 7 (https://github.com/andresy/torch) is required to run the scripts in 9 | this folder. I had personal help from Koray to install torch7. It was 10 | straightforward once he convinced me not to use luarocks. Torch7 failed to 11 | find the openblas I installed, so I had to trick it post-compilation by 12 | setting the LD_LIBRARY_PATH to include a symlink with the right name to my 13 | libopenblas.so. Use ldd on the libTH.so built by torch7 to see what name you 14 | must give to this fake library. 15 | 16 | 17 | Running timing experiments 18 | -------------------------- 19 | 20 | The file run.sh produces a number of timing files whose names are of the form 21 | run.sh.results_${HOSTNAME}_b[1,10,100]_p[32,64][,_openmp,_cuda] 22 | 23 | The cuda trials are run on the GPU device 0, and simply fail if no GPU is 24 | present. 25 | 26 | 27 | Adding results to DB 28 | -------------------- 29 | 30 | To add the current timing results to ../db.pkl, type: 31 | 32 | $ python add_to_db.py --db ../db.pkl run.sh.results_* 33 | -------------------------------------------------------------------------------- /torch7/SpatialConvolutionFast.lua: -------------------------------------------------------------------------------- 1 | local SpatialConvolutionFast, parent = torch.class('nn.SpatialConvolutionFast', 'nn.Module') 2 | 3 | function SpatialConvolutionFast:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH) 4 | parent.__init(self) 5 | 6 | dW = dW or 1 7 | dH = dH or 1 8 | 9 | self.nInputPlane = nInputPlane 10 | self.nOutputPlane = nOutputPlane 11 | self.kW = kW 12 | self.kH = kH 13 | self.dW = dW 14 | self.dH = dH 15 | 16 | self.weight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW) 17 | self.bias = torch.Tensor(nOutputPlane) 18 | self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW) 19 | self.gradBias = torch.Tensor(nOutputPlane) 20 | 21 | self.finput = torch.Tensor() 22 | self.fgradInput = torch.Tensor() 23 | 24 | self:reset() 25 | end 26 | 27 | function SpatialConvolutionFast:reset(stdv) 28 | if stdv then 29 | stdv = stdv * math.sqrt(3) 30 | else 31 | stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane) 32 | end 33 | self.weight:apply(function() 34 | return random.uniform(-stdv, stdv) 35 | end) 36 | self.bias:apply(function() 37 | return random.uniform(-stdv, stdv) 38 | end) 39 | end 40 | 41 | function SpatialConvolutionFast:forward(input) 42 | input = input:unfold(2, self.kH, self.dH) 43 | input = input:unfold(3, self.kW, self.dW) 44 | input = input:transpose(2,4) 45 | input = input:transpose(3,5) 46 | 47 | self.finput:resize(self.kW*self.kH*self.nInputPlane, input:size(4)*input:size(5)):copy(input) 48 | 49 | self.output:resize(self.nOutputPlane, input:size(4), input:size(5)) 50 | local output = input.new(self.output:storage(), 1, self.nOutputPlane, -1, input:size(4)*input:size(5), -1):copy( 51 | input.new(self.bias:storage(), 1, self.nOutputPlane, 1, input:size(4)*input:size(5), 0)) 52 | 53 | output:addmm(1, self.weight, self.finput) 54 | return self.output 55 | end 56 | 57 | function SpatialConvolutionFast:backward(input, gradOutput) 58 | if self.gradInput then 59 | gradOutput = input.new(gradOutput:storage(), 1, gradOutput:size(1), -1, gradOutput:size(2)*gradOutput:size(3), -1) 60 | 61 | self.fgradInput:resizeAs(self.finput):zero() 62 | self.fgradInput:addmm(1, self.weight:t(), gradOutput) 63 | 64 | self.gradInput:resizeAs(input):zero() 65 | local 
gradInput = self.gradInput:unfold(2, self.kH, self.dH) 66 | gradInput = gradInput:unfold(3, self.kW, self.dW) 67 | gradInput = gradInput:transpose(2,4) 68 | gradInput = gradInput:transpose(3,5) 69 | gradInput:add(self.fgradInput) 70 | 71 | return self.gradInput 72 | 73 | end 74 | end 75 | 76 | function SpatialConvolutionFast:accGradParameters(input, gradOutput, scale) 77 | gradOutput = input.new(gradOutput:storage(), 1, gradOutput:size(1), -1, gradOutput:size(2)*gradOutput:size(3), -1) 78 | self.gradWeight:addmm(1, gradOutput, self.finput:t()) 79 | input.new(self.gradBias:storage(), 1, gradOutput:size(1), 1, gradOutput:size(2), 0):add(gradOutput) 80 | end 81 | -------------------------------------------------------------------------------- /torch7/add_to_db.py: -------------------------------------------------------------------------------- 1 | """ 2 | Write the results of run.sh to the pickled database of timing results 3 | """ 4 | import os 5 | import sys 6 | import cPickle 7 | 8 | def main(): 9 | assert sys.argv[1] == '--db' 10 | try: 11 | db = cPickle.load(open(sys.argv[2])) 12 | except IOError: 13 | db = [] 14 | 15 | for results_file in sys.argv[3:]: 16 | template = dict() 17 | for lineno, line in enumerate(open(results_file)): 18 | if '=' in line: 19 | key = line[:line.index('=')] 20 | val = line[line.index('=') + 1:] 21 | if key in ('host', 'device'): 22 | template[key] = val.strip() 23 | elif key in ('OpenMP',): 24 | template[key] = bool(int(val)) 25 | elif key in ('batch', 'precision'): 26 | template[key] = int(val) 27 | else: 28 | raise ValueError(key) 29 | 30 | elif line.startswith('mlp') or line.startswith('cnn'): 31 | problem, speed_str = line.split('\t') 32 | entry = dict(template) 33 | entry['problem'] = problem 34 | entry['speed'] = float(speed_str) 35 | db.append(entry) 36 | elif line.strip(): # ignore blank lines; anything else is unexpected 37 | print "ERROR: ", line 38 | 39 | if 1: 40 | print "Writing database to", sys.argv[2] 41 | cPickle.dump(db, open(sys.argv[2], 'wb')) 42 | else: 43 | print "DEBUG FINAL DB:" 44 | for entry in db: 45 | print entry 46 | 47 | if __name__ == '__main__': 48 | sys.exit(main()) 49 | -------------------------------------------------------------------------------- /torch7/benchmark.lua: -------------------------------------------------------------------------------- 1 | require "lab" 2 | require "nn" 3 | 4 | cmd = torch.CmdLine() 5 | 6 | cmd:text() 7 | cmd:text('Benchmark Torch7') 8 | cmd:text() 9 | cmd:text() 10 | cmd:text('Misc options:') 11 | cmd:option('-nomlp', false, 'do not perform MLP tests') 12 | cmd:option('-nocnn', false, 'do not perform CNN tests') 13 | cmd:option('-nexmlp', 60000, '# of examples for the MLPs') 14 | cmd:option('-nexcnn', 6000, '# of examples for the CNNs') 15 | cmd:option('-hardtanh', false, 'use hardtanh instead of tanh') 16 | cmd:option('-convfast', false, 'use "fast" convolution code instead of standard') 17 | cmd:option('-openmp', false, 'use openmp *package*') 18 | cmd:option('-double', false, 'use doubles instead of floats') 19 | cmd:option('-cuda', false, 'use CUDA (float32 on the GPU)') 20 | cmd:option('-gi', false, 'compute gradInput') 21 | cmd:option('-v', false, 'be verbose') 22 | cmd:option('-batch', 1, 'batch size') 23 | 24 | cmd:text() 25 | 26 | local params = cmd:parse(arg) 27 | 28 | random.manualSeed(5555) 29 | 30 | if params.v then 31 | printlog = print 32 | else 33 | printlog = print 34 |
print = function() 35 | end 36 | end 37 | 38 | if params.openmp then 39 | require 'openmp' 40 | end 41 | 42 | if params.convfast then 43 | dofile('SpatialConvolutionFast.lua') 44 | nn.SpatialConvolution = nn.SpatialConvolutionFast 45 | end 46 | 47 | if params.hardtanh then 48 | nn.Tanh = nn.HardTanh 49 | end 50 | 51 | if params.double and params.cuda then 52 | error('make your choice between double and cuda!!') 53 | end 54 | 55 | if params.double then 56 | torch.setdefaulttensortype('torch.DoubleTensor') 57 | elseif params.cuda then 58 | require 'cunn' 59 | dofile('cudahacks.lua') 60 | torch.setdefaulttensortype('torch.CudaTensor') 61 | else 62 | torch.setdefaulttensortype('torch.FloatTensor') 63 | end 64 | 65 | local noutput = 10 66 | 67 | if not params.nomlp then 68 | 69 | local ninput = 784 70 | local dataset = {} 71 | local data = lab.randn(params.nexmlp, ninput) 72 | local label = torch.LongTensor(params.nexmlp) 73 | for i=1,params.nexmlp do 74 | label[i] = (i % noutput) + 1 75 | end 76 | 77 | if params.batch == 1 then 78 | function dataset:size() 79 | return params.nexmlp 80 | end 81 | 82 | setmetatable(dataset, {__index = function(self, index) 83 | return {data[index], label[index]} 84 | end}) 85 | else 86 | assert(params.nexmlp % params.batch == 0, '# of examples must be divisible with batch size') 87 | function dataset:size() 88 | return params.nexmlp/params.batch 89 | end 90 | setmetatable(dataset, {__index = function(self, index) 91 | return {data:narrow(1,(index-1)*params.batch+1, params.batch), 92 | label:narrow(1,(index-1)*params.batch+1, params.batch)} 93 | end}) 94 | end 95 | 96 | if true then -- MLP 784/10 97 | collectgarbage() 98 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 99 | mlp:add(nn.Linear(ninput, noutput)) 100 | 101 | if params.cuda then 102 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 103 | torch.setdefaulttensortype('torch.FloatTensor') 104 | end 105 | 106 | mlp:add(nn.LogSoftMax()) 107 | 108 | if not params.gi then 109 | if params.v then 110 | print('# do not compute gradInput') 111 | end 112 | mlp:get(1).gradInput = nil 113 | end 114 | 115 | local criterion = nn.ClassNLLCriterion() 116 | 117 | if params.cuda then 118 | torch.setdefaulttensortype('torch.CudaTensor') 119 | end 120 | 121 | local trainer = nn.StochasticGradient(mlp, criterion) 122 | 123 | trainer.learningRate = 0.01 124 | trainer.shuffleIndices = false 125 | trainer.maxIteration = 1 126 | local t = torch.Timer() 127 | trainer:train(dataset) 128 | printlog(string.format("mlp_%i_%i\t%.2f", ninput, noutput, params.nexmlp/t:time().real)) 129 | end 130 | 131 | if true then -- MLP 784/500/10 132 | collectgarbage() 133 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 134 | mlp:add(nn.Linear(ninput, 500)) 135 | mlp:add(nn.Tanh()) 136 | mlp:add(nn.Linear(500, noutput)) 137 | 138 | if params.cuda then 139 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 140 | torch.setdefaulttensortype('torch.FloatTensor') 141 | end 142 | 143 | mlp:add(nn.LogSoftMax()) 144 | 145 | if not params.gi then 146 | if params.v then 147 | print('# do not compute gradInput') 148 | end 149 | mlp:get(1).gradInput = nil 150 | end 151 | 152 | local criterion = nn.ClassNLLCriterion() 153 | 154 | if params.cuda then 155 | torch.setdefaulttensortype('torch.CudaTensor') 156 | end 157 | 158 | local trainer = nn.StochasticGradient(mlp, criterion) 159 | 160 | trainer.learningRate = 0.01 161 | trainer.shuffleIndices = false 162 | trainer.maxIteration = 1 163 | local t = torch.Timer() 
164 | trainer:train(dataset) 165 | printlog(string.format("mlp_%i_500_%i\t%.2f", ninput, noutput, params.nexmlp/t:time().real)) 166 | end 167 | 168 | 169 | if true then --MLP 784/1000/1000/1000/10 170 | collectgarbage() 171 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 172 | mlp:add(nn.Linear(ninput, 1000)) 173 | mlp:add(nn.Tanh()) 174 | mlp:add(nn.Linear(1000, 1000)) 175 | mlp:add(nn.Tanh()) 176 | mlp:add(nn.Linear(1000, 1000)) 177 | mlp:add(nn.Tanh()) 178 | mlp:add(nn.Linear(1000, noutput)) 179 | 180 | if params.cuda then 181 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 182 | torch.setdefaulttensortype('torch.FloatTensor') 183 | end 184 | 185 | mlp:add(nn.LogSoftMax()) 186 | 187 | if not params.gi then 188 | if params.v then 189 | print('# do not compute gradInput') 190 | end 191 | mlp:get(1).gradInput = nil 192 | end 193 | 194 | local criterion = nn.ClassNLLCriterion() 195 | 196 | if params.cuda then 197 | torch.setdefaulttensortype('torch.CudaTensor') 198 | end 199 | 200 | local trainer = nn.StochasticGradient(mlp, criterion) 201 | 202 | trainer.learningRate = 0.01 203 | trainer.shuffleIndices = false 204 | trainer.maxIteration = 1 205 | local t = torch.Timer() 206 | trainer:train(dataset) 207 | printlog(string.format("mlp_%i_1000_1000_1000_%i\t%.2f", ninput, noutput, params.nexmlp/t:time().real)) 208 | end 209 | end 210 | 211 | if not params.nocnn then 212 | 213 | function createcnndataset(nex,w,h) 214 | local dataset = {} 215 | local data = lab.randn(nex, 1, w, h) 216 | local label = torch.LongTensor(nex) 217 | for i=1,nex do 218 | label[i] = (i % noutput) + 1 219 | end 220 | 221 | if params.batch == 1 then 222 | function dataset:size() 223 | return nex 224 | end 225 | 226 | setmetatable(dataset, {__index = function(self, index) 227 | return {data[index], label[index]} 228 | end}) 229 | else 230 | assert(nex % params.batch == 0, '# of examples must be divisible with batch size') 231 | function dataset:size() 232 | return nex/params.batch 233 | end 234 | setmetatable(dataset, {__index = function(self, index) 235 | return {data:narrow(1,(index-1)*params.batch+1, params.batch), 236 | label:narrow(1,(index-1)*params.batch+1, params.batch)} 237 | end}) 238 | end 239 | 240 | return dataset 241 | end 242 | 243 | if true then --LeNet5-like 32x32 244 | collectgarbage() 245 | local dataset = createcnndataset(params.nexcnn, 32, 32) 246 | 247 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 248 | mlp:add(nn.SpatialConvolution(1, 6, 5, 5)) -- output 28x28 249 | mlp:add(nn.Tanh()) 250 | mlp:add(nn.SpatialSubSampling(6, 2, 2, 2, 2)) --output 14x14 251 | mlp:add(nn.Tanh()) 252 | mlp:add(nn.SpatialConvolution(6, 16, 5, 5)) -- output 10x10 253 | mlp:add(nn.Tanh()) 254 | mlp:add(nn.SpatialSubSampling(16, 2, 2, 2, 2)) -- output 5x5 255 | mlp:add(nn.Tanh()) 256 | mlp:add(nn.Reshape(16*5*5)) 257 | mlp:add(nn.Linear(16*5*5, 120)) 258 | mlp:add(nn.Linear(120, noutput)) 259 | 260 | if params.cuda then 261 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 262 | torch.setdefaulttensortype('torch.FloatTensor') 263 | end 264 | 265 | mlp:add(nn.LogSoftMax()) 266 | 267 | if not params.gi then 268 | if params.v then 269 | print('# do not compute gradInput') 270 | end 271 | mlp:get(1).gradInput = nil 272 | end 273 | 274 | local criterion = nn.ClassNLLCriterion() 275 | 276 | if params.cuda then 277 | torch.setdefaulttensortype('torch.CudaTensor') 278 | end 279 | 280 | local trainer = nn.StochasticGradient(mlp, criterion) 281 | 282 |
trainer.learningRate = 0.01 283 | trainer.shuffleIndices = false 284 | trainer.maxIteration = 1 285 | local t = torch.Timer() 286 | trainer:train(dataset) 287 | printlog(string.format("cnn_32x32\t%.2f", params.nexcnn/t:time().real)) 288 | end 289 | 290 | if true then --LeNet5-like 96x96 291 | collectgarbage() 292 | local dataset = createcnndataset(params.nexcnn, 96, 96) 293 | 294 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 295 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 90x90 296 | mlp:add(nn.Tanh()) 297 | mlp:add(nn.SpatialSubSampling(6, 3, 3, 3, 3)) --output 30x30 298 | mlp:add(nn.Tanh()) 299 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 24x24 300 | mlp:add(nn.Tanh()) 301 | mlp:add(nn.SpatialSubSampling(16, 3, 3, 3, 3)) -- output 8x8 302 | mlp:add(nn.Tanh()) 303 | mlp:add(nn.Reshape(16*8*8)) 304 | mlp:add(nn.Linear(16*8*8, 120)) 305 | mlp:add(nn.Linear(120, noutput)) 306 | 307 | if params.cuda then 308 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 309 | torch.setdefaulttensortype('torch.FloatTensor') 310 | end 311 | 312 | mlp:add(nn.LogSoftMax()) 313 | 314 | if not params.gi then 315 | if params.v then 316 | print('# do not compute gradInput') 317 | end 318 | mlp:get(1).gradInput = nil 319 | end 320 | 321 | local criterion = nn.ClassNLLCriterion() 322 | 323 | if params.cuda then 324 | torch.setdefaulttensortype('torch.CudaTensor') 325 | end 326 | 327 | local trainer = nn.StochasticGradient(mlp, criterion) 328 | 329 | trainer.learningRate = 0.01 330 | trainer.shuffleIndices = false 331 | trainer.maxIteration = 1 332 | local t = torch.Timer() 333 | trainer:train(dataset) 334 | printlog(string.format("cnn_96x96\t%.2f", params.nexcnn/t:time().real)) 335 | end 336 | 337 | if true then --LeNet5-like 256x256 338 | collectgarbage() 339 | local dataset = createcnndataset(params.nexcnn, 256, 256) 340 | 341 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 342 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 250x250 343 | mlp:add(nn.Tanh()) 344 | mlp:add(nn.SpatialSubSampling(6, 5, 5, 5, 5)) --output 50x50 345 | mlp:add(nn.Tanh()) 346 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 44x44 347 | mlp:add(nn.Tanh()) 348 | mlp:add(nn.SpatialSubSampling(16, 4, 4, 4, 4)) -- output 11x11 349 | mlp:add(nn.Tanh()) 350 | mlp:add(nn.Reshape(16*11*11)) 351 | mlp:add(nn.Linear(16*11*11, 120)) 352 | mlp:add(nn.Linear(120, noutput)) 353 | 354 | if params.cuda then 355 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 356 | torch.setdefaulttensortype('torch.FloatTensor') 357 | end 358 | 359 | mlp:add(nn.LogSoftMax()) 360 | 361 | if not params.gi then 362 | if params.v then 363 | print('# do not compute gradInput') 364 | end 365 | mlp:get(1).gradInput = nil 366 | end 367 | 368 | local criterion = nn.ClassNLLCriterion() 369 | 370 | if params.cuda then 371 | torch.setdefaulttensortype('torch.CudaTensor') 372 | end 373 | 374 | local trainer = nn.StochasticGradient(mlp, criterion) 375 | 376 | trainer.learningRate = 0.01 377 | trainer.shuffleIndices = false 378 | trainer.maxIteration = 1 379 | local t = torch.Timer() 380 | trainer:train(dataset) 381 | printlog(string.format("cnn_256x256\t%.2f", params.nexcnn/t:time().real)) 382 | end 383 | end 384 | -------------------------------------------------------------------------------- /torch7/cudahacks.lua: -------------------------------------------------------------------------------- 1 | torch.CudaTensor.lab = {} 2 | 3 | local lab = torch.CudaTensor.lab 4 | 5 | function 
lab.randn(...) 6 | local t = torch.FloatTensor.lab.randn(...) 7 | return torch.Tensor(t:size()):copy(t) 8 | end 9 | 10 | -- local nn = torch.CudaTensor.nn 11 | 12 | -- function nn.LogSoftMax_forward(self, input) 13 | -- local t = torch.FloatTensor(input:size()):copy(input) 14 | -- self.output = torch.FloatTensor() 15 | -- return torch.FloatTensor.nn.LogSoftMax_forward(self, t) 16 | -- end 17 | 18 | -------------------------------------------------------------------------------- /torch7/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # HACKS TO USE OPENBLAS 4 | export LIBRARY_PATH=./lib:~/.VENV/base/lib:$LIBRARY_PATH 5 | export LD_LIBRARY_PATH=./lib:~/.VENV/base/lib:$LD_LIBRARY_PATH 6 | 7 | #-convfast use "fast" convolution code instead of standard [false] 8 | #-openmp use openmp *package* [false] 9 | #-double use doubles instead of floats [false] 10 | #-cuda use CUDA (float32 on the GPU) [false] 11 | #-batch batch size [1] 12 | #-gi compute gradInput [false] 13 | #-v be verbose [false] 14 | 15 | # this would use GEMM for convolution; Koray said this was not used 16 | # because it makes a huge unrolled matrix for large problems. 17 | USE_CONVFAST="" 18 | 19 | for batchsize in 1 10 100 ; do 20 | for PREC in 32 64 ; do 21 | if true ; then 22 | OUTPUT=run.sh.results_${HOSTNAME}_b${batchsize}_p${PREC} 23 | echo "Running normal" $OUTPUT 24 | echo "host=$HOSTNAME" > "$OUTPUT" 25 | echo "device=CPU" >> "$OUTPUT" 26 | echo "OpenMP=0" >> "$OUTPUT" 27 | echo "batch=$batchsize" >> "$OUTPUT" 28 | echo "precision=$PREC" >> "$OUTPUT" 29 | if [ $PREC = 32 ] ; then 30 | USE_DOUBLE="" 31 | else 32 | USE_DOUBLE="-double" 33 | fi 34 | 35 | ~/local/bin/lua benchmark.lua -batch $batchsize $USE_DOUBLE >> "$OUTPUT" 36 | fi 37 | 38 | if true ; then 39 | OUTPUT=run.sh.results_${HOSTNAME}_b${batchsize}_p${PREC}_openmp 40 | echo "Running OpenMP " $OUTPUT 41 | echo "host=$HOSTNAME" > "$OUTPUT" 42 | echo "device=CPU" >> "$OUTPUT" 43 | echo "OpenMP=1" >> "$OUTPUT" 44 | echo "batch=$batchsize" >> "$OUTPUT" 45 | echo "precision=$PREC" >> "$OUTPUT" 46 | if [ $PREC = 32 ] ; then 47 | USE_DOUBLE="" 48 | else 49 | USE_DOUBLE="-double" 50 | fi 51 | ~/local/bin/lua benchmark.lua -batch $batchsize $USE_DOUBLE -openmp >> "$OUTPUT" 52 | fi 53 | 54 | if true ; then 55 | OUTPUT=run.sh.results_${HOSTNAME}_b${batchsize}_p${PREC}_cuda 56 | echo "Running CUDA " $OUTPUT 57 | echo "host=$HOSTNAME" > "$OUTPUT" 58 | echo "device=GTX480" >> "$OUTPUT" 59 | echo "OpenMP=0" >> "$OUTPUT" 60 | echo "batch=$batchsize" >> "$OUTPUT" 61 | echo "precision=32" >> "$OUTPUT" 62 | # CUDA runs are always float32 (benchmark.lua rejects -double together with -cuda) 63 | ~/local/bin/lua benchmark.lua -batch $batchsize -cuda >> "$OUTPUT" 64 | fi 65 | done 66 | done 67 | 68 | 69 | --------------------------------------------------------------------------------
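A closing usage note: after add_to_db.py has run, each record in db.pkl is a plain dict. The following minimal sketch (it assumes only the fields that add_to_db.py above actually writes; the output format is illustrative) lists the stored timings:

    import cPickle

    db = cPickle.load(open('db.pkl'))
    for entry in db:
        # fields written by add_to_db.py: host, device, OpenMP,
        # batch, precision, problem, speed
        print '%s\t%s\tbatch=%s\t%.2f examples/s' % (
            entry['problem'], entry['device'], entry['batch'], entry['speed'])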