├── .gitignore
├── README.txt
├── c
│   ├── Makefile
│   └── aa.cc
├── eblearn
│   ├── Makefile
│   ├── convnet.cc
│   ├── convnet256.cc
│   ├── convnet96.cc
│   ├── mlp.cc
│   ├── mnist_example.cc
│   └── run.sh
├── numpy
│   ├── aa_numpy.py
│   ├── logreg.py
│   ├── mlp.py
│   ├── rbm.py
│   └── run.sh
├── reports
│   ├── ascii.py
│   ├── build_csv.py
│   ├── show_csv.py
│   └── task_pdfs.py
├── theano
│   ├── aa.py
│   ├── control.py
│   ├── convnet.py
│   ├── mlp.py
│   ├── rbm.py
│   └── run.sh
├── torch5
│   ├── MiniBatchGradient.lua
│   ├── mlp.lua
│   ├── mlp_minibatch.lua
│   └── run.sh
└── torch7
    ├── .gitignore
    ├── README.txt
    ├── SpatialConvolutionFast.lua
    ├── add_to_db.py
    ├── benchmark.lua
    ├── cudahacks.lua
    └── run.sh
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
code/mnist.pkl.gz
html
*.pyc
*.swp
*.x
*~
*.bmark
db.pkl
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
Intro
=====

The benchmarking folder contains efforts to benchmark Theano against various
other systems. Each subfolder corresponds to a particular style of
implementation. Since there is a variety of benchmark problems and of software
systems, there is no standard way to run the whole benchmark suite. There is,
however, a standard for how each benchmark should report its results: every
benchmark run should produce one or more files with the results of
benchmarking. These files must end with the extension '.bmark', and must
contain tab-separated 'csv' lines of the form:

task<TAB>implementation name<TAB>examples/second


Current Tasks
=============

Dense
-----

mlp_784_10
 - training on 10K MNIST-sized examples with unregularized logistic regression (crossentropy / NLL error)

mlp_784_500_10
 - training on 10K examples with a single-hidden-layer model with 500 hidden units

mlp_784_1000_1000_1000_10
 - training on 10K examples with multiple hidden layers

cd1 rbm_bernoulli 1024_1024
 - train an RBM on 10K 1024-dimensional inputs

daa_1024_1024
 - train a denoising autoassociator on 10K 1024-dimensional inputs

Convolutional
-------------

ConvSmall
 - train on 10K 32x32 inputs, as in LeNet5
 - convnet_32x32_c5x5_s2x2_c5x5_s2x2_120_10

ConvMed
 - train on 10K 96x96 images

ConvLarge
 - train on 10K 256x256 images


Potential Tasks
===============

Dense
-----

mlp_32_10
 - training on 10K tiny examples with unregularized logistic regression (crossentropy / NLL error)

mlp_784_10 with L1
 - training on 10K examples with L1 regularization

mlp_784_10 with L2
 - training on 10K examples with L2 regularization

aa_64_64
 - train an autoassociator on 10K 64-dimensional inputs

aa_1024_1024
 - train an autoassociator on 10K 1024-dimensional inputs

cd1 rbm_bernoulli 64_64
 - train an RBM on 10K 64-dimensional inputs

Convolutional
-------------

LeNet5_32x32x3
 - train on 10K Tiny-Image-sized inputs (in color)

conv_daa_i32x32_f7x7
 - train a convolutional denoising autoassociator (32x32 inputs, 7x7 filters)

conv_daa_i256x256_f9x9
 - train a convolutional denoising autoassociator (256x256 inputs, 9x9 filters)

Recurrent
---------
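To make the '.bmark' convention above concrete, here is a minimal sketch (not
part of the suite; the file name and entry values are made up) of writing one
result line and parsing it back the same way reports/build_csv.py does:

# Python 2, matching the scripts in this repository
# write one result line: task <TAB> implementation name <TAB> examples/second
f = open('examplehost_mlp.bmark', 'w')
f.write('mlp_784_500_10\ttheano{gpu/float/60}\t%.2f\n' % 1234.5)
f.close()

# read it back as build_csv.py does
for line in open('examplehost_mlp.bmark'):
    task, impl, speed = line[:-1].split('\t')[:3]
    print task, impl, float(speed)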
--------------------------------------------------------------------------------
/c/Makefile:
--------------------------------------------------------------------------------
aa.x : aa.cc
	g++ -O3 -ffast-math aa.cc -o aa.x -L${PUB_PREFIX}/lib -lgsl ${THEANO_BLAS_LDFLAGS}

clean :
	rm aa.x
--------------------------------------------------------------------------------
/c/aa.cc:
--------------------------------------------------------------------------------
/*
 *
 * g++ -O2 -ffast-math -I$PUB_PREFIX/include aa.cc -o aa.x -lgsl -lgslcblas
 *
 * g++ -O2 -ffast-math -I$PUB_PREFIX/include aa.cc -o aa.x -L$PUB_PREFIX/lib -lgsl -lcblas -lgoto -lgfortran
 *
 * ./aa.x 10 5 7 1000
 *
 * */
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include <gsl/gsl_blas.h>
#include <gsl/gsl_rng.h>

double pytime(const struct timeval * tv)
{
    return (double) tv->tv_sec + (double) tv->tv_usec / 1000000.0;
}

int main(int argc, char **argv)
{
    assert(argc == 5);

    int neg = strtol(argv[1], 0, 0);
    int nout = strtol(argv[2], 0, 0);
    int nin = nout;
    int nhid = strtol(argv[3], 0, 0);
    int niter = strtol(argv[4], 0, 0);
    double lr = 0.01;
    gsl_rng * rng = gsl_rng_alloc(gsl_rng_taus);
    gsl_rng_set(rng, 234);


    gsl_matrix * x = gsl_matrix_alloc(neg, nin);
    gsl_matrix * w = gsl_matrix_alloc(nin, nhid);
    gsl_vector * a = gsl_vector_alloc(nhid);
    gsl_vector * b = gsl_vector_alloc(nout);
    gsl_matrix * xw = gsl_matrix_alloc(neg, nhid);
    gsl_matrix * hid = gsl_matrix_alloc(neg, nhid);
    gsl_matrix * hidwt = gsl_matrix_alloc(neg, nout);
    gsl_matrix * g_hidwt = gsl_matrix_alloc(neg, nout);
    gsl_matrix * g_hid = gsl_matrix_alloc(neg, nhid);
    gsl_matrix * g_w = gsl_matrix_alloc(nout, nhid);
    gsl_vector * g_b = gsl_vector_alloc(nout);

    for (int i = 0; i < neg*nout; ++i) x->data[i] = (gsl_rng_uniform(rng) - 0.5) * 1.5;
    for (int i = 0; i < nout*nhid; ++i) w->data[i] = gsl_rng_uniform(rng);
    for (int i = 0; i < nhid; ++i) a->data[i] = 0.0;
    for (int i = 0; i < nout; ++i) b->data[i] = 0.0;

    struct timeval tv0, tv1;

    struct timeval tdot0, tdot1;
    double time_of_dot = 0.0;

    gettimeofday(&tv0, 0);
    double err = 0.0;
    for (int iter = 0; iter < niter; ++iter)
    {
        gettimeofday(&tdot0, 0);
        gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, x, w, 0.0, xw);
        gettimeofday(&tdot1, 0);
        time_of_dot += pytime(&tdot1) - pytime(&tdot0);

        for (int i = 0; i < neg; ++i)
            for (int j = 0; j < nhid; ++j)
            {
                double act = xw->data[i*nhid+j] + a->data[j];
                hid->data[i*nhid+j] = tanh(act);
            }

        gettimeofday(&tdot0, 0);
        gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, hid, w, 0.0, hidwt);
        gettimeofday(&tdot1, 0);
        time_of_dot += pytime(&tdot1) - pytime(&tdot0);

        for (int i = 0; i < nout; ++i) g_b->data[i] = 0.0;
        err = 0.0;
        for (int i = 0; i < neg; ++i)
            for (int j = 0; j < nout; ++j)
            {
                double act = hidwt->data[i*nout+j] + b->data[j];
                double out = tanh(act);
                double g_out = out - x->data[i*nout+j];
                err += g_out * g_out;
                g_hidwt->data[i*nout+j] = g_out * (1.0 - out*out);
                g_b->data[j] += g_hidwt->data[i*nout+j];
            }
        for (int i = 0; i < nout; ++i) b->data[i] -= lr * g_b->data[i];

        if (1)
        {
            gettimeofday(&tdot0, 0);
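            /*
             * The two dgemm calls below are the backward pass through the
             * tied weight matrix: since out = tanh(hid * W^T + b), the
             * gradient w.r.t. the hidden layer is g_hid = g_hidwt * W, and
             * the output-side weight gradient is g_w = g_hidwt^T * hid.
             */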
gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, g_hidwt, w, 0.0, g_hid); 105 | gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, g_hidwt, hid, 0.0, g_w); 106 | gettimeofday(&tdot1, 0); 107 | time_of_dot += pytime(&tdot1) - pytime(&tdot0); 108 | 109 | 110 | for (int i = 0; i < neg; ++i) 111 | for (int j = 0; j < nhid; ++j) 112 | { 113 | g_hid->data[i*nhid+j] *= (1.0 - hid->data[i*nhid+j] * hid->data[i*nhid+j]); 114 | a->data[j] -= lr * g_hid->data[i*nhid+j]; 115 | } 116 | 117 | gettimeofday(&tdot0, 0); 118 | gsl_blas_dgemm(CblasTrans, CblasNoTrans, -lr, x, g_hid, 1.0, w); 119 | gettimeofday(&tdot1, 0); 120 | time_of_dot += pytime(&tdot1) - pytime(&tdot0); 121 | for (int i = 0; i < nout*nhid; ++i) w->data[i] -= lr * g_w->data[i]; 122 | } 123 | 124 | } 125 | gettimeofday(&tv1, 0); 126 | 127 | double total_time = pytime(&tv1) - pytime(&tv0); 128 | fprintf(stdout, "took = %lfs to get err %lf\n", total_time, 0.5 * err); 129 | fprintf(stdout, "... of which %.2lfs was spent in dgemm (fraction: %.2lf)\n", time_of_dot, time_of_dot / total_time); 130 | //skip freeing 131 | return 0; 132 | } 133 | 134 | -------------------------------------------------------------------------------- /eblearn/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: mnist_example_ipp.x mnist_example_noipp.x convnet_noipp.x convnet_ipp.x convnet96_ipp.x convnet96_noipp.x convnet256_ipp.x convnet256_noipp.x 3 | 4 | clean: 5 | rm *.x 6 | 7 | mnist_example_ipp.x : mnist_example.cc 8 | g++ -I${PUB_PREFIX}/eblearn_ipp -o mnist_example_ipp.x mnist_example.cc\ 9 | -L/u/bergstrj/pub/intel/ipp/6.1.2.051/em64t/sharedlib\ 10 | -L${PUB_PREFIX}/eblearn_ipp -leblearn -lippiem64t -pthread 11 | 12 | 13 | mnist_example_noipp.x : mnist_example.cc 14 | g++ -O2 -I${PUB_PREFIX}/eblearn_noipp -o mnist_example_noipp.x mnist_example.cc\ 15 | -L${PUB_PREFIX}/eblearn_noipp -leblearn 16 | 17 | convnet_noipp.x : convnet.cc 18 | g++ -O2 -I${PUB_PREFIX}/eblearn_noipp -o convnet_noipp.x convnet.cc\ 19 | -L${PUB_PREFIX}/eblearn_noipp -leblearn 20 | 21 | convnet96_noipp.x : convnet96.cc 22 | g++ -O2 -I${PUB_PREFIX}/eblearn_noipp -o convnet96_noipp.x convnet96.cc\ 23 | -L${PUB_PREFIX}/eblearn_noipp -leblearn 24 | 25 | convnet256_noipp.x : convnet256.cc 26 | g++ -O2 -I${PUB_PREFIX}/eblearn_noipp -o convnet256_noipp.x convnet256.cc\ 27 | -L${PUB_PREFIX}/eblearn_noipp -leblearn 28 | 29 | convnet_ipp.x : convnet.cc 30 | g++ -DUSED_IPP -O2 -I${PUB_PREFIX}/eblearn_ipp -o convnet_ipp.x convnet.cc\ 31 | -L/u/bergstrj/pub/intel/ipp/6.1.2.051/em64t/sharedlib\ 32 | -L${PUB_PREFIX}/eblearn_ipp -leblearn -lippiem64t -pthread 33 | 34 | convnet96_ipp.x : convnet96.cc 35 | g++ -DUSED_IPP -O2 -I${PUB_PREFIX}/eblearn_ipp -o convnet96_ipp.x convnet96.cc\ 36 | -L/u/bergstrj/pub/intel/ipp/6.1.2.051/em64t/sharedlib\ 37 | -L${PUB_PREFIX}/eblearn_ipp -leblearn -lippiem64t -pthread 38 | 39 | convnet256_ipp.x : convnet256.cc 40 | g++ -DUSED_IPP -O2 -I${PUB_PREFIX}/eblearn_ipp -o convnet256_ipp.x convnet256.cc\ 41 | -L/u/bergstrj/pub/intel/ipp/6.1.2.051/em64t/sharedlib\ 42 | -L${PUB_PREFIX}/eblearn_ipp -leblearn -lippiem64t -pthread 43 | 44 | -------------------------------------------------------------------------------- /eblearn/convnet.cc: -------------------------------------------------------------------------------- 1 | #include "libeblearn.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | using namespace ebl; // all eblearn objects are under the ebl namespace 7 | 8 | static double time_time() // a time function like 
time.time() 9 | { 10 | struct timeval tv; 11 | gettimeofday(&tv, 0); 12 | return (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0; 13 | } 14 | 15 | typedef double t_net; 16 | 17 | int main(int argc, char **argv) { // regular main without gui 18 | init_drand(92394); // initialize random seed 19 | 20 | intg n_examples = 1000; // maximum training set size: 60000 21 | idxdim dims(1,32,32); // get order and dimensions of sample 22 | 23 | //! create 1-of-n targets with target 1.0 for shown class, -1.0 for the rest 24 | idx targets = create_target_matrix(10, 1.0); 25 | idx inputs(n_examples, 32, 32); 26 | 27 | parameter theparam(60000); // create trainable parameter 28 | lenet5 l5(theparam, 32, 32, 5, 5, 2, 2, 5, 5, 2, 2, 120, 10); 29 | // TODO: use an all-to-all connection table in second layer convolution 30 | // Because that's what the other packages implement. 31 | supervised_euclidean_machine thenet( 32 | (module_1_1&)l5, 33 | targets, 34 | dims); 35 | supervised_trainer thetrainer(thenet, theparam); 36 | classifier_meter trainmeter, testmeter; 37 | forget_param_linear fgp(1, 0.5); 38 | thenet.forget(fgp); 39 | 40 | // learning parameters 41 | gd_param gdp(/* double leta*/ 0.0001, 42 | /* double ln */ 0.0, 43 | /* double l1 */ 0.0, 44 | /* double l2 */ 0.0, 45 | /* int dtime */ 0, 46 | /* double iner */0.0, 47 | /* double a_v */ 0.0, 48 | /* double a_t */ 0.0, 49 | /* double g_t*/ 0.0); 50 | infer_param infp; 51 | 52 | state_idx dummy_input(1, 32, 32); 53 | int J = 2000; 54 | double t = time_time(); 55 | for (intg j = 0; j < J; ++j) 56 | { 57 | thetrainer.learn_sample(dummy_input, j%10, gdp); 58 | // TODO: iterate over mock dataset to simulate more realistic 59 | // memaccess pattern 60 | } 61 | #ifdef USED_IPP 62 | cout << "ConvSmall\teblearn{ipp}\t" << J / (time_time() - t) << endl; 63 | #else 64 | cout << "ConvSmall\teblearn\t" << J / (time_time() - t) << endl; 65 | #endif 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /eblearn/convnet256.cc: -------------------------------------------------------------------------------- 1 | #include "libeblearn.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | using namespace ebl; // all eblearn objects are under the ebl namespace 7 | 8 | static double time_time() // a time function like time.time() 9 | { 10 | struct timeval tv; 11 | gettimeofday(&tv, 0); 12 | return (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0; 13 | } 14 | 15 | typedef double t_net; 16 | 17 | int main(int argc, char **argv) { // regular main without gui 18 | init_drand(92394); // initialize random seed 19 | 20 | intg n_examples = 20; // maximum training set size: 60000 21 | idxdim dims(1,256,256); // get order and dimensions of sample 22 | 23 | //! create 1-of-n targets with target 1.0 for shown class, -1.0 for the rest 24 | idx targets = create_target_matrix(10, 1.0); 25 | idx inputs(n_examples, 256, 256); 26 | 27 | parameter theparam(6000); // create trainable parameter 28 | lenet5 l5(theparam, 256, 256, 7, 7, 5, 5, 7, 7, 4, 4, 120, 10); 29 | // TODO: use an all-to-all connection table in second layer convolution 30 | // Because that's what the other packages implement. 
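// lenet5 arguments, read by analogy with the 32x32 net in convnet.cc
// (input rows/cols, conv kernel, subsampling, conv kernel, subsampling,
// hidden units, outputs): 256x256 input, 7x7 convolutions, 5x5 then 4x4
// subsampling, 120 hidden units, 10 outputs -- the ConvLarge task.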
31 | supervised_euclidean_machine thenet( 32 | (module_1_1&)l5, 33 | targets, 34 | dims); 35 | supervised_trainer thetrainer(thenet, theparam); 36 | classifier_meter trainmeter, testmeter; 37 | forget_param_linear fgp(1, 0.5); 38 | thenet.forget(fgp); 39 | 40 | // learning parameters 41 | gd_param gdp(/* double leta*/ 0.0001, 42 | /* double ln */ 0.0, 43 | /* double l1 */ 0.0, 44 | /* double l2 */ 0.0, 45 | /* int dtime */ 0, 46 | /* double iner */0.0, 47 | /* double a_v */ 0.0, 48 | /* double a_t */ 0.0, 49 | /* double g_t*/ 0.0); 50 | infer_param infp; 51 | 52 | state_idx dummy_input(1, 256, 256); 53 | double t = time_time(); 54 | for (intg j = 0; j < n_examples; ++j) 55 | { 56 | thetrainer.learn_sample(dummy_input, j%10, gdp); 57 | // TODO: iterate over mock dataset to simulate more realistic 58 | // memaccess pattern 59 | } 60 | #ifdef USED_IPP 61 | cout << "ConvLarge\teblearn{ipp}\t" << n_examples / (time_time() - t) << endl; 62 | #else 63 | cout << "ConvLarge\teblearn\t" << n_examples / (time_time() - t) << endl; 64 | #endif 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /eblearn/convnet96.cc: -------------------------------------------------------------------------------- 1 | #include "libeblearn.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | using namespace ebl; // all eblearn objects are under the ebl namespace 7 | 8 | static double time_time() // a time function like time.time() 9 | { 10 | struct timeval tv; 11 | gettimeofday(&tv, 0); 12 | return (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0; 13 | } 14 | 15 | typedef double t_net; 16 | 17 | int main(int argc, char **argv) { // regular main without gui 18 | init_drand(92394); // initialize random seed 19 | 20 | intg n_examples = 100; // maximum training set size: 60000 21 | idxdim dims(1,96,96); // get order and dimensions of sample 22 | 23 | //! create 1-of-n targets with target 1.0 for shown class, -1.0 for the rest 24 | idx targets = create_target_matrix(10, 1.0); 25 | idx inputs(n_examples, 96, 96); 26 | 27 | parameter theparam(6000); // create trainable parameter 28 | lenet5 l5(theparam, 96, 96, 7, 7, 3, 3, 7, 7, 3, 3, 120, 10); 29 | // TODO: use an all-to-all connection table in second layer convolution 30 | // Because that's what the other packages implement. 
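// lenet5 arguments, read the same way as in convnet.cc: 96x96 input,
// 7x7 convolutions, 3x3 subsampling at both stages, 120 hidden units,
// 10 outputs -- the ConvMed task.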
31 | supervised_euclidean_machine thenet( 32 | (module_1_1&)l5, 33 | targets, 34 | dims); 35 | supervised_trainer thetrainer(thenet, theparam); 36 | classifier_meter trainmeter, testmeter; 37 | forget_param_linear fgp(1, 0.5); 38 | thenet.forget(fgp); 39 | 40 | // learning parameters 41 | gd_param gdp(/* double leta*/ 0.0001, 42 | /* double ln */ 0.0, 43 | /* double l1 */ 0.0, 44 | /* double l2 */ 0.0, 45 | /* int dtime */ 0, 46 | /* double iner */0.0, 47 | /* double a_v */ 0.0, 48 | /* double a_t */ 0.0, 49 | /* double g_t*/ 0.0); 50 | infer_param infp; 51 | 52 | state_idx dummy_input(1, 96, 96); 53 | double t = time_time(); 54 | for (intg j = 0; j < n_examples; ++j) 55 | { 56 | thetrainer.learn_sample(dummy_input, j%10, gdp); 57 | // TODO: iterate over mock dataset to simulate more realistic 58 | // memaccess pattern 59 | } 60 | #ifdef USED_IPP 61 | cout << "ConvMed\teblearn{ipp}\t" << n_examples / (time_time() - t) << endl; 62 | #else 63 | cout << "ConvMed\teblearn\t" << n_examples / (time_time() - t) << endl; 64 | #endif 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /eblearn/mlp.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaberg/DeepLearningBenchmarks/590892c283b768a5b4baec24629bb3647b251434/eblearn/mlp.cc -------------------------------------------------------------------------------- /eblearn/mnist_example.cc: -------------------------------------------------------------------------------- 1 | #include "libeblearn.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | using namespace ebl; // all eblearn objects are under the ebl namespace 7 | 8 | static double time_time() // a time function like time.time() 9 | { 10 | struct timeval tv; 11 | gettimeofday(&tv, 0); 12 | return (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0; 13 | } 14 | 15 | typedef double t_net; 16 | 17 | // argv[1] is expected to contain the directory of the mnist dataset 18 | #ifdef __GUI__ 19 | MAIN_QTHREAD() { // this is the macro replacing main to enable multithreaded gui 20 | #else 21 | int main(int argc, char **argv) { // regular main without gui 22 | #endif 23 | cerr << "* MNIST demo: learning handwritten digits using the eblearn"; 24 | cerr << " C++ library *" << endl; 25 | if (argc != 2) { 26 | cerr << "Usage: ./mnist " << endl; 27 | eblerror("MNIST path not specified"); 28 | } 29 | init_drand(time(NULL)); // initialize random seed 30 | 31 | intg trsize = 10000; // maximum training set size: 60000 32 | intg tesize = 10000; // maximum testing set size: 10000 33 | 34 | //! load MNIST datasets: trize for training set and tesize for testing set 35 | mnist_datasource 36 | train_ds(argv[1], "train", trsize), 37 | test_ds(argv[1], "t10k", tesize); 38 | 39 | //! create 1-of-n targets with target 1.0 for shown class, -1.0 for the rest 40 | idx targets = create_target_matrix(1+idx_max(train_ds.labels), 1.0); 41 | 42 | //! 
create the network weights, network and trainer 43 | cerr << "creating idxdim: " << endl; 44 | idxdim dims(train_ds.sample_dims()); // get order and dimensions of sample 45 | cerr << "creating theparam: " << endl; 46 | parameter theparam(60000); // create trainable parameter 47 | cerr << "creating l5: " << endl; 48 | lenet5 l5(theparam, 32, 32, 5, 5, 2, 2, 5, 5, 2, 2, 120, targets.dim(0)); 49 | //TODO: Consider using net_nn_cscsc directly rather than lenet5 50 | 51 | cerr << "creating thenet: " << endl; 52 | supervised_euclidean_machine thenet((module_1_1&)l5, targets, dims); 53 | cerr << "creating thetrainer: " << endl; 54 | supervised_trainer thetrainer(thenet, theparam); 55 | //supervised_trainer_gui stgui; // the gui to display supervised_trainer 56 | 57 | //! a classifier-meter measures classification errors 58 | classifier_meter trainmeter, testmeter; 59 | 60 | //! initialize the network weights 61 | forget_param_linear fgp(1, 0.5); 62 | thenet.forget(fgp); 63 | 64 | // learning parameters 65 | gd_param gdp(/* double leta*/ 0.0001, 66 | /* double ln */ 0.0, 67 | /* double l1 */ 0.0, 68 | /* double l2 */ 0.0, 69 | /* int dtime */ 0, 70 | /* double iner */0.0, 71 | /* double a_v */ 0.0, 72 | /* double a_t */ 0.0, 73 | /* double g_t*/ 0.0); 74 | infer_param infp; 75 | 76 | int use_hessian = 0; 77 | // estimate second derivative on 100 iterations, using mu=0.02 78 | if (use_hessian) 79 | { 80 | cerr << "Computing second derivatives on MNIST dataset: " << endl; 81 | thetrainer.compute_diaghessian(train_ds, 100, 0.02); 82 | } 83 | 84 | //code borrowd from libeblearn/include/ebl_trainer.hpp 85 | ubyte lab; 86 | thetrainer.init(train_ds, &trainmeter); 87 | // training on lowest size common to all classes (times # classes) 88 | // now do training iterations 89 | //cerr << "... Training network from " << train_ds.get_lowest_common_size() << endl; 90 | double t = time_time(); 91 | train_ds.fprop(*thetrainer.input, thetrainer.label); 92 | lab = thetrainer.label.get(); 93 | //int J = train_ds.get_lowest_common_size(); 94 | int J = 2000; 95 | for (intg j = 0; j < J; ++j) { 96 | //train_ds.fprop(*thetrainer.input, thetrainer.label); 97 | //lab = thetrainer.label.get(); 98 | thetrainer.learn_sample(*thetrainer.input, lab, gdp); 99 | // use energy as distance for samples probabilities to be used 100 | ///// train_ds.set_answer_distance(energy.x.get()); 101 | // log.update(age, output, label.get(), energy); 102 | //train_ds.next_train(); 103 | } 104 | #ifdef __IPP__ 105 | cout << "lenet5\teblearn{ipp}\t" << J / (time_time() - t) << endl; 106 | #else 107 | cout << "lenet5\teblearn\t" << J / (time_time() - t) << endl; 108 | #endif 109 | return 0; 110 | } 111 | 112 | 113 | #if 0 114 | for (int i = 0; i < 100; ++i) { 115 | double t = time_time(); 116 | cerr << "Training... " << endl; 117 | thetrainer.train(train_ds, trainmeter, gdp, 1); // train 118 | cerr << "Training took" << t - time_time() << "seconds" << endl; 119 | cerr << "Testing on train... " << endl; 120 | thetrainer.test(train_ds, trainmeter, infp); // test 121 | cerr << "Testing on test... 
" << endl; 122 | thetrainer.test(test_ds, testmeter, infp); // test 123 | //stgui.display_datasource(thetrainer, test_ds, infp, 10, 10); // display 124 | //stgui.display_internals(thetrainer, test_ds, infp, 2); // display 125 | if (use_hessian) 126 | thetrainer.compute_diaghessian(train_ds, 100, 0.02); // recompute 2nd der 127 | cerr << "Iteration took" << t - time_time() << "seconds" << endl; 128 | } 129 | #endif 130 | -------------------------------------------------------------------------------- /eblearn/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_ipp:$LD_LIBRARY_PATH ./mnist_example_ipp.x /data/lisa/data/mnist 4 | # LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_noipp:$LD_LIBRARY_PATH ./mnist_example_noipp.x /data/lisa/data/mnist 5 | 6 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_ipp:$LD_LIBRARY_PATH ./convnet_ipp.x > ${HOSTNAME}_eblearn_convnet_ipp.bmark 7 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_ipp:$LD_LIBRARY_PATH ./convnet96_ipp.x > ${HOSTNAME}_eblearn_convnet96_ipp.bmark 8 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_ipp:$LD_LIBRARY_PATH ./convnet256_ipp.x > ${HOSTNAME}_eblearn_convnet256_ipp.bmark 9 | 10 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_noipp:$LD_LIBRARY_PATH ./convnet_noipp.x > ${HOSTNAME}_eblearn_convnet.bmark 11 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_noipp:$LD_LIBRARY_PATH ./convnet96_noipp.x > ${HOSTNAME}_eblearn_convnet96.bmark 12 | LD_LIBRARY_PATH=$PUB_PREFIX/eblearn_noipp:$LD_LIBRARY_PATH ./convnet256_noipp.x > ${HOSTNAME}_eblearn_convnet256.bmark 13 | -------------------------------------------------------------------------------- /numpy/aa_numpy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy as N 4 | import sys 5 | import time 6 | 7 | # c: aa.cc 8 | 9 | neg, nout, nhid, niter = [int(a) for a in sys.argv[1:]] 10 | lr = 0.01 11 | 12 | rng = N.random.RandomState(342) 13 | 14 | w = rng.rand(nout, nhid) 15 | a = rng.randn(nhid) * 0.0 16 | b = rng.randn(nout) * 0.0 17 | x = (rng.rand(neg, nout)-0.5) * 1.5 18 | 19 | dot_time = 0.0 20 | 21 | t = time.time() 22 | for i in xrange(niter): 23 | tt = time.time() 24 | d = N.dot(x, w) 25 | dot_time += time.time() - tt 26 | 27 | hid = N.tanh(d + a) 28 | 29 | tt = time.time() 30 | d = N.dot(hid, w.T) 31 | dot_time += time.time() - tt 32 | out = N.tanh(d + b) 33 | 34 | g_out = out - x 35 | err = 0.5 * N.sum(g_out**2) 36 | 37 | g_hidwt = g_out * (1.0 - out**2) 38 | 39 | b -= lr * N.sum(g_hidwt, axis=0) 40 | 41 | tt = time.time() 42 | g_hid = N.dot(g_hidwt, w) 43 | dot_time += time.time() - tt 44 | 45 | g_hidin = g_hid * (1.0 - hid**2) 46 | 47 | tt = time.time() 48 | d = N.dot(g_hidwt.T, hid) 49 | dd = N.dot(x.T, g_hidin) 50 | dot_time += time.time() - tt 51 | 52 | gw = (d + dd) 53 | w -= lr * gw 54 | 55 | a -= lr * N.sum(g_hidin, axis=0) 56 | 57 | total_time = time.time() - t 58 | print 'time: ',total_time, 'err: ', err 59 | print ' of which', dot_time, 'was spent on dot. 
Fraction:', dot_time / total_time 60 | 61 | -------------------------------------------------------------------------------- /numpy/logreg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy as N 4 | import sys 5 | import time 6 | 7 | # c: aa.cc 8 | 9 | nin, nout, batchsize, niter = [int(a) for a in sys.argv[1:]] 10 | lr = 0.01 11 | 12 | rng = N.random.RandomState(342) 13 | 14 | # declare data 15 | x = (rng.rand(batchsize*niter, nin)-0.5) * 1.5 16 | y = (rng.rand(batchsize*niter, nout)-0.5) * 1.5 17 | 18 | # declare model weights 19 | w = rng.rand(nin, nout) 20 | b = rng.randn(nout) * 0.0 21 | 22 | t = time.time() 23 | for i in xrange(niter): 24 | x_i = x[i*batchsize:(i+1)*batchsize] 25 | y_i = y[i*batchsize:(i+1)*batchsize] 26 | 27 | hidin = N.dot(x_i, w) + b 28 | 29 | hidout = (N.tanh(hidin)+1)/2.0 # sigmoid 30 | 31 | g_hidout = hidout - y_i 32 | err = 0.5 * N.sum(g_hidout**2) 33 | 34 | g_hidin = g_hidout * hidout * (1.0 - hidout) 35 | 36 | b -= lr * N.sum(g_hidin, axis=0) 37 | w -= lr * N.dot(x_i.T, g_hidin) 38 | 39 | total_time = time.time() - t 40 | print 'mlp_%i_%i\tnumpy{%i}\t%.2f' %( 41 | nin, nout, batchsize, niter*batchsize/total_time) 42 | 43 | -------------------------------------------------------------------------------- /numpy/mlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy as N 4 | import sys 5 | import time 6 | 7 | # c: aa.cc 8 | 9 | nin, nhid, nout, batchsize, niter = [int(a) for a in sys.argv[1:]] 10 | lr = 0.01 11 | 12 | rng = N.random.RandomState(342) 13 | 14 | # declare data 15 | x = (rng.rand(batchsize*niter, nin)-0.5) * 1.5 16 | y = (rng.rand(batchsize*niter, nout)-0.5) * 1.5 17 | 18 | # declare model weights 19 | w = rng.rand(nin, nhid) 20 | b = rng.randn(nhid) * 0.0 21 | v = rng.rand(nhid, nout) 22 | c = rng.randn(nout) * 0.0 23 | 24 | t = time.time() 25 | for i in xrange(niter): 26 | x_i = x[i*batchsize:(i+1)*batchsize] 27 | y_i = y[i*batchsize:(i+1)*batchsize] 28 | 29 | hidin = N.dot(x_i, w) + b 30 | 31 | hidout = N.tanh(hidin) 32 | 33 | outin = N.dot(hidout, v) + c 34 | outout = (N.tanh(outin)+1)/2.0 35 | 36 | g_outout = outout - y_i 37 | err = 0.5 * N.sum(g_outout**2) 38 | 39 | g_outin = g_outout * outout * (1.0 - outout) 40 | 41 | g_hidout = N.dot(g_outin, v.T) 42 | g_hidin = g_hidout * (1 - hidout**2) 43 | 44 | b -= lr * N.sum(g_hidin, axis=0) 45 | c -= lr * N.sum(g_outin, axis=0) 46 | w -= lr * N.dot(x_i.T, g_hidin) 47 | v -= lr * N.dot(hidout.T, g_outin) 48 | 49 | total_time = time.time() - t 50 | print 'mlp_%i_%i_%i\tnumpy{%i}\t%.2f' %( 51 | nin, nhid, nout, batchsize, niter*batchsize/total_time) 52 | 53 | -------------------------------------------------------------------------------- /numpy/rbm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy as N 4 | import sys 5 | import time 6 | 7 | # c: aa.cc 8 | 9 | nin, nout, batchsize, niter = [int(a) for a in sys.argv[1:]] 10 | lr = 0.01 11 | 12 | rng = N.random.RandomState(342) 13 | 14 | # declare data 15 | x = (rng.rand(batchsize*niter, nin)-0.5) * 1.5 16 | 17 | # declare model weights 18 | a = rng.randn(nin) * 0.0 19 | w = rng.rand(nin, nout) 20 | b = rng.randn(nout) * 0.0 21 | 22 | def sigm(x): return (N.tanh(x)+1)/2 23 | 24 | def 
bern(x): return N.random.binomial(p=x,n=1) 25 | 26 | t = time.time() 27 | for i in xrange(niter): 28 | pos_vis = x[i*batchsize:(i+1)*batchsize] 29 | 30 | pos_hid = sigm(N.dot(pos_vis, w)+b) 31 | 32 | neg_vis = sigm(N.dot(bern(pos_hid), w.T)+a) 33 | 34 | neg_hid = sigm(N.dot(bern(neg_vis), w) + b) 35 | 36 | a += lr * N.sum(pos_vis - neg_vis, axis=0) 37 | b -= lr * N.sum(pos_hid - neg_hid, axis=0) 38 | w -= lr * (N.dot(pos_vis.T, pos_hid) - N.dot(neg_vis.T, neg_hid)) 39 | 40 | total_time = time.time() - t 41 | print 'cd1 rbm_bernoulli %i_%i\tnumpy{%i}\t%.2f' %( 42 | nin, nout, batchsize, niter*batchsize/total_time) 43 | 44 | 45 | -------------------------------------------------------------------------------- /numpy/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python mlp.py 784 500 10 1 1000 > ${HOSTNAME}_mlp_1.bmark 4 | python mlp.py 784 500 10 60 100 > ${HOSTNAME}_mlp_60.bmark 5 | 6 | python logreg.py 784 10 1 1000 > ${HOSTNAME}_lr_784_1.bmark 7 | python logreg.py 784 10 60 100 > ${HOSTNAME}_lr_784_60.bmark 8 | python logreg.py 32 10 1 1000 > ${HOSTNAME}_lr_32_1.bmark 9 | python logreg.py 32 10 60 100 > ${HOSTNAME}_lr_32_60.bmark 10 | python rbm.py 1024 1024 1 100 > ${HOSTNAME}_rbm_1.bmark 11 | python rbm.py 1024 1024 60 20 > ${HOSTNAME}_rbm_60.bmark 12 | -------------------------------------------------------------------------------- /reports/ascii.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import cPickle 5 | 6 | if __name__ == '__main__': 7 | assert sys.argv[1] == '--db' 8 | db = cPickle.load(open(sys.argv[2])) 9 | for entry in db: 10 | print entry 11 | 12 | -------------------------------------------------------------------------------- /reports/build_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | 6 | def build_results(path='.'): 7 | results = {} # map task -> impl -> time 8 | 9 | for root, dirs, files in os.walk(path): 10 | for bmark in [f for f in files if f.endswith('.bmark')]: 11 | for line in open(os.path.join(root,bmark)): 12 | if not line or line == "\n": 13 | continue 14 | try: 15 | task, impl, t = line[:-1].split('\t')[:3] 16 | except: 17 | print >> sys.stderr, "PARSE ERR:", line 18 | continue 19 | 20 | if task.startswith('#'): 21 | print >> sys.stderr, "Skipping", task, impl, t 22 | else: 23 | results.setdefault(task, {})[impl] = float(t) 24 | return results 25 | 26 | if __name__ == '__main__': 27 | r = build_results(sys.argv[1]) 28 | 29 | for k in r: 30 | for i in r[k]: 31 | print '%s\t%s\t%f' % (k,i,r[k][i]) 32 | 33 | if 0: 34 | 35 | keys = r.keys() 36 | keys.sort() 37 | 38 | for k in keys: 39 | v = r[k] 40 | print k 41 | r_k = [(v[i],i) for i in v] 42 | r_k.sort() 43 | r_k.reverse() 44 | for t, i in r_k: 45 | print " %10.2f - %s" %(t, i) 46 | print '' 47 | -------------------------------------------------------------------------------- /reports/show_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # a bar plot with errorbars 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import sys 6 | from pylab import * 7 | 8 | def rcolor(): 9 | return tuple(np.random.rand(3)) 10 | 11 | from build_csv import build_results 12 | 13 | results = build_results(sys.argv[1]) # dict task -> impl -> time 14 | 15 | n_tasks = len(results) 16 | 17 | 
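# results maps task -> implementation -> examples/second; gather the union of
# implementation names across all tasks so each one gets a bar in the grouped
# plot (task/implementation pairs without a result are drawn with height 0).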
tasks = results.keys() 18 | impls = set() 19 | for k, v in results.items(): 20 | impls.update(v.keys()) 21 | 22 | print tasks 23 | print impls 24 | fig = plt.figure() 25 | ax = fig.add_subplot(111) 26 | ind = np.arange(n_tasks) # the x locations for the groups 27 | width = 0.10 # the width of the bars 28 | 29 | rects = [] 30 | for i, impl in enumerate(impls): 31 | means = [] 32 | std = [] 33 | for t in tasks: 34 | std.append(0) 35 | try: 36 | means.append(1.0/results[t][impl]) 37 | except KeyError: 38 | means.append(0) 39 | rects.append(ax.bar(ind+i*width, means, width, color=rcolor(), log=True)) #, color='r', yerr=menStd) 40 | print "adding rect for", impl, means 41 | 42 | #womenMeans = (25, 32, 34, 20, 25) 43 | #womenStd = (3, 5, 2, 3, 3) 44 | #rects2 = ax.bar(ind+width, womenMeans, width, color='y', yerr=womenStd) 45 | 46 | # add some 47 | ax.set_ylabel('Examples / seconds') 48 | #ax.set_title('Scores by group and gender') 49 | ax.set_xticks(ind+width*len(impls)/2.0) 50 | print 'tasks', tasks 51 | ax.set_xticklabels( [t[:12] for t in tasks] ) 52 | print 'gca', gca().get_xticklabels() 53 | setp(gca().get_xticklabels(), rotation=30, fontsize=10) 54 | 55 | ax.legend( [r[0] for r in rects], impls, 'upper left' ) 56 | 57 | #def autolabel(rects): 58 | # attach some text labels 59 | # for rect in rects: 60 | # height = rect.get_height() 61 | # ax.text(rect.get_x()+rect.get_width()/2., 1.05*height, '%d'%int(height), 62 | # ha='center', va='bottom') 63 | #autolabel(rects1) 64 | #autolabel(rects2) 65 | 66 | subplots_adjust(left=.09, bottom=.14, right=.97, top=.95) 67 | 68 | savefig('blah.pdf') 69 | 70 | -------------------------------------------------------------------------------- /reports/task_pdfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # a bar plot with errorbars 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import sys 6 | from pylab import * 7 | 8 | def rcolor(): 9 | return 'b' 10 | return tuple(np.random.rand(3)) 11 | 12 | from build_csv import build_results 13 | 14 | results = build_results(sys.argv[1]) # dict task -> impl -> time 15 | 16 | 17 | for task in results: 18 | print task 19 | 20 | fig = plt.figure() 21 | ax = fig.add_subplot(111) 22 | width = 0.30 # the width of the bars 23 | 24 | scores = [(s,i) for (i,s) in results[task].items()] 25 | scores.sort() 26 | 27 | scores = scores[-8:] 28 | 29 | rect = ax.barh(-.15+np.arange(len(scores)), 30 | [s for (s,i) in scores], 31 | 0.3, # width 32 | color='b', 33 | log=False) 34 | 35 | # add some 36 | ax.set_title('Preliminary Benchmark Results: %s'% task) 37 | ax.set_yticklabels(['']+[i for (s,i) in scores], minor=True) 38 | #ax.set_ylabel('Training Speed (examples/sec)') 39 | #ax.set_xticks(np.arange(len(scores)), minor=False) 40 | #ax.set_xticklabels([i[:3] for (s,i) in scores])#, minor=True 41 | #setp(ax.get_xmajorticklabels(), rotation=90, fontsize=10) 42 | 43 | subplots_adjust(left=.29, bottom=.14, right=.97, top=.91) 44 | 45 | savefig('%s.pdf'%task) 46 | 47 | -------------------------------------------------------------------------------- /theano/aa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy 4 | import sys 5 | import time 6 | 7 | import theano 8 | import theano.tensor as T 9 | import theano.sandbox 10 | import theano.sandbox.wraplinker 11 | from theano.compile import module, Mode 12 | from theano.sandbox.wraplinker 
import ProfileMode 13 | from theano import gof, Op, Apply 14 | 15 | from theano.tensor import blas, opt 16 | 17 | # numpy: aa_numpy.py 18 | # c : aa.cc 19 | 20 | 21 | if 0: 22 | class Opt(object): 23 | merge = theano.gof.MergeOptimizer() 24 | gemm_opt_1 = theano.gof.TopoOptimizer(theano.tensor_opt.gemm_pattern_1) 25 | 26 | gemm_opt_2 = theano.gof.TopoOptimizer( # d -= a * (dot()+transpose(dot)) 27 | theano.gof.PatternSub( 28 | ( 29 | T.sub_inplace, 30 | 'd', 31 | ( 32 | T.mul, 33 | dict(pattern = (T.DimShuffle((), ['x', 'x'], inplace = True), 'a'), 34 | allow_multiple_clients = True), 35 | ( 36 | T.add, 37 | (T.dot, 'b', 'c'), 38 | (T.transpose_inplace, (T.dot, 'f', 'g')) 39 | ) 40 | ) 41 | ), 42 | ( 43 | T.gemm, 44 | ( 45 | T.gemm, 46 | 'd', 47 | (T.neg, 'a'), 48 | (T.transpose_inplace, 'g'), 49 | (T.transpose_inplace, 'f'), 50 | T.constant(1.0) 51 | ), 52 | (T.neg, 'a'), 53 | 'b', 54 | 'c', 55 | T.constant(1.0) 56 | ), 57 | allow_multiple_clients = False)) 58 | 59 | sqr = [] 60 | sqr.append( theano.gof.TopoOptimizer( 61 | theano.gof.PatternSub( 62 | (T.mul,'x', 'x'), 63 | (T.sqr, 'x'), allow_multiple_clients=True))) 64 | sqr.append(theano.gof.TopoOptimizer( 65 | theano.gof.PatternSub( 66 | (T.pow, 'x', (T.DimShuffle((), ['x', 'x'], inplace=True), T.constant(2))), 67 | (T.sqr, 'x'), allow_multiple_clients=True))) 68 | 69 | ident_opt_list = [] 70 | ident_opt_list.append( # remove explicit copies 71 | theano.gof.TopoOptimizer( 72 | theano.gof.PatternSub( 73 | (T.tensor_copy, 'x'), 74 | 'x', 75 | allow_multiple_clients=True))) 76 | ident_opt_list.append( # remove double-transpose 77 | theano.gof.TopoOptimizer( 78 | theano.gof.PatternSub( 79 | (T.transpose_inplace, (T.transpose_inplace, 'x')), 80 | 'x', 81 | allow_multiple_clients=True))) 82 | 83 | ident_opt_list.append( 84 | theano.gof.TopoOptimizer( 85 | theano.gof.PatternSub( 86 | (T.sqr, (T.sqrt,'x')), 87 | 'x', 88 | allow_multiple_clients=True))) 89 | ident_opt_list.append( 90 | theano.gof.TopoOptimizer( 91 | theano.gof.PatternSub( 92 | (T.sqrt, (T.sqr,'x')), 93 | 'x', 94 | allow_multiple_clients=True))) 95 | ident_opt_list.append( 96 | theano.gof.TopoOptimizer( 97 | theano.gof.PatternSub( 98 | (T.mul, 'x', (T.div,'y', 'x')), 99 | 'y', 100 | allow_multiple_clients=True))) 101 | 102 | ident_opt_list.append( 103 | theano.gof.TopoOptimizer( 104 | theano.gof.PatternSub( 105 | (T.mul, (T.div,'y', 'x'), 'x'), 106 | 'y', 107 | allow_multiple_clients=True))) 108 | 109 | ident_opt_list.append( 110 | theano.gof.TopoOptimizer( 111 | theano.gof.PatternSub( 112 | (T.div, (T.mul,'y', 'x'), 'x'), 113 | 'y', 114 | allow_multiple_clients=True))) 115 | 116 | ident_opt_list.append( 117 | theano.gof.TopoOptimizer( 118 | theano.gof.PatternSub( 119 | (T.div, (T.mul,'y', 'x'), 'y'), 120 | 'x', 121 | allow_multiple_clients=True))) 122 | 123 | def __call__(self, env): 124 | self.merge(env) 125 | #eliminate identities 126 | if 0: 127 | print 'SKIPPING optimizations' 128 | else: 129 | 130 | for opt in self.ident_opt_list: 131 | opt(env) 132 | 133 | for opt in self.sqr: 134 | opt(env) 135 | 136 | self.gemm_opt_1(env) 137 | self.gemm_opt_2(env) 138 | 139 | self.merge(env) 140 | 141 | def print_graph_linker(print_prog=True): 142 | if 1: 143 | imap = {None:'-'} 144 | def blah(i, node, thunk): 145 | imap[node] = str(i) 146 | if print_prog:# and node.op.__class__ is T.DimShuffle: 147 | if False and node.op == T.DimShuffle((), ['x', 'x'], inplace = True): 148 | print node.op == T.DimShuffle((), ['x', 'x'], inplace = True), 149 | print node.inputs[0], 
type(node.inputs[0]), 150 | print node.inputs[0].equals(T.constant(2)), 151 | outputs = node.outputs 152 | inputs = theano.gof.graph.inputs(outputs) 153 | print 'node ', i, node, 154 | print ':'.join([imap[inp.owner] for inp in node.inputs]) 155 | #print theano.sandbox.pprint.pp.process_graph(inputs, outputs) 156 | return theano.sandbox.wraplinker.WrapLinkerMany( 157 | [theano.gof.OpWiseCLinker()], 158 | [theano.sandbox.wraplinker.run_all 159 | ,blah 160 | #,theano.sandbox.wraplinker.numpy_notall_isfinite 161 | ]) 162 | else: 163 | return theano.gof.OpWiseCLinker() 164 | 165 | 166 | class M(module.Module): 167 | def __init__(self): 168 | super(M, self).__init__() 169 | 170 | x = T.matrix('x') # input, target 171 | self.w = module.Member(T.matrix('w')) # weights 172 | self.a = module.Member(T.vector('a')) # hid bias 173 | self.b = module.Member(T.vector('b')) # output bias 174 | 175 | self.hid = T.tanh(T.dot(x, self.w) + self.a) 176 | hid = self.hid 177 | 178 | self.out = T.tanh(T.dot(hid, self.w.T) + self.b) 179 | out = self.out 180 | 181 | self.err = 0.5 * T.sum((out - x)**2) 182 | err = self.err 183 | 184 | params = [self.w, self.a, self.b] 185 | 186 | gparams = T.grad(err, params) 187 | 188 | updates = [(p, p - 0.01 * gp) for p, gp in zip(params, gparams)] 189 | 190 | self.step = module.Method([x], err, updates=dict(updates)) 191 | 192 | mod = M() 193 | mode = 'FAST_RUN' 194 | #mode = ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker()) 195 | mode = Mode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker(nice_errors=True)) 196 | mode = Mode(optimizer='fast_run', linker='c') 197 | mode = Mode(optimizer='fast_run', linker='c|py') 198 | print mod.pretty(mode=mode) 199 | m = mod.make(mode=mode) 200 | 201 | neg, nout, nhid, niter = [int(a) for a in sys.argv[1:]] 202 | rng = numpy.random.RandomState(342) 203 | m.w = rng.rand(nout, nhid) 204 | m.a = rng.randn(nhid) * 0.0 205 | m.b = rng.randn(nout) * 0.0 206 | 207 | x = (rng.rand(neg, nout)-0.5) * 1.5 208 | 209 | t = time.time() 210 | for i in xrange(niter): 211 | err = m.step(x) 212 | print 'time: ',time.time() - t, 'err: ', err 213 | try: 214 | mode.print_summary() 215 | pass 216 | except: 217 | pass 218 | 219 | 220 | -------------------------------------------------------------------------------- /theano/control.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from theano.misc.check_blas import execute 3 | 4 | sizes = [500, 1000, 1500, 2000, 2500] 5 | iters = 10 6 | 7 | for order in ['c', 'f']: 8 | for size in sizes: 9 | t = execute(verbose=False, M=size, N=size, K=size, iters=iters)[0] 10 | print "gemm theano{order_%s/%s/%d/%d}" % ( 11 | order, theano.config.floatX, iters, size), t 12 | -------------------------------------------------------------------------------- /theano/convnet.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import time 3 | 4 | import numpy 5 | from numpy import asarray, random 6 | 7 | from theano.tensor import lscalar, tanh, dot, grad, log, arange 8 | from theano.tensor.nnet import softmax 9 | from theano.tensor.nnet.conv import conv2d 10 | from theano.tensor.signal.downsample import max_pool_2d 11 | from theano import shared, function, config 12 | 13 | random.seed(2344) 14 | 15 | 16 | def rand(*size): 17 | return asarray(random.rand(*size), dtype=config.floatX) 18 | 19 | 20 | def randn(*size): 21 | return asarray(random.randn(*size), dtype=config.floatX) 22 | 23 | 24 | def 
randint(size, high): 25 | return asarray(random.randint(size=size, low=0, high=high), dtype='int32') 26 | 27 | 28 | def zeros(*size): 29 | return numpy.zeros(size, dtype=config.floatX) 30 | 31 | 32 | n_examples = 1000 33 | outputs = 10 34 | lr = numpy.asarray(0.01, dtype=config.floatX) 35 | 36 | data_x = shared(randn(n_examples, 1, 32, 32)) 37 | data_y = shared(randint((n_examples,), outputs)) 38 | 39 | si = lscalar() 40 | nsi = lscalar() 41 | sx = data_x[si:si + nsi] 42 | sy = data_y[si:si + nsi] 43 | 44 | bmark = open("%s_convnet_%s_%s.bmark" % (socket.gethostname(), 45 | config.device, config.floatX), 'w') 46 | 47 | if config.floatX == 'float32': 48 | prec = 'float' 49 | else: 50 | prec = 'double' 51 | 52 | 53 | def reportmodel(model, batchsize, v): 54 | bmark.write("%s\t" % model) 55 | bmark.write("theano{%s/%s/%i}\t" % ( 56 | config.device[0], prec, batchsize)) 57 | bmark.write("%.2f\n" % v) 58 | 59 | 60 | def eval_and_report(train, name, batchsizes, N=n_examples): 61 | for bs in batchsizes: 62 | assert N % bs == 0 # can't be cheatin now... 63 | t = time.time() 64 | for i in xrange(N / bs): 65 | cost = train(i * bs, bs) 66 | if not (i % (1000 / bs)): 67 | print i * bs, cost 68 | reportmodel(name, bs, N / (time.time() - t)) 69 | 70 | 71 | def bench_ConvSmall(batchsize): 72 | data_x.set_value(randn(n_examples, 1, 32, 32)) 73 | w0 = shared(rand(6, 1, 5, 5) * numpy.sqrt(6 / (25.))) 74 | b0 = shared(zeros(6)) 75 | w1 = shared(rand(16, 6, 5, 5) * numpy.sqrt(6 / (25.))) 76 | b1 = shared(zeros(16)) 77 | vv = shared(rand(16 * 5 * 5, 120) * numpy.sqrt(6.0 / 16. / 25)) 78 | cc = shared(zeros(120)) 79 | v = shared(zeros(120, outputs)) 80 | c = shared(zeros(outputs)) 81 | params = [w0, b0, w1, b1, v, c, vv, cc] 82 | 83 | c0 = tanh(conv2d(sx, w0, image_shape=(batchsize, 1, 32, 32), 84 | filter_shape=(6, 1, 5, 5)) + b0.dimshuffle(0, 'x', 'x')) 85 | # this is not the correct leNet5 model, but it's closer to 86 | s0 = tanh(max_pool_2d(c0, (2, 2))) 87 | 88 | c1 = tanh(conv2d(s0, w1, image_shape=(batchsize, 6, 14, 14), 89 | filter_shape=(16, 6, 5, 5)) + 90 | b1.dimshuffle(0, 'x', 'x')) 91 | s1 = tanh(max_pool_2d(c1, (2, 2))) 92 | 93 | p_y_given_x = softmax(dot(tanh(dot(s1.flatten(2), vv) + cc), v) + c) 94 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 95 | cost = nll.mean() 96 | 97 | gparams = grad(cost, params) 98 | 99 | train = function([si, nsi], cost, 100 | updates=[(p, p - lr * gp) for p, gp in zip(params, gparams)]) 101 | 102 | eval_and_report(train, "ConvSmall", [batchsize], N=600) 103 | 104 | 105 | def bench_ConvMed(batchsize): 106 | data_x.set_value(randn(n_examples, 1, 96, 96)) 107 | w0 = shared(rand(6, 1, 7, 7) * numpy.sqrt(6 / (25.))) 108 | b0 = shared(zeros(6)) 109 | w1 = shared(rand(16, 6, 7, 7) * numpy.sqrt(6 / (25.))) 110 | b1 = shared(zeros(16)) 111 | vv = shared(rand(16 * 8 * 8, 120) * numpy.sqrt(6.0 / 16. 
/ 25)) 112 | cc = shared(zeros(120)) 113 | v = shared(zeros(120, outputs)) 114 | c = shared(zeros(outputs)) 115 | params = [w0, b0, w1, b1, v, c, vv, cc] 116 | 117 | c0 = tanh(conv2d(sx, w0, image_shape=(batchsize, 1, 96, 96), 118 | filter_shape=(6, 1, 7, 7)) + b0.dimshuffle(0, 'x', 'x')) 119 | # this is not the correct leNet5 model, but it's closer to 120 | s0 = tanh(max_pool_2d(c0, (3, 3))) 121 | 122 | c1 = tanh(conv2d(s0, w1, image_shape=(batchsize, 6, 30, 30), 123 | filter_shape=(16, 6, 7, 7)) + b1.dimshuffle(0, 'x', 'x')) 124 | s1 = tanh(max_pool_2d(c1, (3, 3))) 125 | 126 | p_y_given_x = softmax(dot(tanh(dot(s1.flatten(2), vv) + cc), v) + c) 127 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 128 | cost = nll.mean() 129 | 130 | gparams = grad(cost, params) 131 | 132 | train = function([si, nsi], cost, 133 | updates=[(p, p - lr * gp) for p, gp in zip(params, gparams)]) 134 | eval_and_report(train, "ConvMed", [batchsize], N=120) 135 | 136 | 137 | def bench_ConvLarge(batchsize): 138 | data_x.set_value(randn(n_examples, 1, 256, 256)) 139 | w0 = shared(rand(6, 1, 7, 7) * numpy.sqrt(6 / (25.))) 140 | b0 = shared(zeros(6)) 141 | w1 = shared(rand(16, 6, 7, 7) * numpy.sqrt(6 / (25.))) 142 | b1 = shared(zeros(16)) 143 | vv = shared(rand(16 * 11 * 11, 120) * numpy.sqrt(6.0 / 16. / 25)) 144 | cc = shared(zeros(120)) 145 | v = shared(zeros(120, outputs)) 146 | c = shared(zeros(outputs)) 147 | params = [w0, b0, w1, b1, v, c, vv, cc] 148 | 149 | c0 = tanh(conv2d(sx, w0, image_shape=(batchsize, 1, 256, 256), 150 | filter_shape=(6, 1, 7, 7)) + b0.dimshuffle(0, 'x', 'x')) 151 | # this is not the correct leNet5 model, but it's closer to 152 | s0 = tanh(max_pool_2d(c0, (5, 5))) 153 | 154 | c1 = tanh(conv2d(s0, w1, image_shape=(batchsize, 6, 50, 50), 155 | filter_shape=(16, 6, 7, 7)) + b1.dimshuffle(0, 'x', 'x')) 156 | s1 = tanh(max_pool_2d(c1, (4, 4))) 157 | 158 | p_y_given_x = softmax(dot(tanh(dot(s1.flatten(2), vv) + cc), v) + c) 159 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 160 | cost = nll.mean() 161 | 162 | gparams = grad(cost, params) 163 | 164 | train = function([si, nsi], cost, 165 | updates=[(p, p - lr * gp) for p, gp in zip(params, gparams)]) 166 | eval_and_report(train, "ConvLarge", [batchsize], N=120) 167 | 168 | if __name__ == '__main__': 169 | bench_ConvSmall(1) 170 | bench_ConvSmall(60) 171 | bench_ConvMed(1) 172 | bench_ConvMed(60) 173 | bench_ConvLarge(1) 174 | bench_ConvLarge(60) 175 | -------------------------------------------------------------------------------- /theano/mlp.py: -------------------------------------------------------------------------------- 1 | import time, socket 2 | from theano.tensor import lscalar, lvector, matrix, tanh, dot, grad, log, arange 3 | from theano.tensor.nnet import softmax, crossentropy_softmax_argmax_1hot_with_bias 4 | from theano import shared, function, config 5 | import numpy, theano 6 | from numpy import asarray, random 7 | random.seed(2344) 8 | 9 | import theano.tensor.blas_c 10 | 11 | def rand(*size): 12 | return asarray(random.rand(*size), dtype=config.floatX) 13 | def randn(*size): 14 | return asarray(random.randn(*size), dtype=config.floatX) 15 | def randint(size, high): 16 | return asarray(random.randint(size=size, low=0, high=high), dtype='int32') 17 | def zeros(*size): 18 | return numpy.zeros(size, dtype=config.floatX) 19 | 20 | n_examples=6000 21 | inputs=784 22 | outputs=10 23 | lr=numpy.asarray(0.01, dtype=config.floatX) 24 | 25 | batchsize=60 26 | 27 | data_x = shared(randn(n_examples, inputs)) 28 | data_y = 
shared(randint((n_examples,), outputs)) 29 | 30 | si = lscalar() 31 | nsi = lscalar() 32 | sx = data_x[si:si + nsi] 33 | sy = data_y[si:si + nsi] 34 | 35 | bmark = open("%smlp_%s_%s.bmark" %( 36 | socket.gethostname(), 37 | config.device, 38 | config.floatX), 39 | 'w') 40 | 41 | def reportmodel(model, batchsize, t): 42 | bmark.write("%s\t" % model) 43 | if config.floatX == 'float32': 44 | prec = 'float' 45 | else: 46 | prec = 'double' 47 | bmark.write("theano{%s/%s/%i}\t" % ( 48 | config.device[0], prec, batchsize)) 49 | bmark.write("%.2f\n"%(n_examples/t)) # report examples / second 50 | 51 | def eval_and_report(train, name): 52 | if 1: 53 | t = time.time() 54 | for i in xrange(n_examples): 55 | train(i, 1) 56 | reportmodel(name, 1, time.time()-t) 57 | 58 | if 0:# repeat w batchsize 59 | t = time.time() 60 | for i in xrange(n_examples/batchsize): 61 | cost = train(i*batchsize, batchsize) 62 | if not (i % 20): 63 | print i*batchsize, cost 64 | reportmodel(name, batchsize, time.time()-t) 65 | 66 | 67 | def online_mlp_784_10(): 68 | v = shared(zeros(outputs, inputs)) 69 | c = shared(zeros(outputs)) 70 | si = shared(0) # current training example index 71 | sx = data_x[si] 72 | sy = data_y[si] 73 | 74 | nll, p_y_given_x, _argmax = crossentropy_softmax_argmax_1hot_with_bias( 75 | dot(sx, v.T).dimshuffle('x', 0), 76 | c, 77 | sy.dimshuffle('x')) 78 | cost = nll.mean() 79 | gv, gc = grad(cost, [v, c]) 80 | train = function([], [], 81 | updates={ 82 | v:v - lr * gv, 83 | c:c - lr * gc, 84 | si: (si + 1) % n_examples}) 85 | theano.printing.debugprint(train, file=open('foo_train', 'wb')) 86 | t = time.time() 87 | train.fn(n_calls=n_examples) 88 | dt = time.time() - t 89 | try: 90 | train.fn.update_profile(train.profile) 91 | except AttributeError: 92 | pass 93 | reportmodel('mlp_784_10_hack', 1, dt) 94 | if 1: 95 | t = time.time() 96 | for i in xrange(n_examples): 97 | train() 98 | dt = time.time() - t 99 | reportmodel('mlp_784_10_hack2', 1, dt) 100 | if 1: 101 | t = time.time() 102 | fn = train.fn 103 | for i in xrange(n_examples): fn() 104 | dt = time.time() - t 105 | reportmodel('mlp_784_10_hack3', 1, dt) 106 | 107 | def online_mlp_784_500_10(): 108 | HUs=500 109 | w = shared(rand(HUs, inputs) * numpy.sqrt(6 / (inputs + HUs))) 110 | b = shared(zeros(HUs)) 111 | v = shared(zeros(outputs,HUs)) 112 | c = shared(zeros(outputs)) 113 | si = shared(0) # current training example index 114 | sx = data_x[si] 115 | sy = data_y[si] 116 | 117 | nll, p_y_given_x, _argmax = crossentropy_softmax_argmax_1hot_with_bias( 118 | dot(tanh(dot(sx, w.T)+b), v.T).dimshuffle('x', 0), 119 | c, 120 | sy.dimshuffle('x')) 121 | cost = nll.mean() 122 | gw, gb, gv, gc = grad(cost, [w, b, v, c]) 123 | train = function([], [], 124 | updates={ 125 | w:w - lr * gw, 126 | b:b - lr * gb, 127 | v:v - lr * gv, 128 | c:c - lr * gc, 129 | si: (si + 1) % n_examples}) 130 | theano.printing.debugprint(train, file=open('foo_train', 'wb')) 131 | t = time.time() 132 | train.fn(n_calls=n_examples) 133 | dt = time.time() - t 134 | try: 135 | train.fn.update_profile(train.profile) 136 | except AttributeError: 137 | pass 138 | reportmodel('mlp_784_500_10_hack', 1, dt) 139 | 140 | def online_mlp_784_1000_1000_1000_10(): 141 | w0 = shared(rand(inputs, 1000) * numpy.sqrt(6 / (inputs + 1000))) 142 | b0 = shared(zeros(1000)) 143 | w1 = shared(rand(1000, 1000) * numpy.sqrt(6 / (1000+1000))) 144 | b1 = shared(zeros(1000)) 145 | w2 = shared(rand(1000, 1000) * numpy.sqrt(6 / (1000+1000))) 146 | b2 = shared(zeros(1000)) 147 | v = shared(zeros(1000, 
outputs)) 148 | c = shared(zeros(outputs)) 149 | params=[w0,b0,w1,b1,w2,b2,v,c] 150 | 151 | si = shared(0) # current training example index 152 | sx = data_x[si] 153 | sy = data_y[si] 154 | h0 = tanh(dot(sx, w0)+b0) 155 | h1 = tanh(dot(h0, w1)+b1) 156 | h2 = tanh(dot(h1, w2)+b2) 157 | 158 | nll, p_y_given_x, _argmax = crossentropy_softmax_argmax_1hot_with_bias( 159 | dot(h2, v).dimshuffle('x', 0), 160 | c, 161 | sy.dimshuffle('x')) 162 | cost = nll.mean() 163 | gparams = grad(cost, params) 164 | updates = [(p,p-lr*gp) for p,gp in zip(params, gparams)] 165 | updates += [(si, (si + 1) % n_examples)] 166 | train = function([], [], updates=updates) 167 | theano.printing.debugprint(train, file=open('foo_train', 'wb')) 168 | t = time.time() 169 | train.fn(n_calls=n_examples) 170 | dt = time.time() - t 171 | try: 172 | train.fn.update_profile(train.profile) 173 | except AttributeError: 174 | pass 175 | reportmodel('mlp_784_1000_1000_1000_10_hack', 1, dt) 176 | 177 | def bench_logreg(): 178 | v = shared(zeros(outputs, inputs)) 179 | c = shared(zeros(outputs)) 180 | # 181 | # Note on the transposed-ness of v for some reason, this data layout is faster than the 182 | # non-transposed orientation. 183 | # The change doesn't make much difference in the deeper models, 184 | # but in this case it was more than twice as fast. 185 | # 186 | 187 | p_y_given_x = softmax(dot(sx, v.T) + c) 188 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 189 | cost = nll.mean() 190 | 191 | gv, gc = grad(cost, [v, c]) 192 | 193 | theano.printing.debugprint(grad(cost, [v, c]), file=open('foo', 'wb')) 194 | train = function([si, nsi], [], 195 | updates={ v:v - lr * gv, c:c - lr * gc }) 196 | theano.printing.debugprint(train, file=open('foo_train', 'wb')) 197 | 198 | eval_and_report(train, "mlp_784_10") 199 | print v.get_value().mean() 200 | print v.get_value()[:5,:5] 201 | 202 | def bench_mlp_500(): 203 | HUs=500 204 | w = shared(rand(HUs, inputs) * numpy.sqrt(6 / (inputs + HUs))) 205 | b = shared(zeros(HUs)) 206 | v = shared(zeros(outputs,HUs)) 207 | c = shared(zeros(outputs)) 208 | 209 | p_y_given_x = softmax(dot(tanh(dot(sx, w.T)+b), v.T)+c) 210 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 211 | cost = nll.mean() 212 | 213 | gw,gb,gv,gc = grad(cost, [w,b,v,c]) 214 | 215 | train = function([si, nsi], cost, 216 | updates={ w:w-lr*gw, 217 | b:b-lr*gb, 218 | v:v-lr*gv, 219 | c:c-lr*gc }) 220 | eval_and_report(train, "mlp_784_500_10") 221 | 222 | def bench_deep1000(): 223 | w0 = shared(rand(inputs, 1000) * numpy.sqrt(6 / (inputs + 1000))) 224 | b0 = shared(zeros(1000)) 225 | w1 = shared(rand(1000, 1000) * numpy.sqrt(6 / (1000+1000))) 226 | b1 = shared(zeros(1000)) 227 | w2 = shared(rand(1000, 1000) * numpy.sqrt(6 / (1000+1000))) 228 | b2 = shared(zeros(1000)) 229 | v = shared(zeros(1000, outputs)) 230 | c = shared(zeros(outputs)) 231 | params=[w0,b0,w1,b1,w2,b2,v,c] 232 | 233 | h0 = tanh(dot(sx, w0)+b0) 234 | h1 = tanh(dot(h0, w1)+b1) 235 | h2 = tanh(dot(h1, w2)+b2) 236 | 237 | p_y_given_x = softmax(dot(h2, v)+c) 238 | nll = -log(p_y_given_x)[arange(sy.shape[0]), sy] 239 | cost = nll.mean() 240 | 241 | gparams = grad(cost, params) 242 | 243 | train = function([si, nsi], cost, 244 | updates=[(p,p-lr*gp) for p,gp in zip(params, gparams)]) 245 | eval_and_report(train, "mlp_784_1000_1000_1000_10") 246 | 247 | if __name__ == '__main__': 248 | online_mlp_784_10() 249 | online_mlp_784_500_10() 250 | bench_logreg() 251 | bench_mlp_500() 252 | #online_mlp_784_1000_1000_1000_10() 253 | #bench_deep1000() 254 | 
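The online_* variants above avoid the per-call Python overhead by keeping the
example index in a shared variable and then driving the compiled function's
underlying fn directly, as in train.fn(n_calls=n_examples). A minimal sketch
of that pattern (toy sizes and made-up names, using the same legacy Theano API
as this file; the weight update is a stand-in, not a real gradient step):

import time
import numpy
import theano

# a shared cursor walks over a shared dataset, so the step takes no arguments
data = theano.shared(numpy.random.rand(100, 5))
w = theano.shared(numpy.zeros(5))
si = theano.shared(0)                      # current example index
step = theano.function([], [],
        updates={w: w + 0.01 * data[si],   # stand-in for a gradient update
                 si: (si + 1) % 100})

t = time.time()
for i in xrange(10000):                    # one Python round-trip per example
    step()
print 'python loop:', time.time() - t

t = time.time()
step.fn(n_calls=10000)                     # drive the compiled thunk directly
print 'fn(n_calls=...):', time.time() - t  # the *_hack timings use this path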
-------------------------------------------------------------------------------- /theano/rbm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | from __future__ import absolute_import 3 | import numpy as np 4 | import sys 5 | import time 6 | from theano.tensor import lscalar, dot, sum as tsum 7 | from theano.tensor.nnet import sigmoid 8 | from theano import shared, function, config 9 | 10 | rng = np.random.RandomState(342) 11 | 12 | def rand(*size): 13 | return np.asarray(rng.rand(*size), dtype=config.floatX) 14 | def randn(*size): 15 | return np.asarray(rng.randn(*size), dtype=config.floatX) 16 | def randint(size, high): 17 | return np.asarray(rng.randint(size=size, low=0, high=high), dtype='int32') 18 | def zeros(*size): 19 | return np.zeros(size, dtype=config.floatX) 20 | 21 | # c: aa.cc 22 | 23 | nin, nout, batchsize, niter = [int(a) for a in sys.argv[1:]] 24 | lr = 0.01 25 | 26 | # declare data 27 | data_x = shared(rand(batchsize*niter, nin)) 28 | si = lscalar() 29 | nsi = lscalar() 30 | 31 | # declare model weights 32 | a = shared(zeros(nin)) 33 | w = shared(zeros(nin, nout)) 34 | b = shared(zeros(nout)) 35 | 36 | import theano.sandbox.rng_mrg 37 | R = theano.sandbox.rng_mrg.MRG_RandomStreams() 38 | 39 | def bern(x, size): 40 | return R.binomial(size=size, p=x, n=1, dtype=config.floatX) 41 | 42 | pos_vis = data_x[si:si+nsi] 43 | 44 | pos_hid = sigmoid(dot(pos_vis, w)+b) 45 | 46 | neg_vis = sigmoid(dot(bern(pos_hid, (batchsize, nout)), w.T)+a) 47 | 48 | neg_hid = sigmoid(dot(bern(neg_vis, (batchsize, nin)), w) + b) 49 | 50 | new_a = a - lr * tsum(pos_vis - neg_vis, axis=0) 51 | new_b = b - lr * tsum(pos_hid - neg_hid, axis=0) 52 | new_w = w - lr * (dot(pos_vis.T, pos_hid) - dot(neg_vis.T, neg_hid)) 53 | 54 | f = function([si, nsi], [], updates={a:new_a, b:new_b, w:new_w}) 55 | 56 | t = time.time() 57 | for i in xrange(niter): 58 | f(i*batchsize, batchsize) 59 | 60 | print 'cd1 rbm_bernoulli %i_%i\ttheano{%s/%s/%i}\t%.2f' %( 61 | nin, nout, 62 | config.device[0], 63 | ('float' if config.floatX == 'float32' else 'double'), 64 | batchsize, 65 | niter*batchsize/(time.time() - t)) 66 | 67 | 68 | -------------------------------------------------------------------------------- /theano/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # FOR MAGGIE I INSTALLED MKL SO DO LIKE THIS: 5 | # LD_LIBRARY_PATH to include /u/bergstrj/pub/intel/mkl/10.2.4.032/lib/em64t 6 | # LIBRARY_PATH to include /u/bergstrj/pub/intel/mkl/10.2.4.032/lib/em64t 7 | # THEANO_FLAGS="device=cpu,floatX=float64,blas.ldflags=-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_def -lpthread" python mlp.py 8 | 9 | 10 | MKL32='linker=c|py_nogc,device=cpu,floatX=float32,blas.ldflags=-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_def' 11 | MKL64='linker=c|py_nogc,device=cpu,floatX=float64,blas.ldflags=-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_def' 12 | GPU32='linker=c|py_nogc,device=gpu0,floatX=float32' 13 | 14 | 15 | #THEANO_FLAGS="$MKL32" python mlp.py 16 | #THEANO_FLAGS="$MKL64" python mlp.py 17 | #THEANO_FLAGS="$GPU32" python mlp.py 18 | 19 | #THEANO_FLAGS="$MKL32" python convnet.py 20 | #THEANO_FLAGS="$MKL64" python convnet.py 21 | #THEANO_FLAGS="$GPU32" python convnet.py 22 | 23 | 24 | cat /proc/cpuinfo |grep "model name"|uniq > ${HOSTNAME}_config.conf 25 | free >> ${HOSTNAME}_config.conf 26 | uname -a >> ${HOSTNAME}_config.conf 27 | 28 | THEANO_FLAGS="$MKL32" python rbm.py 1024 
1024 1 100 > ${HOSTNAME}_rbm_cpu32_b1.bmark 29 | THEANO_FLAGS="$MKL32" python rbm.py 1024 1024 60 20 > ${HOSTNAME}_rbm_cpu32_b60.bmark 30 | 31 | THEANO_FLAGS="$MKL64" python rbm.py 1024 1024 1 100 > ${HOSTNAME}_rbm_cpu64_b1.bmark 32 | THEANO_FLAGS="$MKL64" python rbm.py 1024 1024 60 20 > ${HOSTNAME}_rbm_cpu64_b60.bmark 33 | 34 | #THEANO_FLAGS="$GPU32" python rbm.py 1024 1024 1 100 > ${HOSTNAME}_rbm_gpu32_b1.bmark 35 | #THEANO_FLAGS="$GPU32" python rbm.py 1024 1024 60 20 > ${HOSTNAME}_rbm_gpu32_b60.bmark 36 | 37 | -------------------------------------------------------------------------------- /torch5/MiniBatchGradient.lua: -------------------------------------------------------------------------------- 1 | require "lab" 2 | 3 | local MiniBatchGradient = torch.class('nn.MiniBatchGradient') 4 | 5 | function MiniBatchGradient:__init(module, criterion, batchSize) 6 | self.learningRate = 0.01 7 | self.learningRateDecay = 0 8 | self.maxIteration = 25 9 | self.shuffleIndices = true 10 | self.module = module 11 | self.criterion = criterion 12 | self.batchSize = batchSize or -1 13 | end 14 | 15 | function MiniBatchGradient:train(dataset) 16 | local iteration = 1 17 | local currentLearningRate = self.learningRate 18 | local module = self.module 19 | local criterion = self.criterion 20 | 21 | local shuffledIndices = lab.randperm(dataset:size()) 22 | if not self.shuffleIndices then 23 | for t = 1,dataset:size() do 24 | shuffledIndices[t] = t 25 | end 26 | end 27 | 28 | -- batchSize < 1 means full batch 29 | if self.batchSize < 1 then 30 | self.batchSize = dataset:size() 31 | end 32 | 33 | print("# MiniBatchGradient: training with batch size: " .. self.batchSize) 34 | 35 | while true do 36 | local currentError = 0 37 | module:zeroGradParameters() 38 | for t = 1,dataset:size() do 39 | local example = dataset[shuffledIndices[t]] 40 | local input = example[1] 41 | local target = example[2] 42 | 43 | currentError = currentError + criterion:forward(module:forward(input), target) 44 | 45 | module:backward(input, criterion:backward(module.output, target)) 46 | 47 | if t % self.batchSize == 0 then 48 | module:updateParameters(currentLearningRate) 49 | module:zeroGradParameters() 50 | end 51 | 52 | if self.hookExample then 53 | self.hookExample(self, example) 54 | end 55 | end 56 | 57 | if self.hookIteration then 58 | self.hookIteration(self, iteration) 59 | end 60 | 61 | currentError = currentError / dataset:size() 62 | print("# current error = " ..
currentError) 63 | iteration = iteration + 1 64 | currentLearningRate = self.learningRate/(1+iteration*self.learningRateDecay) 65 | if self.maxIteration > 0 and iteration > self.maxIteration then 66 | print("# MiniBatchGradient: you have reached the maximum number of iterations") 67 | break 68 | end 69 | end 70 | end 71 | 72 | function MiniBatchGradient:write(file) 73 | file:writeDouble(self.learningRate) 74 | file:writeDouble(self.learningRateDecay) 75 | file:writeInt(self.maxIteration) 76 | file:writeBool(self.shuffleIndices) 77 | file:writeObject(self.module) 78 | file:writeObject(self.criterion) 79 | file:writeLong(self.batchSize) 80 | end 81 | 82 | function MiniBatchGradient:read(file) 83 | self.learningRate = file:readDouble() 84 | self.learningRateDecay = file:readDouble() 85 | self.maxIteration = file:readInt() 86 | self.shuffleIndices = file:readBool() 87 | self.module = file:readObject() 88 | self.criterion = file:readObject() 89 | self.batchSize = file:readLong() 90 | end 91 | -------------------------------------------------------------------------------- /torch5/mlp.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | 3 | require "lab" 4 | require "os" 5 | require "nn" 6 | 7 | 8 | n_examples=12000; 9 | outputs=10; 10 | 11 | io.output("torch5.bmark") 12 | 13 | if true then -- MLP 32/10 14 | dataset={}; 15 | function dataset:size() return n_examples end 16 | inputs=32; 17 | for i=1,dataset:size() do 18 | dataset[i] = {lab.randn(inputs), (i % outputs)+1} 19 | end 20 | mlp = nn.Sequential(); -- make a multi-layer perceptron 21 | mlp:add(nn.Linear(inputs, outputs)) 22 | mlp:add(nn.LogSoftMax()) 23 | 24 | criterion = nn.ClassNLLCriterion() 25 | trainer = nn.StochasticGradient(mlp, criterion) 26 | 27 | trainer.learningRate = 0.01 28 | trainer.shuffleIndices = false 29 | trainer.maxIteration = 1 30 | local x = os.clock() 31 | trainer:train(dataset) 32 | -- we're not using Xent, but using Xent would be even slower 33 | io.write(string.format("mlp_%i_%i", inputs, outputs), "\t", 34 | "torch5", "\t", 35 | string.format("%.2f\n", n_examples/(os.clock() - x))) 36 | end 37 | 38 | 39 | dataset={}; 40 | function dataset:size() return n_examples end 41 | inputs=784; 42 | 43 | 44 | for i=1,dataset:size() do 45 | dataset[i] = {lab.randn(inputs), (i % outputs)+1} 46 | end 47 | 48 | if true -- MLP 784/10 49 | then 50 | mlp = nn.Sequential(); -- make a multi-layer perceptron 51 | mlp:add(nn.Linear(inputs, outputs)) 52 | mlp:add(nn.LogSoftMax()) 53 | 54 | criterion = nn.ClassNLLCriterion() 55 | trainer = nn.StochasticGradient(mlp, criterion) 56 | 57 | trainer.learningRate = 0.01 58 | trainer.shuffleIndices = false 59 | trainer.maxIteration = 1 60 | local x = os.clock() 61 | trainer:train(dataset) 62 | -- we're not using Xent, but using Xent would be even slower 63 | io.write(string.format("mlp_%i_%i", inputs, outputs), "\t", 64 | "torch5", "\t", 65 | string.format("%.2f\n", n_examples/(os.clock() - x))) 66 | else 67 | io.write(string.format("# mlp_%i_%i", inputs, outputs), "\t", 68 | "torch5", "\t", 69 | "0.0", "\n") 70 | end 71 | 72 | 73 | if true -- MLP 784/500/10 74 | then 75 | 76 | mlp = nn.Sequential(); -- make a multi-layer perceptron 77 | mlp:add(nn.Linear(inputs, 500)) 78 | mlp:add(nn.Tanh()) 79 | mlp:add(nn.Linear(500, outputs)) 80 | mlp:add(nn.LogSoftMax()) 81 | 82 | criterion = nn.ClassNLLCriterion() 83 | trainer = nn.StochasticGradient(mlp, criterion) 84 | 85 | trainer.learningRate = 0.01 86 | trainer.shuffleIndices
= false 87 | trainer.maxIteration = 1 88 | local x = os.clock() 89 | trainer:train(dataset) 90 | -- we're not using Xent, but using Xent would be even slower 91 | io.write(string.format("mlp_%i_500_%i", inputs, outputs), "\t", 92 | "torch5", "\t", 93 | string.format("%.2f\n", n_examples/(os.clock() - x))) 94 | else 95 | io.write(string.format("# mlp_%i_500_%i", inputs, outputs), "\t", 96 | "torch5", "\t", 97 | "0.0", "\n") 98 | end 99 | 100 | 101 | if true --MLP 784/1000/1000/1000/10 102 | then 103 | 104 | mlp = nn.Sequential(); -- make a multi-layer perceptron 105 | mlp:add(nn.Linear(inputs, 1000)) 106 | mlp:add(nn.Tanh()) 107 | mlp:add(nn.Linear(1000, 1000)) 108 | mlp:add(nn.Tanh()) 109 | mlp:add(nn.Linear(1000, 1000)) 110 | mlp:add(nn.Tanh()) 111 | mlp:add(nn.Linear(1000, outputs)) 112 | mlp:add(nn.LogSoftMax()) 113 | 114 | criterion = nn.ClassNLLCriterion() 115 | trainer = nn.StochasticGradient(mlp, criterion) 116 | 117 | trainer.learningRate = 0.01 118 | trainer.shuffleIndices = false 119 | trainer.maxIteration = 1 120 | local x = os.clock() 121 | trainer:train(dataset) 122 | -- we're not using Xent, but using Xent would be even slower 123 | io.write("mlp_784_1000_1000_1000_10", "\t", 124 | "torch5", "\t", 125 | string.format("%.2f\n", n_examples/(os.clock() - x))) 126 | 127 | else 128 | io.write("# mlp_784_1000_1000_1000_10", "\t", 129 | "torch5", "\t", 130 | "0.0", "\n") 131 | 132 | end 133 | 134 | dset_32x32={}; 135 | function dset_32x32:size() return n_examples end 136 | for i=1,dset_32x32:size() do 137 | dset_32x32[i] = {lab.randn(32,32,1), (i % outputs)+1} 138 | end 139 | 140 | if true --LeNet5-like 32x32 141 | then 142 | 143 | -- There is no max-pooling implemented, just avg pooling. 144 | -- So I added tanh between every layer to separate the true conv layers from 145 | -- the subsampling (which is just a convolution with 1s) 146 | 147 | mlp = nn.Sequential(); -- make a multi-layer perceptron 148 | mlp:add(nn.SpatialConvolution(1, 6, 5, 5)) -- output 28x28 149 | mlp:add(nn.Tanh()) 150 | mlp:add(nn.SpatialSubSampling(6, 2, 2, 2, 2)) --output 14x14 151 | mlp:add(nn.Tanh()) 152 | mlp:add(nn.SpatialConvolution(6, 16, 5, 5)) -- output 10x10 153 | mlp:add(nn.Tanh()) 154 | mlp:add(nn.SpatialSubSampling(16, 2, 2, 2, 2)) -- output 5x5 155 | mlp:add(nn.Tanh()) 156 | mlp:add(nn.Reshape(16*5*5)) 157 | mlp:add(nn.Linear(16*5*5, 120)) 158 | mlp:add(nn.Linear(120, outputs)) 159 | mlp:add(nn.LogSoftMax()) 160 | 161 | criterion = nn.ClassNLLCriterion() 162 | trainer = nn.StochasticGradient(mlp, criterion) 163 | 164 | trainer.learningRate = 0.01 165 | trainer.shuffleIndices = false 166 | trainer.maxIteration = 1 167 | local x = os.clock() 168 | trainer:train(dset_32x32) 169 | -- we're not using Xent, but using Xent would be even slower 170 | io.write("ConvSmall", "\t", 171 | "torch5", "\t", 172 | string.format("%.2f\n", n_examples/(os.clock() - x))) 173 | end 174 | 175 | dset_96x96={}; 176 | function dset_96x96:size() return 100 end 177 | for i=1,dset_96x96:size() do 178 | dset_96x96[i] = {lab.randn(96,96,1), (i % outputs)+1} 179 | end 180 | 181 | if true --LeNet5-like 96x96 182 | then 183 | 184 | -- There is no max-pooling implemented, just avg pooling.
185 | -- So I added tanh between every layer to separate the true conv layers from 186 | -- the subsampling (which is just a convolution with 1s) 187 | 188 | mlp = nn.Sequential(); -- make a multi-layer perceptron 189 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 90x90 190 | mlp:add(nn.Tanh()) 191 | mlp:add(nn.SpatialSubSampling(6, 3, 3, 3, 3)) --output 30x30 192 | mlp:add(nn.Tanh()) 193 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 24x24 194 | mlp:add(nn.Tanh()) 195 | mlp:add(nn.SpatialSubSampling(16, 3, 3, 3, 3)) -- output 8x8 196 | mlp:add(nn.Tanh()) 197 | mlp:add(nn.Reshape(16*8*8)) 198 | mlp:add(nn.Linear(16*8*8, 120)) 199 | mlp:add(nn.Linear(120, outputs)) 200 | mlp:add(nn.LogSoftMax()) 201 | 202 | criterion = nn.ClassNLLCriterion() 203 | trainer = nn.StochasticGradient(mlp, criterion) 204 | 205 | trainer.learningRate = 0.01 206 | trainer.shuffleIndices = false 207 | trainer.maxIteration = 1 208 | local x = os.clock() 209 | trainer:train(dset_96x96) 210 | -- we're not using Xent, but using Xent would be even slower 211 | io.write("ConvMed", "\t", 212 | "torch5", "\t", 213 | string.format("%.2f\n", dset_96x96:size()/(os.clock() - x))) 214 | end 215 | 216 | 217 | dset_256x256={}; 218 | function dset_256x256:size() return 20 end 219 | for i=1,dset_256x256:size() do 220 | dset_256x256[i] = {lab.randn(256,256,1), (i % outputs)+1} 221 | end 222 | 223 | if true --LeNet5-like 256x256 224 | then 225 | 226 | -- There is no max-pooling implemented, just avg pooling. 227 | -- So I added tanh between every layer to separate the true conv layers from 228 | -- the subsampling (which is just a convolution with 1s) 229 | 230 | mlp = nn.Sequential(); -- make a multi-layer perceptron 231 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 250x250 232 | mlp:add(nn.Tanh()) 233 | mlp:add(nn.SpatialSubSampling(6, 5, 5, 5, 5)) --output 50x50 234 | mlp:add(nn.Tanh()) 235 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 44x44 236 | mlp:add(nn.Tanh()) 237 | mlp:add(nn.SpatialSubSampling(16, 4, 4, 4, 4)) -- output 11x11 238 | mlp:add(nn.Tanh()) 239 | mlp:add(nn.Reshape(16*11*11)) 240 | mlp:add(nn.Linear(16*11*11, 120)) 241 | mlp:add(nn.Linear(120, outputs)) 242 | mlp:add(nn.LogSoftMax()) 243 | 244 | criterion = nn.ClassNLLCriterion() 245 | trainer = nn.StochasticGradient(mlp, criterion) 246 | 247 | trainer.learningRate = 0.01 248 | trainer.shuffleIndices = false 249 | trainer.maxIteration = 1 250 | local x = os.clock() 251 | trainer:train(dset_256x256) 252 | -- we're not using Xent, but using Xent would be even slower 253 | io.write("ConvLarge", "\t", 254 | "torch5", "\t", 255 | string.format("%.2f\n", dset_256x256:size()/(os.clock() - x))) 256 | end 257 | -------------------------------------------------------------------------------- /torch5/mlp_minibatch.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | 3 | require "lab" 4 | require "os" 5 | require "nn" 6 | 7 | -- When discussing Torch's performance by email, 8 | -- Ronan sent me this file for doing mini-batches. 9 | -- It seems to be an unofficial feature; I don't know why it isn't in the 10 | -- main distribution...
11 | dofile('MiniBatchGradient.lua') 12 | 13 | 14 | n_examples=12000; 15 | outputs=10; 16 | 17 | io.output("torch5_minibatch.bmark") 18 | 19 | if true then -- MLP 32/10 20 | dataset={}; 21 | function dataset:size() return n_examples end 22 | inputs=32; 23 | for i=1,dataset:size() do 24 | dataset[i] = {lab.randn(inputs), (i % outputs)+1} 25 | end 26 | mlp = nn.Sequential(); -- make a multi-layer perceptron 27 | mlp:add(nn.Linear(inputs, outputs)) 28 | mlp:add(nn.LogSoftMax()) 29 | 30 | criterion = nn.ClassNLLCriterion() 31 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 32 | 33 | trainer.learningRate = 0.01 34 | trainer.shuffleIndices = false 35 | trainer.maxIteration = 1 36 | local x = os.clock() 37 | trainer:train(dataset) 38 | -- we're not using Xent, but using Xent would be even slower 39 | io.write(string.format("mlp_%i_%i", inputs, outputs), "\t", 40 | "torch5{60}", "\t", 41 | string.format("%.2f\n", n_examples/(os.clock() - x))) 42 | end 43 | 44 | 45 | dataset={}; 46 | function dataset:size() return n_examples end 47 | inputs=784; 48 | 49 | 50 | for i=1,dataset:size() do 51 | dataset[i] = {lab.randn(inputs), (i % outputs)+1} 52 | end 53 | 54 | if true -- MLP 784/10 55 | then 56 | mlp = nn.Sequential(); -- make a multi-layer perceptron 57 | mlp:add(nn.Linear(inputs, outputs)) 58 | mlp:add(nn.LogSoftMax()) 59 | 60 | criterion = nn.ClassNLLCriterion() 61 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 62 | 63 | trainer.learningRate = 0.01 64 | trainer.shuffleIndices = false 65 | trainer.maxIteration = 1 66 | local x = os.clock() 67 | trainer:train(dataset) 68 | -- we're not using Xent, but using Xent would be even slower 69 | io.write(string.format("mlp_%i_%i", inputs, outputs), "\t", 70 | "torch5{60}", "\t", 71 | string.format("%.2f\n", n_examples/(os.clock() - x))) 72 | else 73 | io.write(string.format("# mlp_%i_%i", inputs, outputs), "\t", 74 | "torch5{60}", "\t", 75 | "0.0", "\n") 76 | end 77 | 78 | 79 | if true -- MLP 784/500/10 80 | then 81 | 82 | mlp = nn.Sequential(); -- make a multi-layer perceptron 83 | mlp:add(nn.Linear(inputs, 500)) 84 | mlp:add(nn.Tanh()) 85 | mlp:add(nn.Linear(500, outputs)) 86 | mlp:add(nn.LogSoftMax()) 87 | 88 | criterion = nn.ClassNLLCriterion() 89 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 90 | 91 | trainer.learningRate = 0.01 92 | trainer.shuffleIndices = false 93 | trainer.maxIteration = 1 94 | local x = os.clock() 95 | trainer:train(dataset) 96 | -- we're not using Xent, but using Xent would be even slower 97 | io.write(string.format("mlp_%i_500_%i", inputs, outputs), "\t", 98 | "torch5{60}", "\t", 99 | string.format("%.2f\n", n_examples/(os.clock() - x))) 100 | else 101 | io.write(string.format("# mlp_%i_500_%i", inputs, outputs), "\t", 102 | "torch5{60}", "\t", 103 | "0.0", "\n") 104 | end 105 | 106 | 107 | if true --MLP 784/1000/1000/1000/10 108 | then 109 | 110 | mlp = nn.Sequential(); -- make a multi-layer perceptron 111 | mlp:add(nn.Linear(inputs, 1000)) 112 | mlp:add(nn.Tanh()) 113 | mlp:add(nn.Linear(1000, 1000)) 114 | mlp:add(nn.Tanh()) 115 | mlp:add(nn.Linear(1000, 1000)) 116 | mlp:add(nn.Tanh()) 117 | mlp:add(nn.Linear(1000, outputs)) 118 | mlp:add(nn.LogSoftMax()) 119 | 120 | criterion = nn.ClassNLLCriterion() 121 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 122 | 123 | trainer.learningRate = 0.01 124 | trainer.shuffleIndices = false 125 | trainer.maxIteration = 1 126 | local x = os.clock() 127 | trainer:train(dataset) 128 | -- we're not using Xent, but using Xent would be even slower 129 |
io.write("mlp_784_1000_1000_1000_10", "\t", 130 | "torch5{60}", "\t", 131 | string.format("%.2f\n", n_examples/(os.clock() - x))) 132 | 133 | else 134 | io.write("# mlp_784_1000_1000_1000_10", "\t", 135 | "torch5{60}", "\t", 136 | "0.0", "\n") 137 | 138 | end 139 | 140 | dset_32x32={}; 141 | function dset_32x32:size() return n_examples end 142 | for i=1,dset_32x32:size() do 143 | dset_32x32[i] = {lab.randn(32,32,1), (i % outputs)+1} 144 | end 145 | 146 | if true --LeNet5-like 32x32 147 | then 148 | 149 | -- There is no max-pooling implemented, just avg pooling. 150 | -- So I added tanh between every layer to separate the true conv layers from 151 | -- the subsampling (which is just a convolution with 1s) 152 | 153 | mlp = nn.Sequential(); -- make a multi-layer perceptron 154 | mlp:add(nn.SpatialConvolution(1, 6, 5, 5)) -- output 28x28 155 | mlp:add(nn.Tanh()) 156 | mlp:add(nn.SpatialSubSampling(6, 2, 2, 2, 2)) --output 14x14 157 | mlp:add(nn.Tanh()) 158 | mlp:add(nn.SpatialConvolution(6, 16, 5, 5)) -- output 10x10 159 | mlp:add(nn.Tanh()) 160 | mlp:add(nn.SpatialSubSampling(16, 2, 2, 2, 2)) -- output 5x5 161 | mlp:add(nn.Tanh()) 162 | mlp:add(nn.Reshape(16*5*5)) 163 | mlp:add(nn.Linear(16*5*5, 120)) 164 | mlp:add(nn.Linear(120, outputs)) 165 | mlp:add(nn.LogSoftMax()) 166 | 167 | criterion = nn.ClassNLLCriterion() 168 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 169 | 170 | trainer.learningRate = 0.01 171 | trainer.shuffleIndices = false 172 | trainer.maxIteration = 1 173 | local x = os.clock() 174 | trainer:train(dset_32x32) 175 | -- we're not using Xent, but using Xent would be even slower 176 | io.write("ConvSmall", "\t", 177 | "torch5{60}", "\t", 178 | string.format("%.2f\n", n_examples/(os.clock() - x))) 179 | end 180 | 181 | dset_96x96={}; 182 | function dset_96x96:size() return 100 end 183 | for i=1,dset_96x96:size() do 184 | dset_96x96[i] = {lab.randn(96,96,1), (i % outputs)+1} 185 | end 186 | 187 | if true --LeNet5-like 96x96 188 | then 189 | 190 | -- There is no max-pooling implemented, just avg pooling.
191 | -- So I added tanh between every layer to separate the true conv layers from 192 | -- the subsampling (which is just a convolution with 1s) 193 | 194 | mlp = nn.Sequential(); -- make a multi-layer perceptron 195 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 90x90 196 | mlp:add(nn.Tanh()) 197 | mlp:add(nn.SpatialSubSampling(6, 3, 3, 3, 3)) --output 30x30 198 | mlp:add(nn.Tanh()) 199 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 24x24 200 | mlp:add(nn.Tanh()) 201 | mlp:add(nn.SpatialSubSampling(16, 3, 3, 3, 3)) -- output 8x8 202 | mlp:add(nn.Tanh()) 203 | mlp:add(nn.Reshape(16*8*8)) 204 | mlp:add(nn.Linear(16*8*8, 120)) 205 | mlp:add(nn.Linear(120, outputs)) 206 | mlp:add(nn.LogSoftMax()) 207 | 208 | criterion = nn.ClassNLLCriterion() 209 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 210 | 211 | trainer.learningRate = 0.01 212 | trainer.shuffleIndices = false 213 | trainer.maxIteration = 1 214 | local x = os.clock() 215 | trainer:train(dset_96x96) 216 | -- we're not using Xent, but using Xent would be even slower 217 | io.write("ConvMed", "\t", 218 | "torch5{60}", "\t", 219 | string.format("%.2f\n", dset_96x96:size()/(os.clock() - x))) 220 | end 221 | 222 | 223 | dset_256x256={}; 224 | function dset_256x256:size() return 20 end 225 | for i=1,dset_256x256:size() do 226 | dset_256x256[i] = {lab.randn(256,256,1), (i % outputs)+1} 227 | end 228 | 229 | if true --LeNet5-like 256x256 230 | then 231 | 232 | -- There is no max-pooling implemented, just avg pooling. 233 | -- So I added tanh between every layer to separate the true conv layers from 234 | -- the subsampling (which is just a convolution with 1s) 235 | 236 | mlp = nn.Sequential(); -- make a multi-layer perceptron 237 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 250x250 238 | mlp:add(nn.Tanh()) 239 | mlp:add(nn.SpatialSubSampling(6, 5, 5, 5, 5)) --output 50x50 240 | mlp:add(nn.Tanh()) 241 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 44x44 242 | mlp:add(nn.Tanh()) 243 | mlp:add(nn.SpatialSubSampling(16, 4, 4, 4, 4)) -- output 11x11 244 | mlp:add(nn.Tanh()) 245 | mlp:add(nn.Reshape(16*11*11)) 246 | mlp:add(nn.Linear(16*11*11, 120)) 247 | mlp:add(nn.Linear(120, outputs)) 248 | mlp:add(nn.LogSoftMax()) 249 | 250 | criterion = nn.ClassNLLCriterion() 251 | trainer = nn.MiniBatchGradient(mlp, criterion, 60) 252 | 253 | trainer.learningRate = 0.01 254 | trainer.shuffleIndices = false 255 | trainer.maxIteration = 1 256 | local x = os.clock() 257 | trainer:train(dset_256x256) 258 | -- we're not using Xent, but using Xent would be even slower 259 | io.write("ConvLarge", "\t", 260 | "torch5{60}", "\t", 261 | string.format("%.2f\n", dset_256x256:size()/(os.clock() - x))) 262 | end 263 | -------------------------------------------------------------------------------- /torch5/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ./mlp.lua 4 | 5 | mv torch5.bmark ${HOSTNAME}_torch5.bmark 6 | 7 | ./mlp_minibatch.lua 8 | mv torch5_minibatch.bmark ${HOSTNAME}_torch5_minibatch.bmark 9 | -------------------------------------------------------------------------------- /torch7/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | run.sh.results* 3 | lib 4 | -------------------------------------------------------------------------------- /torch7/README.txt: -------------------------------------------------------------------------------- 1 | The code in this directory was forked from 2 | 3 |
https://github.com/andresy/benchmark/commit/cd81345962bc05fe4819a56a675681605ea1587f 4 | 5 | Installing Torch 7 6 | ------------------ 7 | 8 | Torch 7 (https://github.com/andresy/torch) is required to run the scripts in 9 | this folder. I had personal help from Koray to install torch7. It was 10 | straightforward once he convinced me not to use luarocks. Torch7 failed to 11 | find the openblas I installed, so I had to trick it post-compilation by 12 | setting the LD_LIBRARY_PATH to include a symlink with the right name to my 13 | libopenblas.so. Use ldd on the libTH.so built by torch7 to see what name you 14 | must give to this fake library. 15 | 16 | 17 | Running timing experiments 18 | -------------------------- 19 | 20 | The file run.sh produces a number of timing files whose names are of the form 21 | run.sh.results_${HOSTNAME}_b[1,10,100]_p[32,64][,_openmp,_cuda] 22 | 23 | The cuda trials are run on the GPU device 0, and simply fail if no GPU is 24 | present. 25 | 26 | 27 | Adding results to DB 28 | -------------------- 29 | 30 | To add the current timing results to ../db.pkl, type: 31 | 32 | $ python add_to_db.py --db ../db.pkl run.sh.results_* 33 | -------------------------------------------------------------------------------- /torch7/SpatialConvolutionFast.lua: -------------------------------------------------------------------------------- 1 | local SpatialConvolutionFast, parent = torch.class('nn.SpatialConvolutionFast', 'nn.Module') 2 | 3 | function SpatialConvolutionFast:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH) 4 | parent.__init(self) 5 | 6 | dW = dW or 1 7 | dH = dH or 1 8 | 9 | self.nInputPlane = nInputPlane 10 | self.nOutputPlane = nOutputPlane 11 | self.kW = kW 12 | self.kH = kH 13 | self.dW = dW 14 | self.dH = dH 15 | 16 | self.weight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW) 17 | self.bias = torch.Tensor(nOutputPlane) 18 | self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW) 19 | self.gradBias = torch.Tensor(nOutputPlane) 20 | 21 | self.finput = torch.Tensor() 22 | self.fgradInput = torch.Tensor() 23 | 24 | self:reset() 25 | end 26 | 27 | function SpatialConvolutionFast:reset(stdv) 28 | if stdv then 29 | stdv = stdv * math.sqrt(3) 30 | else 31 | stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane) 32 | end 33 | self.weight:apply(function() 34 | return random.uniform(-stdv, stdv) 35 | end) 36 | self.bias:apply(function() 37 | return random.uniform(-stdv, stdv) 38 | end) 39 | end 40 | 41 | function SpatialConvolutionFast:forward(input) 42 | input = input:unfold(2, self.kH, self.dH) 43 | input = input:unfold(3, self.kW, self.dW) 44 | input = input:transpose(2,4) 45 | input = input:transpose(3,5) 46 | 47 | self.finput:resize(self.kW*self.kH*self.nInputPlane, input:size(4)*input:size(5)):copy(input) 48 | 49 | self.output:resize(self.nOutputPlane, input:size(4), input:size(5)) 50 | local output = input.new(self.output:storage(), 1, self.nOutputPlane, -1, input:size(4)*input:size(5), -1):copy( 51 | input.new(self.bias:storage(), 1, self.nOutputPlane, 1, input:size(4)*input:size(5), 0)) 52 | 53 | output:addmm(1, self.weight, self.finput) 54 | return self.output 55 | end 56 | 57 | function SpatialConvolutionFast:backward(input, gradOutput) 58 | if self.gradInput then 59 | gradOutput = input.new(gradOutput:storage(), 1, gradOutput:size(1), -1, gradOutput:size(2)*gradOutput:size(3), -1) 60 | 61 | self.fgradInput:resizeAs(self.finput):zero() 62 | self.fgradInput:addmm(1, self.weight:t(), gradOutput) 63 | 64 | self.gradInput:resizeAs(input):zero() 65 | local 
gradInput = self.gradInput:unfold(2, self.kH, self.dH) 66 | gradInput = gradInput:unfold(3, self.kW, self.dW) 67 | gradInput = gradInput:transpose(2,4) 68 | gradInput = gradInput:transpose(3,5) 69 | gradInput:add(self.fgradInput) 70 | 71 | return self.gradInput 72 | 73 | end 74 | end 75 | 76 | function SpatialConvolutionFast:accGradParameters(input, gradOutput, scale) 77 | gradOutput = input.new(gradOutput:storage(), 1, gradOutput:size(1), -1, gradOutput:size(2)*gradOutput:size(3), -1) 78 | self.gradWeight:addmm(1, gradOutput, self.finput:t()) 79 | input.new(self.gradBias:storage(), 1, gradOutput:size(1), 1, gradOutput:size(2), 0):add(gradOutput) 80 | end 81 | -------------------------------------------------------------------------------- /torch7/add_to_db.py: -------------------------------------------------------------------------------- 1 | """ 2 | Write the results of run.sh to the pickled database of timing results 3 | """ 4 | import os 5 | import sys 6 | import cPickle 7 | 8 | def main(): 9 | assert sys.argv[1] == '--db' 10 | try: 11 | db = cPickle.load(open(sys.argv[2])) 12 | except IOError: 13 | db = [] 14 | 15 | for results_file in sys.argv[3:]: 16 | template = dict() 17 | for lineno, line in enumerate(open(results_file)): 18 | if '=' in line: 19 | key = line[:line.index('=')] 20 | val = line[line.index('=') + 1:] 21 | if key in ('host', 'device'): 22 | template[key] = val.strip() 23 | elif key in ('OpenMP',): 24 | template[key] = bool(int(val)) 25 | elif key in ('batch', 'precision'): 26 | template[key] = int(val) 27 | else: 28 | raise ValueError(key) 29 | 30 | elif line.startswith('mlp') or line.startswith('cnn'): 31 | problem, speed_str = line.split('\t') 32 | entry = dict(template) 33 | entry['problem'] = problem 34 | entry['speed'] = float(speed_str) 35 | db.append(entry) 36 | elif line.strip(): # ignore blank lines; anything else is unexpected 37 | print "ERROR: ", line 38 | 39 | if 1: 40 | print "Writing database to", sys.argv[2] 41 | cPickle.dump(db, open(sys.argv[2], 'wb')) 42 | else: 43 | print "DEBUG FINAL DB:" 44 | for entry in db: 45 | print entry 46 | 47 | if __name__ == '__main__': 48 | sys.exit(main()) 49 | -------------------------------------------------------------------------------- /torch7/benchmark.lua: -------------------------------------------------------------------------------- 1 | require "lab" 2 | require "nn" 3 | 4 | cmd = torch.CmdLine() 5 | 6 | cmd:text() 7 | cmd:text('Benchmark Torch7') 8 | cmd:text() 9 | cmd:text() 10 | cmd:text('Misc options:') 11 | cmd:option('-nomlp', false, 'do not perform MLP tests') 12 | cmd:option('-nocnn', false, 'do not perform CNN tests') 13 | cmd:option('-nexmlp', 60000, '# of examples for the MLPs') 14 | cmd:option('-nexcnn', 6000, '# of examples for the CNNs') 15 | cmd:option('-hardtanh', false, 'use hardtanh instead of tanh') 16 | cmd:option('-convfast', false, 'use "fast" convolution code instead of standard') 17 | cmd:option('-openmp', false, 'use openmp *package*') 18 | cmd:option('-double', false, 'use doubles instead of floats') 19 | cmd:option('-cuda', false, 'use CUDA (float32 on the GPU)') 20 | cmd:option('-gi', false, 'compute gradInput') 21 | cmd:option('-v', false, 'be verbose') 22 | cmd:option('-batch', 1, 'batch size') 23 | 24 | cmd:text() 25 | 26 | local params = cmd:parse(arg) 27 | 28 | random.manualSeed(5555) 29 | 30 | if params.v then 31 | printlog = print 32 | else 33 | printlog = print 34 |
print = function() 35 | end 36 | end 37 | 38 | if params.openmp then 39 | require 'openmp' 40 | end 41 | 42 | if params.convfast then 43 | dofile('SpatialConvolutionFast.lua') 44 | nn.SpatialConvolution = nn.SpatialConvolutionFast 45 | end 46 | 47 | if params.hardtanh then 48 | nn.Tanh = nn.HardTanh 49 | end 50 | 51 | if params.double and params.cuda then 52 | error('make your choice between double and cuda!!') 53 | end 54 | 55 | if params.double then 56 | torch.setdefaulttensortype('torch.DoubleTensor') 57 | elseif params.cuda then 58 | require 'cunn' 59 | dofile('cudahacks.lua') 60 | torch.setdefaulttensortype('torch.CudaTensor') 61 | else 62 | torch.setdefaulttensortype('torch.FloatTensor') 63 | end 64 | 65 | local noutput = 10 66 | 67 | if not params.nomlp then 68 | 69 | local ninput = 784 70 | local dataset = {} 71 | local data = lab.randn(params.nexmlp, ninput) 72 | local label = torch.LongTensor(params.nexmlp) 73 | for i=1,params.nexmlp do 74 | label[i] = (i % noutput) + 1 75 | end 76 | 77 | if params.batch == 1 then 78 | function dataset:size() 79 | return params.nexmlp 80 | end 81 | 82 | setmetatable(dataset, {__index = function(self, index) 83 | return {data[index], label[index]} 84 | end}) 85 | else 86 | assert(params.nexmlp % params.batch == 0, '# of examples must be divisible with batch size') 87 | function dataset:size() 88 | return params.nexmlp/params.batch 89 | end 90 | setmetatable(dataset, {__index = function(self, index) 91 | return {data:narrow(1,(index-1)*params.batch+1, params.batch), 92 | label:narrow(1,(index-1)*params.batch+1, params.batch)} 93 | end}) 94 | end 95 | 96 | if true then -- MLP 784/10 97 | collectgarbage() 98 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 99 | mlp:add(nn.Linear(ninput, noutput)) 100 | 101 | if params.cuda then 102 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 103 | torch.setdefaulttensortype('torch.FloatTensor') 104 | end 105 | 106 | mlp:add(nn.LogSoftMax()) 107 | 108 | if not params.gi then 109 | if params.v then 110 | print('# do not compute gradInput') 111 | end 112 | mlp:get(1).gradInput = nil 113 | end 114 | 115 | local criterion = nn.ClassNLLCriterion() 116 | 117 | if params.cuda then 118 | torch.setdefaulttensortype('torch.CudaTensor') 119 | end 120 | 121 | local trainer = nn.StochasticGradient(mlp, criterion) 122 | 123 | trainer.learningRate = 0.01 124 | trainer.shuffleIndices = false 125 | trainer.maxIteration = 1 126 | local t = torch.Timer() 127 | trainer:train(dataset) 128 | printlog(string.format("mlp_%i_%i\t%.2f", ninput, noutput, params.nexmlp/t:time().real)) 129 | end 130 | 131 | if true then -- MLP 784/500/10 132 | collectgarbage() 133 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 134 | mlp:add(nn.Linear(ninput, 500)) 135 | mlp:add(nn.Tanh()) 136 | mlp:add(nn.Linear(500, noutput)) 137 | 138 | if params.cuda then 139 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 140 | torch.setdefaulttensortype('torch.FloatTensor') 141 | end 142 | 143 | mlp:add(nn.LogSoftMax()) 144 | 145 | if not params.gi then 146 | if params.v then 147 | print('# do not compute gradInput') 148 | end 149 | mlp:get(1).gradInput = nil 150 | end 151 | 152 | local criterion = nn.ClassNLLCriterion() 153 | 154 | if params.cuda then 155 | torch.setdefaulttensortype('torch.CudaTensor') 156 | end 157 | 158 | local trainer = nn.StochasticGradient(mlp, criterion) 159 | 160 | trainer.learningRate = 0.01 161 | trainer.shuffleIndices = false 162 | trainer.maxIteration = 1 163 | local t = torch.Timer() 
164 | trainer:train(dataset) 165 | printlog(string.format("mlp_%i_500_%i\t%.2f", ninput, noutput, params.nexmlp/t:time().real)) 166 | end 167 | 168 | 169 | if true then --MLP 784/1000/1000/1000/10 170 | collectgarbage() 171 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 172 | mlp:add(nn.Linear(ninput, 1000)) 173 | mlp:add(nn.Tanh()) 174 | mlp:add(nn.Linear(1000, 1000)) 175 | mlp:add(nn.Tanh()) 176 | mlp:add(nn.Linear(1000, 1000)) 177 | mlp:add(nn.Tanh()) 178 | mlp:add(nn.Linear(1000, noutput)) 179 | 180 | if params.cuda then 181 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 182 | torch.setdefaulttensortype('torch.FloatTensor') 183 | end 184 | 185 | mlp:add(nn.LogSoftMax()) 186 | 187 | if not params.gi then 188 | if params.v then 189 | print('# do not compute gradInput') 190 | end 191 | mlp:get(1).gradInput = nil 192 | end 193 | 194 | local criterion = nn.ClassNLLCriterion() 195 | 196 | if params.cuda then 197 | torch.setdefaulttensortype('torch.CudaTensor') 198 | end 199 | 200 | local trainer = nn.StochasticGradient(mlp, criterion) 201 | 202 | trainer.learningRate = 0.01 203 | trainer.shuffleIndices = false 204 | trainer.maxIteration = 1 205 | local t = torch.Timer() 206 | trainer:train(dataset) 207 | printlog(string.format("mlp_%i_1000_1000_1000_%i\t%.2f", ninput, noutput, params.nexmlp/t:time().real)) 208 | end 209 | end 210 | 211 | if not params.nocnn then 212 | 213 | function createcnndataset(nex,w,h) 214 | local dataset = {} 215 | local data = lab.randn(nex, 1, w, h) 216 | local label = torch.LongTensor(nex) 217 | for i=1,nex do 218 | label[i] = (i % noutput) + 1 219 | end 220 | 221 | if params.batch == 1 then 222 | function dataset:size() 223 | return nex 224 | end 225 | 226 | setmetatable(dataset, {__index = function(self, index) 227 | return {data[index], label[index]} 228 | end}) 229 | else 230 | assert(nex % params.batch == 0, '# of examples must be divisible with batch size') 231 | function dataset:size() 232 | return nex/params.batch 233 | end 234 | setmetatable(dataset, {__index = function(self, index) 235 | return {data:narrow(1,(index-1)*params.batch+1, params.batch), 236 | label:narrow(1,(index-1)*params.batch+1, params.batch)} 237 | end}) 238 | end 239 | 240 | return dataset 241 | end 242 | 243 | if true then --LeNet5-like 32x32 244 | collectgarbage() 245 | local dataset = createcnndataset(params.nexcnn, 32, 32) 246 | 247 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 248 | mlp:add(nn.SpatialConvolution(1, 6, 5, 5)) -- output 28x28 249 | mlp:add(nn.Tanh()) 250 | mlp:add(nn.SpatialSubSampling(6, 2, 2, 2, 2)) --output 14x14 251 | mlp:add(nn.Tanh()) 252 | mlp:add(nn.SpatialConvolution(6, 16, 5, 5)) -- output 10x10 253 | mlp:add(nn.Tanh()) 254 | mlp:add(nn.SpatialSubSampling(16, 2, 2, 2, 2)) -- output 5x5 255 | mlp:add(nn.Tanh()) 256 | mlp:add(nn.Reshape(16*5*5)) 257 | mlp:add(nn.Linear(16*5*5, 120)) 258 | mlp:add(nn.Linear(120, noutput)) 259 | 260 | if params.cuda then 261 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 262 | torch.setdefaulttensortype('torch.FloatTensor') 263 | end 264 | 265 | mlp:add(nn.LogSoftMax()) 266 | 267 | if not params.gi then 268 | if params.v then 269 | print('# do not compute gradInput') 270 | end 271 | mlp:get(1).gradInput = nil 272 | end 273 | 274 | local criterion = nn.ClassNLLCriterion() 275 | 276 | if params.cuda then 277 | torch.setdefaulttensortype('torch.CudaTensor') 278 | end 279 | 280 | local trainer = nn.StochasticGradient(mlp, criterion) 281 | 282 |
trainer.learningRate = 0.01 283 | trainer.shuffleIndices = false 284 | trainer.maxIteration = 1 285 | local t = torch.Timer() 286 | trainer:train(dataset) 287 | printlog(string.format("cnn_32x32\t%.2f", params.nexcnn/t:time().real)) 288 | end 289 | 290 | if true then --LeNet5-like 96x96 291 | collectgarbage() 292 | local dataset = createcnndataset(params.nexcnn, 96, 96) 293 | 294 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 295 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 90x90 296 | mlp:add(nn.Tanh()) 297 | mlp:add(nn.SpatialSubSampling(6, 3, 3, 3, 3)) --output 30x30 298 | mlp:add(nn.Tanh()) 299 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 24x24 300 | mlp:add(nn.Tanh()) 301 | mlp:add(nn.SpatialSubSampling(16, 3, 3, 3, 3)) -- output 8x8 302 | mlp:add(nn.Tanh()) 303 | mlp:add(nn.Reshape(16*8*8)) 304 | mlp:add(nn.Linear(16*8*8, 120)) 305 | mlp:add(nn.Linear(120, noutput)) 306 | 307 | if params.cuda then 308 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 309 | torch.setdefaulttensortype('torch.FloatTensor') 310 | end 311 | 312 | mlp:add(nn.LogSoftMax()) 313 | 314 | if not params.gi then 315 | if params.v then 316 | print('# do not compute gradInput') 317 | end 318 | mlp:get(1).gradInput = nil 319 | end 320 | 321 | local criterion = nn.ClassNLLCriterion() 322 | 323 | if params.cuda then 324 | torch.setdefaulttensortype('torch.CudaTensor') 325 | end 326 | 327 | local trainer = nn.StochasticGradient(mlp, criterion) 328 | 329 | trainer.learningRate = 0.01 330 | trainer.shuffleIndices = false 331 | trainer.maxIteration = 1 332 | local t = torch.Timer() 333 | trainer:train(dataset) 334 | printlog(string.format("cnn_96x96\t%.2f", params.nexcnn/t:time().real)) 335 | end 336 | 337 | if true then --LeNet5-like 256x256 338 | collectgarbage() 339 | local dataset = createcnndataset(params.nexcnn, 256, 256) 340 | 341 | local mlp = nn.Sequential(); -- make a multi-layer perceptron 342 | mlp:add(nn.SpatialConvolution(1, 6, 7, 7)) -- output 250x250 343 | mlp:add(nn.Tanh()) 344 | mlp:add(nn.SpatialSubSampling(6, 5, 5, 5, 5)) --output 50x50 345 | mlp:add(nn.Tanh()) 346 | mlp:add(nn.SpatialConvolution(6, 16, 7, 7)) -- output 44x44 347 | mlp:add(nn.Tanh()) 348 | mlp:add(nn.SpatialSubSampling(16, 4, 4, 4, 4)) -- output 11x11 349 | mlp:add(nn.Tanh()) 350 | mlp:add(nn.Reshape(16*11*11)) 351 | mlp:add(nn.Linear(16*11*11, 120)) 352 | mlp:add(nn.Linear(120, noutput)) 353 | 354 | if params.cuda then 355 | mlp:add(nn.Copy('torch.CudaTensor', 'torch.FloatTensor')) 356 | torch.setdefaulttensortype('torch.FloatTensor') 357 | end 358 | 359 | mlp:add(nn.LogSoftMax()) 360 | 361 | if not params.gi then 362 | if params.v then 363 | print('# do not compute gradInput') 364 | end 365 | mlp:get(1).gradInput = nil 366 | end 367 | 368 | local criterion = nn.ClassNLLCriterion() 369 | 370 | if params.cuda then 371 | torch.setdefaulttensortype('torch.CudaTensor') 372 | end 373 | 374 | local trainer = nn.StochasticGradient(mlp, criterion) 375 | 376 | trainer.learningRate = 0.01 377 | trainer.shuffleIndices = false 378 | trainer.maxIteration = 1 379 | local t = torch.Timer() 380 | trainer:train(dataset) 381 | printlog(string.format("cnn_256x256\t%.2f", params.nexcnn/t:time().real)) 382 | end 383 | end 384 | -------------------------------------------------------------------------------- /torch7/cudahacks.lua: -------------------------------------------------------------------------------- 1 | torch.CudaTensor.lab = {} 2 | 3 | local lab = torch.CudaTensor.lab 4 | 5 | function 
lab.randn(...) 6 | local t = torch.FloatTensor.lab.randn(...) 7 | return torch.Tensor(t:size()):copy(t) 8 | end 9 | 10 | -- local nn = torch.CudaTensor.nn 11 | 12 | -- function nn.LogSoftMax_forward(self, input) 13 | -- local t = torch.FloatTensor(input:size()):copy(input) 14 | -- self.output = torch.FloatTensor() 15 | -- return torch.FloatTensor.nn.LogSoftMax_forward(self, t) 16 | -- end 17 | 18 | -------------------------------------------------------------------------------- /torch7/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # HACKS TO USE OPENBLAS 4 | export LIBRARY_PATH=./lib:~/.VENV/base/lib:$LIBRARY_PATH 5 | export LD_LIBRARY_PATH=./lib:~/.VENV/base/lib:$LD_LIBRARY_PATH 6 | 7 | #-convfast use "fast" convolution code instead of standard [false] 8 | #-openmp use openmp *package* [false] 9 | #-double use doubles instead of floats [false] 10 | #-cuda use CUDA (float32 on the GPU) [false] 11 | #-batch batch size [1] 12 | #-gi compute gradInput [false] 13 | #-v be verbose [false] 14 | 15 | # this would use GEMM for convolution; Koray said this was not used 16 | # because it makes a huge unrolled matrix for large problems. 17 | USE_CONVFAST="" 18 | 19 | for batchsize in 1 10 100 ; do 20 | for PREC in 32 64 ; do 21 | if true ; then 22 | OUTPUT=run.sh.results_${HOSTNAME}_b${batchsize}_p${PREC} 23 | echo "Running normal" $OUTPUT 24 | echo "host=$HOSTNAME" > "$OUTPUT" 25 | echo "device=CPU" >> "$OUTPUT" 26 | echo "OpenMP=0" >> "$OUTPUT" 27 | echo "batch=$batchsize" >> "$OUTPUT" 28 | echo "precision=$PREC" >> "$OUTPUT" 29 | if [ $PREC = 32 ] ; then 30 | USE_DOUBLE="" 31 | else 32 | USE_DOUBLE="-double" 33 | fi 34 | 35 | ~/local/bin/lua benchmark.lua -batch $batchsize $USE_DOUBLE >> "$OUTPUT" 36 | fi 37 | 38 | if true ; then 39 | OUTPUT=run.sh.results_${HOSTNAME}_b${batchsize}_p${PREC}_openmp 40 | echo "Running OpenMP " $OUTPUT 41 | echo "host=$HOSTNAME" > "$OUTPUT" 42 | echo "device=CPU" >> "$OUTPUT" 43 | echo "OpenMP=1" >> "$OUTPUT" 44 | echo "batch=$batchsize" >> "$OUTPUT" 45 | echo "precision=$PREC" >> "$OUTPUT" 46 | if [ $PREC = 32 ] ; then 47 | USE_DOUBLE="" 48 | else 49 | USE_DOUBLE="-double" 50 | fi 51 | ~/local/bin/lua benchmark.lua -batch $batchsize $USE_DOUBLE -openmp >> "$OUTPUT" 52 | fi 53 | 54 | if true ; then 55 | OUTPUT=run.sh.results_${HOSTNAME}_b${batchsize}_p${PREC}_cuda 56 | echo "Running CUDA " $OUTPUT 57 | echo "host=$HOSTNAME" > "$OUTPUT" 58 | echo "device=GTX480" >> "$OUTPUT" 59 | echo "OpenMP=0" >> "$OUTPUT" 60 | echo "batch=$batchsize" >> "$OUTPUT" 61 | echo "precision=32" >> "$OUTPUT" 62 | # CUDA runs are always float32 (benchmark.lua rejects -double together with -cuda) 63 | ~/local/bin/lua benchmark.lua -batch $batchsize -cuda >> "$OUTPUT" 64 | fi 65 | done 66 | done 67 | 68 | 69 | --------------------------------------------------------------------------------
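A closing usage note: after add_to_db.py has run, each record in db.pkl is a plain dict. The following minimal sketch (it assumes only the fields that add_to_db.py above actually writes; the output format is illustrative) lists the stored timings:

    import cPickle

    db = cPickle.load(open('db.pkl'))
    for entry in db:
        # fields written by add_to_db.py: host, device, OpenMP,
        # batch, precision, problem, speed
        print '%s\t%s\tbatch=%s\t%.2f examples/s' % (
            entry['problem'], entry['device'], entry['batch'], entry['speed'])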