├── .gitignore ├── LICENSE.txt ├── README.md ├── README.txt ├── examples ├── cmp_debug.py ├── nips2011_dbn.py ├── nips2011_dbn_show_svg.py ├── nips2011_nnet.py └── plot_trials.py ├── hpnnet ├── __init__.py ├── nips2011.py ├── nips2011_dbn.py ├── nnet.py ├── orig_dbn.py ├── pylearn_pca.py ├── pyll_stubs.py ├── skdata_learning_algo.py └── tests │ ├── test_nips2011.py │ ├── test_nips2011_dbn.py │ ├── test_nnet.py │ └── test_pylearn_pca.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | 37 | 38 | #VIM 39 | *.swp 40 | *.swo 41 | 42 | *.log 43 | 44 | distribute* 45 | examples/*.pkl 46 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, James Bergstra 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | hyperopt-nnet 2 | ============= 3 | 4 | This package provides a 5 | [hyperopt](http://jaberg.github.io/hyperopt)-compatible neural network 6 | implementation. 7 | 8 | Currently, it can be used to tune neural network hyperparameters for data sets 9 | provided as [skdata](http://jaberg.github.io/skdata) protocols. 10 | 11 | See the `./examples` subdirectory for sample training scripts (e.g. `nips2011_nnet.py`) 12 | and a plotting script (`plot_trials.py`). 
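For a quick, single-process search (the bundled examples drive the same pieces through IPython-parallel trials instead), a minimal sketch along the lines of `examples/nips2011_nnet.py` might look like the following; the `rectangles` protocol is one of the skdata data sets used in the examples, and `eval_fn` expects `fmin` to pass the pyll expression, memo, and ctrl straight through:

```python
from functools import partial

from hyperopt import fmin, tpe, Trials
from hpnnet.nips2011 import nnet1_preproc_space
from hpnnet.skdata_learning_algo import eval_fn
from skdata.larochelle_etal_2007.view import RectanglesVectorXV

trials = Trials()
best = fmin(
    # -- bind the skdata protocol; eval_fn(expr, memo, ctrl, protocol_cls=...)
    fn=partial(eval_fn, protocol_cls=RectanglesVectorXV),
    space=nnet1_preproc_space(),
    algo=tpe.suggest,
    max_evals=25,
    trials=trials,
    # -- eval_fn consumes the raw (expr, memo, ctrl) triple rather than a dict
    pass_expr_memo_ctrl=True,
)
```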
13 | 14 | The `hpnnet.nips2011` file implements the search parameterization used in 15 | Bergstra, Bardenet, Bengio, and Kegl ("[Algorithms for Hyper-parameter 16 | Optimization](http://books.nips.cc/papers/files/nips24/NIPS2011_1385.pdf)") from NIPS 2011. 17 | 18 | 19 | Dependencies 20 | ------------ 21 | 22 | * NumPy 23 | * Sklearn 24 | * Theano 25 | * Skdata (github master, not PyPI) 26 | * Hyperopt (github master, not PyPI) 27 | * matplotlib (for plotting) 28 | * IPython (for parallel search, option 1) 29 | * MongoDB (for parallel search, option 2) 30 | 31 | 32 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | 2 | Hyperopt-NNet 3 | 4 | -------------------------------------------------------------------------------- /examples/cmp_debug.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class ObjectId(object): 4 | def __init__(self, _id): 5 | self._id = _id 6 | 7 | class ISODate(object): 8 | def __init__(self, datestring): 9 | self.datestring = datestring 10 | 11 | config1 = { "_id" : ObjectId("4cd9e13e8a077c67400000d6"), "argd" : { "pca_energy" : 12 | 0.99, "preprocessing" : "pca", "W_init_algo" : "old", 13 | "W_init_algo_old_multiplier" : 1.121444322228466, "dataset_name" : 14 | "rectangles", "sup_max_epochs" : 4000, "sup_min_epochs" : 300, "squash" : 15 | "sigmoid", "iseed" : 5, "batchsize" : 100, "n_hid" : 61, "lr" : 16 | 0.003537415031816308, "lr_anneal_start" : 13768, "l2_penalty" : 17 | 0.000015964911748563186 }, "book_time" : 18 | ISODate("2010-11-10T00:04:56.467Z"), "cmd" : "nnet_fn_1", "owner" : 19 | "brams0a.iro.umontreal.ca:13364", "refresh_time" : 20 | ISODate("2010-11-10T00:05:15.983Z"), "result" : { "best_epoch_test" : 21 | 0.5289400000000007, "best_epoch" : 127, "best_epoch_valid" : 22 | 0.5700000000000001, "version" : 1, "best_epoch_train" : 23 | 0.7132604202506101 }, "status" : 2, "version" : 605 } 24 | 25 | 26 | config2 = { "_id" : ObjectId("4cd9e13e8a077c6740000021"), "argd" : { "pca_energy" : 27 | 0.99, "preprocessing" : "raw", "W_init_algo" : "old", 28 | "W_init_algo_old_multiplier" : 0.9229159592523637, "dataset_name" : 29 | "rectangles", "sup_max_epochs" : 4000, "sup_min_epochs" : 300, 30 | "squash" : "sigmoid", "iseed" : 5, "batchsize" : 20, "n_hid" : 917, 31 | "lr" : 6.610082599376332, "lr_anneal_start" : 21782, "l2_penalty" : 32 | 0.000017125453152969375 }, "book_time" : 33 | ISODate("2010-11-10T00:11:47.329Z"), "cmd" : "nnet_fn_1", "owner" : 34 | "maggie26.iro.umontreal.ca:544", "refresh_time" : 35 | ISODate("2010-11-10T00:20:21.452Z"), "result" : { "best_epoch_test" : 36 | 0.93328, "best_epoch" : 232, "best_epoch_valid" : 0.965, "version" 37 | : 1, "best_epoch_train" : 0.0009210755676031113 }, "status" : 2, 38 | "version" : 933 } 39 | 40 | import hyperopt.pyll 41 | from hyperopt.pyll_utils import expr_to_config 42 | from hpnnet.nips2011 import nnet1_preproc_space 43 | 44 | from hpnnet.skdata_learning_algo import eval_fn 45 | from skdata.larochelle_etal_2007.view import RectanglesVectorXV 46 | 47 | def run_config(config): 48 | argd = config['argd'] 49 | def config_lookup(key): 50 | if key == 'scale_mult1': 51 | return argd['W_init_algo_old_multiplier'] 52 | 53 | if key == 'scale_heur1': 54 | if 'old' == argd['W_init_algo']: 55 | return 0 56 | else: 57 | assert 'Xavier' == argd['W_init_algo'] 58 | return 1 59 | 60 | if key == 'preproc': 61 | return {'raw': 0, 'normalize': 1, 
'pca': 2}[ 62 | argd['preprocessing']] 63 | 64 | if key == 'batch_size': 65 | return 0 if 20 == argd['batchsize'] else 1 66 | 67 | if key == 'nhid1': 68 | return argd['n_hid'] 69 | 70 | if key == 'dist1': 71 | return 0 if argd['W_init_algo'] == 'old' else 1 72 | 73 | if key == 'squash': 74 | return 0 if argd['squash'] == 'tanh' else 1 75 | 76 | if key == 'colnorm_thresh': 77 | return 1e-7 78 | 79 | if key == 'l2_penalty_nz': 80 | return argd['l2_penalty'] 81 | 82 | if key == 'l2_penalty': 83 | return 0 if argd['l2_penalty'] == 0 else 1 84 | 85 | if key == 'iseed': 86 | # convert from seed value to choice index 87 | return argd['iseed'] - 5 88 | 89 | try: 90 | return argd[key] 91 | except KeyError: 92 | print 'Returning GarbageCollected for %s' % key 93 | return hyperopt.pyll.base.GarbageCollected 94 | 95 | expr = nnet1_preproc_space() 96 | hps = {} 97 | expr_to_config(expr, None, hps) 98 | print config 99 | memo = {} 100 | for k, v in hps.items(): 101 | #print k, v 102 | memo[v['node']] = config_lookup(k) 103 | 104 | print memo 105 | rval = eval_fn( 106 | expr=expr, 107 | memo=memo, 108 | ctrl=None, 109 | protocol_cls=RectanglesVectorXV) 110 | print '-' * 80 111 | print 'COMPUTED RESULTS IN TERMS OF *ERROR*' 112 | print rval['loss'] 113 | print '-' * 80 114 | print 'SAVED RESULTS IN TERMS OF *ACCURACY*' 115 | print config['result'] 116 | print '-' * 80 117 | 118 | 119 | if __name__ == '__main__': 120 | sys.exit(run_config(config1)) 121 | #sys.exit(run_config(config2)) 122 | 123 | -------------------------------------------------------------------------------- /examples/nips2011_dbn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Evaluating one-layer neural networks on data sets from skdata. 4 | 5 | Run this script like e.g. 6 | 7 | $ ipcluster start # in shell 1 8 | $ ./nips2011_nnet1.py rectangles # in shell 2 9 | 10 | This will conduct an ipython-based parallel search for the best 11 | neural network for the "rectangles" data set, and store the results 12 | as a Trials object called "iptrials_rectangles.pkl". 
You can plot 13 | the results by typing for example: 14 | 15 | $ ./plot_trials.py iptrials_rectangles.pkl 16 | 17 | """ 18 | 19 | __author__ = "James Bergstra" 20 | __license__ = "BSD-3" 21 | 22 | import cPickle 23 | from functools import partial 24 | import sys 25 | 26 | from IPython.parallel import Client 27 | from hyperopt import tpe 28 | from hyperopt.ipy import IPythonTrials 29 | from hpnnet.skdata_learning_algo import eval_fn 30 | from hpnnet.nips2011_dbn import preproc_space 31 | 32 | def get_iptrials(filename): 33 | client = Client() 34 | try: 35 | iptrials = cPickle.load(open(filename)) 36 | iptrials._client = client 37 | except IOError: 38 | iptrials = IPythonTrials(client) 39 | except (EOFError, cPickle.PickleError): 40 | print "ERROR: unpickling FAILED" 41 | iptrials = IPythonTrials(client) 42 | return iptrials 43 | 44 | 45 | def main_rectangles(filename='iptrials_rectangles_dbn.pkl'): 46 | from skdata.larochelle_etal_2007.view import RectanglesVectorXV 47 | iptrials = get_iptrials(filename) 48 | 49 | rectangles_eval_fn = partial(eval_fn, 50 | protocol_cls=RectanglesVectorXV) 51 | 52 | for max_evals in [10, 25, 50]: 53 | iptrials.fmin( 54 | fn=rectangles_eval_fn, 55 | space=preproc_space(), 56 | algo=tpe.suggest, 57 | max_evals=max_evals, 58 | verbose=1, 59 | pass_expr_memo_ctrl=True, 60 | ) 61 | iptrials.wait() 62 | iptrials.refresh() 63 | ofile = open(filename, 'w') 64 | cPickle.dump(iptrials, ofile) 65 | ofile.close() 66 | 67 | 68 | def main_MRBI(filename='iptrials_MRBI_dbn.pkl'): 69 | from skdata.larochelle_etal_2007.view \ 70 | import MNIST_RotatedBackgroundImages_VectorXV as Protocol 71 | iptrials = get_iptrials(filename) 72 | 73 | dataset_eval_fn = partial(eval_fn, protocol_cls=Protocol) 74 | 75 | for max_evals in range(20, 100, 200): 76 | iptrials.fmin( 77 | fn=dataset_eval_fn, 78 | space=preproc_space(), 79 | algo=tpe.suggest, 80 | max_evals=max_evals, 81 | verbose=1, 82 | pass_expr_memo_ctrl=True, 83 | ) 84 | iptrials.wait() 85 | iptrials.refresh() 86 | ofile = open(filename, 'w') 87 | cPickle.dump(iptrials, ofile) 88 | ofile.close() 89 | 90 | 91 | def main_convex(filename='iptrials_convex_dbn.pkl'): 92 | from skdata.larochelle_etal_2007.view import ConvexVectorXV as Protocol 93 | iptrials = get_iptrials(filename) 94 | 95 | dataset_eval_fn = partial(eval_fn, 96 | protocol_cls=Protocol) 97 | 98 | for max_evals in range(10, 50, 10): 99 | iptrials.fmin( 100 | fn=dataset_eval_fn, 101 | space=preproc_space(), 102 | algo=tpe.suggest, 103 | max_evals=max_evals, 104 | verbose=1, 105 | pass_expr_memo_ctrl=True, 106 | ) 107 | iptrials.wait() 108 | iptrials.refresh() 109 | ofile = open(filename, 'w') 110 | cPickle.dump(iptrials, ofile) 111 | ofile.close() 112 | 113 | 114 | def main(): 115 | cmd = 'main_' + sys.argv[1] 116 | main_fn = globals()[cmd] 117 | return main_fn(*sys.argv[2:]) 118 | 119 | if __name__ == '__main__': 120 | sys.exit(main()) 121 | 122 | -------------------------------------------------------------------------------- /examples/nips2011_dbn_show_svg.py: -------------------------------------------------------------------------------- 1 | """ 2 | To see the relationship between hyperparameters in the nips2011_dbn space: 3 | 4 | python nips2011_dbn_show_svg.py && dot -Tpng dbn.dot > dbn.png && eog dbn.png 5 | 6 | """ 7 | 8 | from hpnnet.nips2011_dbn import preproc_space 9 | from hyperopt.graphviz import dot_hyperparameters 10 | open('dbn.dot', 'wb').write(dot_hyperparameters(preproc_space())) 11 | 12 | 
-------------------------------------------------------------------------------- /examples/nips2011_nnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Evaluating one-layer neural networks on data sets from skdata. 4 | 5 | Run this script like e.g. 6 | 7 | $ ipcluster start --n=2 # in shell 1 8 | $ ./nips2011_nnet1.py rectangles # in shell 2 9 | 10 | This will conduct an ipython-based parallel search for the best 11 | neural network for the "rectangles" data set, and store the results 12 | as a Trials object called "iptrials_rectangles.pkl". You can plot 13 | the results by typing for example: 14 | 15 | $ ./plot_trials.py iptrials_rectangles.pkl 16 | 17 | """ 18 | 19 | __author__ = "James Bergstra" 20 | __license__ = "BSD-3" 21 | 22 | import cPickle 23 | from functools import partial 24 | import sys 25 | 26 | from IPython.parallel import Client 27 | from hyperopt import tpe 28 | from hyperopt import rand 29 | from hyperopt.ipy import IPythonTrials 30 | from hpnnet.skdata_learning_algo import eval_fn 31 | from hpnnet.nips2011 import nnet1_preproc_space 32 | 33 | def get_iptrials(filename): 34 | client = Client() 35 | try: 36 | iptrials = cPickle.load(open(filename)) 37 | iptrials._client = client 38 | except IOError: 39 | iptrials = IPythonTrials(client) 40 | except (EOFError, cPickle.PickleError): 41 | print "ERROR: unpickling FAILED" 42 | iptrials = IPythonTrials(client) 43 | return iptrials 44 | 45 | 46 | def main_rectangles(filename='iptrials_rectangles.pkl'): 47 | from skdata.larochelle_etal_2007.view import RectanglesVectorXV 48 | iptrials = get_iptrials(filename) 49 | 50 | rectangles_eval_fn = partial(eval_fn, 51 | protocol_cls=RectanglesVectorXV) 52 | 53 | for max_evals in [10, 25, 50]: 54 | iptrials.fmin( 55 | fn=rectangles_eval_fn, 56 | space=nnet1_preproc_space(), 57 | algo=rand.suggest, 58 | max_evals=max_evals, 59 | verbose=1, 60 | pass_expr_memo_ctrl=True, 61 | ) 62 | iptrials.wait() 63 | iptrials.refresh() 64 | ofile = open(filename, 'w') 65 | cPickle.dump(iptrials, ofile) 66 | ofile.close() 67 | 68 | 69 | def main_MRBI(filename='iptrials_MRBI.pkl'): 70 | from skdata.larochelle_etal_2007.view \ 71 | import MNIST_RotatedBackgroundImages_VectorXV as Protocol 72 | iptrials = get_iptrials(filename) 73 | 74 | dataset_eval_fn = partial(eval_fn, protocol_cls=Protocol) 75 | 76 | for max_evals in range(20, 100, 200): 77 | iptrials.fmin( 78 | fn=dataset_eval_fn, 79 | space=nnet1_preproc_space(), 80 | algo=tpe.suggest, 81 | max_evals=max_evals, 82 | verbose=1, 83 | pass_expr_memo_ctrl=True, 84 | ) 85 | iptrials.wait() 86 | iptrials.refresh() 87 | ofile = open(filename, 'w') 88 | cPickle.dump(iptrials, ofile) 89 | ofile.close() 90 | 91 | 92 | def main_convex(filename='iptrials_convex.pkl'): 93 | from skdata.larochelle_etal_2007.view import ConvexVectorXV as Protocol 94 | iptrials = get_iptrials(filename) 95 | 96 | dataset_eval_fn = partial(eval_fn, 97 | protocol_cls=Protocol) 98 | 99 | for max_evals in range(10, 50, 10): 100 | iptrials.fmin( 101 | fn=dataset_eval_fn, 102 | space=nnet1_preproc_space(), 103 | algo=tpe.suggest, 104 | max_evals=max_evals, 105 | verbose=1, 106 | pass_expr_memo_ctrl=True, 107 | ) 108 | iptrials.wait() 109 | iptrials.refresh() 110 | ofile = open(filename, 'w') 111 | cPickle.dump(iptrials, ofile) 112 | ofile.close() 113 | 114 | 115 | def main(): 116 | cmd = 'main_' + sys.argv[1] 117 | main_fn = globals()[cmd] 118 | return main_fn(*sys.argv[2:]) 119 | 120 | if __name__ == '__main__': 121 
| sys.exit(main()) 122 | -------------------------------------------------------------------------------- /examples/plot_trials.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import cPickle 3 | import sys 4 | from hyperopt.plotting import main_plot_history 5 | 6 | trials = cPickle.load(open(sys.argv[1])) 7 | 8 | main_plot_history(trials) 9 | 10 | -------------------------------------------------------------------------------- /hpnnet/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | try: 3 | import os 4 | import psutil 5 | p = psutil.Process(os.getpid()) 6 | all_cpus = list(range(psutil.NUM_CPUS)) 7 | if p.get_cpu_affinity() != all_cpus: 8 | print 'Setting CPU AFFINITY to %s' % str(all_cpus) 9 | p.set_cpu_affinity(all_cpus) 10 | except: 11 | pass 12 | -------------------------------------------------------------------------------- /hpnnet/nips2011.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural Network (NNet) and Deep Belief Network (DBN) search spaces used in [1] 3 | and [2]. 4 | 5 | The functions in this file return pyll graphs that can be used as the `space` 6 | argument to e.g. `hyperopt.fmin`. The pyll graphs include hyperparameter 7 | constructs (e.g. `hyperopt.hp.uniform`) so `hyperopt.fmin` can perform 8 | hyperparameter optimization. 9 | 10 | See ./skdata_learning_algo.py for example usage of these functions. 11 | 12 | 13 | [1] Bergstra, J., Bardenet, R., Bengio, Y., Kegl, B. (2011). Algorithms 14 | for Hyper-parameter optimization, NIPS 2011. 15 | 16 | [2] Bergstra, J., Bengio, Y. (2012). Random Search for Hyper-Parameter 17 | Optimization, JMLR 13:281--305. 18 | 19 | """ 20 | 21 | __author__ = "James Bergstra" 22 | __license__ = "BSD-3" 23 | 24 | import numpy as np 25 | 26 | from hyperopt.pyll import scope 27 | from hyperopt import hp 28 | 29 | import pyll_stubs 30 | import nnet # -- load scope with nnet symbols 31 | 32 | 33 | def nnet1_preproc_space(sup_min_epochs=300, sup_max_epochs=2000, 34 | max_seconds=60 * 60): 35 | """ 36 | Return a hyperopt-compatible pyll expression for a trained neural network. 37 | 38 | The trained neural network will have one hidden layer, and may 39 | have an affine first layer that does column normalization or PCA 40 | pre-processing. 41 | 42 | The training program is built using stub literals `pyll_stubs.train_task` 43 | and `pyll_stubs.valid_task`. When evaluating the pyll program, these 44 | literals must be replaced with skdata Task objects with 45 | `vector_classification` semantics. See `skdata_learning_algo.py` for how 46 | to use the `use_obj_for_literal_in_memo` function to swap live Task 47 | objects in for these stubs. 48 | 49 | The search space described by this function corresponds to the one-layer 50 | neural network with pre-processing used in [1] and [2]. 
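    When `hyperopt.fmin` is run with `pass_expr_memo_ctrl=True`, the
    objective receives this expression (`expr`), a memo binding the
    hyperparameter nodes, and a ctrl object. A rough sketch of what the
    objective then does (the live `train_task` / `valid_task` objects are
    assumed to come from an skdata protocol; `skdata_learning_algo.eval_fn`
    is the real implementation):

        from hyperopt import pyll
        from hyperopt.base import use_obj_for_literal_in_memo
        import pyll_stubs

        memo = dict(memo)  # -- hyperparameter nodes already bound by fmin
        use_obj_for_literal_in_memo(expr, train_task, pyll_stubs.train_task, memo)
        use_obj_for_literal_in_memo(expr, valid_task, pyll_stubs.valid_task, memo)
        trained_nnet, report = pyll.rec_eval(expr, memo=memo)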
51 | 52 | """ 53 | time_limit = scope.time() + max_seconds 54 | 55 | train_task_x = scope.getattr(pyll_stubs.train_task, 'x') 56 | nnet0 = scope.NNet([], n_out=scope.getattr(train_task_x, 'shape')[1]) 57 | nnet1 = hp.choice('preproc', 58 | [ 59 | # -- raw XXX set up something for n_in arg of hidden layer 60 | nnet0, 61 | # -- normalize 62 | scope.nnet_add_layer( 63 | nnet0, 64 | scope.column_normalize_layer( 65 | train_task_x, 66 | std_thresh=hp.loguniform('colnorm_thresh', 67 | np.log(1e-9), 68 | np.log(1e-3)))), 69 | # -- pca (with bias to throw away a lot) 70 | scope.nnet_add_layer( 71 | nnet0, 72 | scope.pca_layer( 73 | train_task_x, 74 | energy=hp.uniform('pca_energy', .5, 1), 75 | eps=1e-14)), 76 | ]) 77 | hidden_layer = scope.random_sigmoid_layer( 78 | n_in=scope.getattr(nnet1, 'n_out'), 79 | n_out=hp.qloguniform( 80 | 'nhid1', np.log(16), np.log(1024), q=16), 81 | dist=hp.choice('dist1', ['uniform', 'normal']), 82 | scale_heuristic=hp.choice('scale_heur1', [ 83 | ('old', hp.uniform('scale_mult1', .2, 2)), 84 | ('Glorot', )]), 85 | seed=hp.choice('iseed', [5, 6, 7, 8]), 86 | squash=hp.choice('squash', ['tanh', 'logistic']), 87 | ) 88 | nnet2 = scope.nnet_add_layer(nnet1, hidden_layer) 89 | nnet3 = scope.nnet_add_layer( 90 | nnet2, 91 | scope.zero_softmax_layer( 92 | n_in=scope.getattr(nnet2, 'n_out'), 93 | n_out=scope.getattr(pyll_stubs.train_task, 'n_classes'))) 94 | 95 | nnet4 = scope.nnet_sgd_finetune_classifier( 96 | nnet3, 97 | pyll_stubs.train_task, 98 | pyll_stubs.valid_task, 99 | fixed_nnet=nnet1, # -- don't fine-tune this first part of nnet3 100 | max_epochs=sup_max_epochs, 101 | min_epochs=sup_min_epochs, 102 | batch_size=hp.choice('batch_size', [20, 100]), 103 | lr=hp.lognormal('lr', np.log(.01), 3.), 104 | lr_anneal_start=hp.qloguniform( 105 | 'lr_anneal_start', np.log(100), np.log(10000), q=1), 106 | l2_penalty=hp.choice('l2_penalty', [ 107 | 0, 108 | hp.lognormal('l2_penalty_nz', np.log(1.0e-6), 2.)]), 109 | time_limit=time_limit, 110 | ) 111 | 112 | return nnet4 113 | 114 | 115 | -------------------------------------------------------------------------------- /hpnnet/nips2011_dbn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Belief Network (DBN) search spaces used in [1] and [2]. 3 | 4 | The functions in this file return pyll graphs that can be used as the `space` 5 | argument to e.g. `hyperopt.fmin`. The pyll graphs include hyperparameter 6 | constructs (e.g. `hyperopt.hp.uniform`) so `hyperopt.fmin` can perform 7 | hyperparameter optimization. 8 | 9 | See ./skdata_learning_algo.py for example usage of these functions. 10 | 11 | 12 | [1] Bergstra, J., Bardenet, R., Bengio, Y., Kegl, B. (2011). Algorithms 13 | for Hyper-parameter optimization, NIPS 2011. 14 | 15 | [2] Bergstra, J., Bengio, Y. (2012). Random Search for Hyper-Parameter 16 | Optimization, JMLR 13:281--305. 17 | 18 | """ 19 | 20 | __author__ = "James Bergstra" 21 | __license__ = "BSD-3" 22 | 23 | import numpy as np 24 | 25 | from hyperopt.pyll import scope 26 | from hyperopt import hp 27 | 28 | import pyll_stubs 29 | import nnet # -- load scope with nnet symbols 30 | 31 | 32 | def preproc_space( 33 | sup_min_epochs=300, 34 | sup_max_epochs=2000, 35 | max_seconds=60 * 60, 36 | ): 37 | """ 38 | Return a hyperopt-compatible pyll expression for a trained neural network. 39 | 40 | The trained neural network will have 0, 1, 2, or 3 hidden layers, and may 41 | have an affine first layer that does column normalization or PCA 42 | pre-processing. 
43 | 44 | Each layer of the network will be pre-trained by some amount of 45 | contrastive divergence before being fine-tuning by SGD. 46 | 47 | The training program is built using stub literals `pyll_stubs.train_task` 48 | and `pyll_stubs.valid_task`. When evaluating the pyll program, these 49 | literals must be replaced with skdata Task objects with 50 | `vector_classification` semantics. See `skdata_learning_algo.py` for how 51 | to use the `use_obj_for_literal_in_memo` function to swap live Task 52 | objects in for these stubs. 53 | 54 | The search space described by this function corresponds to the DBN model 55 | used in [1] and [2]. 56 | 57 | """ 58 | 59 | train_task_x = scope.getattr(pyll_stubs.train_task, 'x') 60 | nnet0 = scope.NNet([], n_out=scope.getattr(train_task_x, 'shape')[1]) 61 | nnet1 = hp.choice('preproc', 62 | [ 63 | nnet0, # -- raw data 64 | scope.nnet_add_layers( # -- ZCA of data 65 | nnet0, 66 | scope.zca_layer( 67 | train_task_x, 68 | energy=hp.uniform('pca_energy', .5, 1), 69 | eps=1e-14, 70 | )), 71 | ]) 72 | 73 | param_seed = hp.choice('iseed', [5, 6, 7, 8]) 74 | 75 | time_limit = scope.time() + max_seconds 76 | 77 | nnets = [nnet1] 78 | nnet_i_pt = nnet1 79 | for ii, cd_epochs_max in enumerate([3000, 2000, 1500]): 80 | layer = scope.random_sigmoid_layer( 81 | # -- hack to get different seeds for dif't layers 82 | seed=param_seed + cd_epochs_max, 83 | n_in=scope.getattr(nnet_i_pt, 'n_out'), 84 | n_out=hp.qloguniform('n_hid_%i' % ii, 85 | np.log(2**7), 86 | np.log(2**12), 87 | q=16), 88 | dist=hp.choice('W_idist_%i' % ii, ['uniform', 'normal']), 89 | scale_heuristic=hp.choice( 90 | 'W_ialgo_%i' % ii, [ 91 | ('old', hp.lognormal('W_imult_%i' % ii, 0, 1)), 92 | ('Glorot',)]), 93 | squash='logistic', 94 | ) 95 | nnet_i_raw = scope.nnet_add_layer(nnet_i_pt, layer) 96 | # -- repeatedly calculating lower-layers wastes some CPU, but keeps 97 | # memory usage much more stable across jobs (good for cluster) 98 | # and the wasted CPU is not so much overall. 99 | nnet_i_pt = scope.nnet_pretrain_top_layer_cd( 100 | nnet_i_raw, 101 | train_task_x, 102 | lr=hp.lognormal('cd_lr_%i' % ii, np.log(.01), 2), 103 | seed=1 + hp.randint('cd_seed_%i' % ii, 10), 104 | n_epochs=hp.qloguniform('cd_epochs_%i' % ii, 105 | np.log(1), 106 | np.log(cd_epochs_max), 107 | q=1), 108 | # -- for whatever reason (?), this was fixed at 100 109 | batchsize=100, 110 | sample_v0s=hp.choice('sample_v0s_%i' % ii, [False, True]), 111 | lr_anneal_start=hp.qloguniform('lr_anneal_%i' % ii, 112 | np.log(10), 113 | np.log(10000), 114 | q=1), 115 | time_limit=time_limit, 116 | ) 117 | nnets.append(nnet_i_pt) 118 | 119 | # this prior is not what I would do now, but it is what I did then... 
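    # -- hp.pchoice puts an explicit prior over depth: probability 0.5 of
    #    using no hidden layer (pre-processing only), 0.25 of one hidden
    #    layer, and 0.125 each of two or three pre-trained hidden layers.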
120 | nnet_features = hp.pchoice( 121 | 'depth', 122 | [(.5, nnets[0]), 123 | (.25, nnets[1]), 124 | (.125, nnets[2]), 125 | (.125, nnets[3])]) 126 | 127 | sup_nnet = scope.nnet_add_layer( 128 | nnet_features, 129 | scope.zero_softmax_layer( 130 | n_in=scope.getattr(nnet_features, 'n_out'), 131 | n_out=scope.getattr(pyll_stubs.train_task, 'n_classes'))) 132 | 133 | 134 | nnet4, report = scope.nnet_sgd_finetune_classifier( 135 | sup_nnet, 136 | pyll_stubs.train_task, 137 | pyll_stubs.valid_task, 138 | fixed_nnet=nnet1, 139 | max_epochs=sup_max_epochs, 140 | min_epochs=sup_min_epochs, 141 | batch_size=hp.choice('batch_size', [20, 100]), 142 | lr=hp.lognormal('lr', np.log(.01), 3.), 143 | lr_anneal_start=hp.qloguniform( 144 | 'lr_anneal_start', 145 | np.log(100), 146 | np.log(10000), 147 | q=1), 148 | l2_penalty=hp.choice('l2_penalty', [ 149 | 0, 150 | hp.lognormal('l2_penalty_nz', np.log(1.0e-6), 2.)]), 151 | time_limit=time_limit, 152 | ) 153 | 154 | return nnet4, report 155 | 156 | 157 | -------------------------------------------------------------------------------- /hpnnet/nnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Training and construction routines for neural networks. 3 | 4 | """ 5 | 6 | __author__ = "James Bergstra" 7 | __license__ = "BSD-3" 8 | 9 | import copy 10 | import time as time_module 11 | import numpy as np 12 | import theano 13 | import theano.tensor as TT 14 | try: 15 | # -- TODO: only import this if we intend to use Theano's GPU codegen 16 | raise ImportError() 17 | import theano.sandbox.cuda.rng_curand 18 | RandomStreams = theano.sandbox.cuda.rng_curand.CURAND_RandomStreams 19 | except ImportError: 20 | try: 21 | import theano.sandbox.rng_mrg 22 | RandomStreams = theano.sandbox.rng_mrg.MRG_RandomStreams 23 | except ImportError: 24 | RandomStreams = TT.shared_randomstreams.RandomStreams 25 | 26 | from hyperopt.pyll import scope 27 | 28 | 29 | _x = theano.tensor.dmatrix() 30 | _v = theano.tensor.dmatrix() 31 | softmax = theano.function([_x], theano.tensor.nnet.softmax(_x)) 32 | 33 | dot_mm = theano.function([_x, _v], theano.tensor.dot(_x, _v)) 34 | 35 | def np_dot(a, b): 36 | return dot_mm(a, b) 37 | 38 | 39 | class DivergenceError(Exception): 40 | """An iterative numerical algorithm diverged (step size too large). 41 | """ 42 | 43 | 44 | # TODO: move this to hyperopt.pyll 45 | @scope.define 46 | def time(): 47 | return time_module.time() 48 | 49 | 50 | @scope.define 51 | class NNet(object): 52 | def __init__(self, layers, n_out=None): 53 | self.layers = list(layers) 54 | self._n_out = n_out 55 | 56 | @property 57 | def n_out(self): 58 | if not self.layers: 59 | if self._n_out is None: 60 | raise IndexError('n_out: no layers') 61 | else: 62 | return self._n_out 63 | return self.layers[-1].n_out 64 | 65 | @property 66 | def n_in(self): 67 | if not self.layers: 68 | raise IndexError('n_in: no layers') 69 | return self.layers[0].n_in 70 | 71 | def predict(self, X, chunk=256): 72 | preds = [] 73 | for i in range(0, len(X), chunk): 74 | t0 = time_module.time() 75 | Xi = X[i: i + chunk] 76 | for layer in self.layers: 77 | Xi = layer(Xi) 78 | preds.extend(np.argmax(Xi, axis=1)) 79 | t1 = time_module.time() 80 | if t1 - t0 > .1: 81 | print 'WARNING: predicting single chunk took', (t1 - t0) 82 | print 'ETA = %s' % ((len(X) - i) / 256. 
* (t1 - t0)) 83 | assert len(preds) == len(X), (len(preds), len(X)) 84 | return preds 85 | 86 | 87 | class Layer(object): 88 | def __init__(self, W, b): 89 | self.W = W 90 | self.b = b 91 | 92 | @property 93 | def n_out(self): 94 | return self.W.shape[1] 95 | 96 | @property 97 | def n_in(self): 98 | return self.W.shape[0] 99 | 100 | 101 | class AffineLayer(Layer): 102 | def __call__(self, X): 103 | return np_dot(X, self.W) + self.b 104 | 105 | def theano_compute(self, X, W, b): 106 | return TT.dot(X, W) + b 107 | 108 | 109 | class AffineLayerPre(Layer): 110 | def __call__(self, X): 111 | return np_dot(X + self.b, self.W) 112 | 113 | def theano_compute(self, X, W, b): 114 | return TT.dot(X + b, W) 115 | 116 | 117 | class AffineElemwiseLayer(Layer): 118 | def __call__(self, X): 119 | return X * self.W + self.b 120 | 121 | def theano_compute(self, X, W, b): 122 | return X * W + b 123 | 124 | @property 125 | def n_in(self): 126 | assert self.W.shape[0] == 1 127 | return (self.W + self.b).shape[1] 128 | 129 | 130 | class LogisticLayer(Layer): 131 | def __call__(self, X): 132 | return 1. / (1. + np.exp(-np_dot(X, self.W) - self.b)) 133 | 134 | def theano_compute(self, X, W, b): 135 | return 1. / (1. + TT.exp(-TT.dot(X, W) - b)) 136 | 137 | 138 | class SoftmaxLayer(Layer): 139 | def __call__(self, X): 140 | return softmax(np_dot(X, self.W) + self.b) 141 | 142 | def theano_compute(self, X, W, b): 143 | return TT.nnet.softmax(TT.dot(X, W) + b) 144 | 145 | 146 | class TanhLayer(Layer): 147 | def __call__(self, X): 148 | return np.tanh(np_dot(X, self.W) + self.b) 149 | 150 | def theano_compute(self, X, W, b): 151 | return TT.tanh(TT.dot(X, W) + b) 152 | 153 | 154 | class ClipLayer(Layer): 155 | def __call__(self, X): 156 | tmp = np_dot(X, self.W) + self.b 157 | return np.clip(tmp, 0, 1) 158 | 159 | def theano_compute(self, X, W, b): 160 | tmp = theano.dot(X, self.W) + self.b 161 | rval = TT.clip(tmp, 0, 1) 162 | assert rval.dtype == X.dtype 163 | return rval 164 | 165 | 166 | 167 | @scope.define 168 | def layer_transform(layer, X): 169 | return layer(X) 170 | 171 | 172 | @scope.define 173 | def nnet_transform(nnet, X): 174 | for layer in nnet.layers: 175 | X = layer(X) 176 | return X 177 | 178 | 179 | @scope.define 180 | def nnet_add_layer(nnet, layer): 181 | return NNet(nnet.layers + [layer]) 182 | 183 | @scope.define 184 | def nnet_add_layers(nnet, layers): 185 | return NNet(nnet.layers + list(layers)) 186 | 187 | 188 | @scope.define 189 | def pca_layer(X, energy, eps): 190 | import pylearn_pca 191 | (eigvals, eigvecs), centered_trainset = pylearn_pca.pca_from_examples( 192 | X=X, 193 | max_energy_fraction=energy) 194 | centering_offset = centered_trainset[0] - X[0] 195 | 196 | W = eigvecs / np.sqrt(eigvals + eps) 197 | print('PCA kept %i of %i components' % (W.shape[1], X.shape[1])) 198 | return AffineLayerPre( 199 | W.astype(X.dtype), 200 | centering_offset.astype(X.dtype)) 201 | 202 | 203 | @scope.define 204 | def zca_layer(X, energy, eps): 205 | """ 206 | Return a pair of layers whose output when filtering X will be X's ZCA. 207 | 208 | energy - retain at least this much energy with the principle components 209 | eps - add this to the eigenvalues when computing PCA responses to prevent 210 | division-by-zero and suppress weak components in the PCA 211 | representation. 
212 | """ 213 | import pylearn_pca 214 | (eigvals, eigvecs), centered_trainset = pylearn_pca.pca_from_examples( 215 | X=X, 216 | max_energy_fraction=energy) 217 | 218 | centering_offset = centered_trainset[0] - X[0] 219 | W = eigvecs / np.sqrt(eigvals + eps) 220 | print('ZCA kept %i of %i components' % (W.shape[1], X.shape[1])) 221 | l0 = AffineLayerPre(W.astype(X.dtype), centering_offset.astype(X.dtype)) 222 | l1 = ClipLayer(eigvecs.T.copy().astype(X.dtype), np.asarray(0, dtype=X.dtype)) 223 | return [l0, l1] 224 | 225 | 226 | @scope.define 227 | def column_normalize_layer(X, std_thresh): 228 | mean = np.mean(X, axis=0).reshape((1, X.shape[1])) 229 | std = np.std(X, axis=0).reshape((1, X.shape[1])) 230 | return AffineElemwiseLayer( 231 | W=1. / (std + std_thresh), 232 | b=-mean) 233 | 234 | 235 | @scope.define 236 | def nnet_pretrain_top_layer_cd(nnet, 237 | X, 238 | lr, 239 | n_epochs, 240 | seed, 241 | batchsize, 242 | sample_v0s, 243 | lr_anneal_start, 244 | time_limit=None): 245 | """ 246 | Return a new pre-trained version of Layer, trained by contrastive 247 | divergence. This is not stochastic maximum-likelihood or persistive CD, 248 | this is the so-called "CD-1" algorithm. 249 | """ 250 | dtype = str(X.dtype) 251 | s_rng = RandomStreams(int(seed)) 252 | s_features = theano.shared(X, borrow=True) 253 | s_batchsize = TT.as_tensor_variable(batchsize) 254 | s_idx = TT.lscalar() 255 | s_lr = theano.shared(np.asarray(lr, dtype=X.dtype)) 256 | if not nnet.layers: 257 | raise ValueError('nnet_pretrain_top_layer_cd:' 258 | ' at least one layer required') 259 | v0m = s_features[s_idx * s_batchsize: (s_idx + 1) * s_batchsize] 260 | # -- filter features through lowermost layers 261 | for layer in nnet.layers[:-1]: 262 | s_W = theano.shared(layer.W) 263 | s_b = theano.shared(layer.b) 264 | tmp = layer.theano_compute(v0m, s_W, s_b) 265 | assert tmp.dtype == v0m.dtype, layer 266 | v0m = tmp 267 | 268 | # -- start CD on top layer 269 | if not isinstance(nnet.layers[-1], LogisticLayer): 270 | raise TypeError('CD pretraining only works for' 271 | ' nnets with Logistic top layer') 272 | n_in = nnet.layers[-1].n_in 273 | n_out = nnet.layers[-1].n_out 274 | print('rbm training n_in=%i n_out=%i batchsize=%i' % ( 275 | n_in, n_out, batchsize)) 276 | s_W = theano.shared(nnet.layers[-1].W) 277 | s_b = theano.shared(nnet.layers[-1].b) 278 | s_a = theano.shared(np.zeros(n_in, dtype=s_b.dtype)) 279 | if str(X.dtype) != str(s_W.dtype): 280 | raise TypeError('data and W have different dtypes') 281 | if sample_v0s: 282 | v0s = TT.cast( 283 | v0m > s_rng.uniform( 284 | size=(batchsize, nnet.layers[-1].n_in)), 285 | dtype) 286 | else: 287 | v0s = v0m 288 | 289 | h0m = TT.nnet.sigmoid(TT.dot(v0s, s_W) + s_b) 290 | h0s = TT.cast(s_rng.uniform(size=(batchsize, n_out)) < h0m, dtype) 291 | v1m = TT.nnet.sigmoid(TT.dot(h0s, s_W.T) + s_a) 292 | v1s = TT.cast(s_rng.uniform(size=(batchsize, n_in)) < v1m, dtype) 293 | h1m = TT.nnet.sigmoid(TT.dot(v1s, s_W) + s_b) 294 | 295 | # -- compile CD1 update function 296 | cd1_fn = theano.function([s_idx], 297 | [abs(v0m - v1m).mean()], 298 | updates=[ 299 | (s_W, s_W + s_lr * ( 300 | TT.dot(v0s.T, h0m) - TT.dot(v1s.T, h1m))), 301 | (s_a, s_a + s_lr * ( 302 | (v0s - v1s).sum(axis=0))), 303 | (s_b, s_b + s_lr * ( 304 | (h0m - h1m).sum(axis=0))), 305 | ], 306 | ) 307 | n_batches_per_epoch = len(X) / batchsize 308 | if len(X) > (batchsize * n_batches_per_epoch): 309 | n_batches_per_epoch += 1 310 | for epoch in xrange(int(n_epochs)): 311 | if time_limit and time_module.time() > 
time_limit: 312 | break 313 | e_lr = lr * min(1, (float(lr_anneal_start) / (epoch + 1))) 314 | s_lr.set_value(float(e_lr)) 315 | 316 | costs = [cd1_fn(bi) for bi in xrange(n_batches_per_epoch)] 317 | if not epoch % 10: 318 | print('CD1 epoch:%i avg L1: %f'% (epoch, np.mean(costs))) 319 | if not np.isfinite(np.mean(costs)): 320 | raise DivergenceError('CD went crazy') 321 | 322 | new_top_layer = LogisticLayer(W=s_W.get_value(borrow=True), 323 | b=s_b.get_value(borrow=True)) 324 | new_nnet = NNet(nnet.layers[:-1] + [new_top_layer]) 325 | return new_nnet 326 | 327 | 328 | @scope.define 329 | def random_sigmoid_layer(n_in, n_out, dist, 330 | scale_heuristic, seed, squash, 331 | dtype='float32'): 332 | 333 | rng = np.random.RandomState(seed) 334 | if dist == 'uniform': 335 | WT = rng.rand(n_out, n_in) * 2 - 1 336 | elif dist == 'normal': 337 | WT = rng.randn(n_out, n_in) 338 | else: 339 | raise ValueError('W_init_dist', dist) 340 | 341 | # N.B. the weights are transposed so that as the number of hidden units 342 | # changes, 343 | # the first hidden units are always the same vectors. this makes it 344 | # easier to isolate the effect of random initialization from the other 345 | # hyperparameters (otherwise changing n_out would be pretty much 346 | # equivalent to re-seeding). 347 | W = WT.T.astype(dtype) 348 | 349 | if scale_heuristic[0] == 'old': 350 | W *= scale_heuristic[1] / np.sqrt(n_in) 351 | elif scale_heuristic[0] == 'Glorot': 352 | W *= np.sqrt(6.0 / (n_in + n_out)) 353 | else: 354 | raise ValueError(scale_heuristic) 355 | 356 | b = np.zeros(n_out, dtype=dtype) 357 | if squash == 'logistic': 358 | return LogisticLayer(W, b) 359 | elif squash == 'tanh': 360 | return TanhLayer(W, b) 361 | else: 362 | raise NotImplementedError('squashing function', squash) 363 | 364 | 365 | @scope.define 366 | def zero_softmax_layer(n_in, n_out, dtype='float32'): 367 | W = np.zeros((n_in, n_out), dtype=dtype) 368 | b = np.zeros(n_out, dtype=dtype) 369 | return SoftmaxLayer(W, b) 370 | 371 | 372 | @scope.define_info(o_len=2) 373 | def nnet_sgd_finetune_classifier(nnet, train_task, valid_task, fixed_nnet, 374 | max_epochs, min_epochs, batch_size, lr, lr_anneal_start, l2_penalty, 375 | time_limit=None, dtype='float32'): 376 | 377 | layers = nnet.layers 378 | 379 | fixed_layers = [l for l in nnet.layers if l in fixed_nnet.layers] 380 | tuned_layers = [l for l in nnet.layers if l not in fixed_nnet.layers] 381 | 382 | # we need all the fixed layers to precede all the tuned layers 383 | assert layers[:len(fixed_layers)] == fixed_layers 384 | 385 | # Figure something out for validation 386 | if valid_task is None: 387 | from sklearn import cross_validation 388 | N = len(train_task.x) 389 | kf = cross_validation.KFold(N, 5) 390 | train_idxs, valid_idxs = iter(kf).next() 391 | 392 | idxmap = np.random.RandomState(123).permutation(N) 393 | 394 | train_x = train_task.x[idxmap[train_idxs]] 395 | valid_x = train_task.x[idxmap[valid_idxs]] 396 | train_y = train_task.y[idxmap[train_idxs]] 397 | valid_y = train_task.y[idxmap[valid_idxs]] 398 | else: 399 | train_x = train_task.x 400 | valid_x = valid_task.x 401 | train_y = train_task.y 402 | valid_y = valid_task.y 403 | 404 | # Filter X through the fixed layers, aka apply 405 | # pre-processing 406 | for layer in fixed_layers: 407 | train_x = layer(train_x) 408 | valid_x = layer(valid_x) 409 | 410 | train_x = train_x.astype(dtype) 411 | valid_x = valid_x.astype(dtype) 412 | 413 | shared_train_x = theano.shared(train_x, borrow=True) 414 | shared_valid_x = 
theano.shared(valid_x, borrow=True) 415 | shared_train_y = theano.shared(train_y, borrow=True) 416 | shared_valid_y = theano.shared(valid_y, borrow=True) 417 | 418 | batch_idx = TT.iscalar() 419 | s_lr = TT.scalar(dtype=dtype) 420 | 421 | batch_train_x = shared_train_x[batch_idx * batch_size:(batch_idx + 1) * batch_size] 422 | batch_valid_x = shared_valid_x[batch_idx * batch_size:(batch_idx + 1) * batch_size] 423 | batch_train_y = shared_train_y[batch_idx * batch_size:(batch_idx + 1) * batch_size] 424 | batch_valid_y = shared_valid_y[batch_idx * batch_size:(batch_idx + 1) * batch_size] 425 | 426 | params = [] 427 | Ws = [] 428 | bs = [] 429 | 430 | l2_cost = 0 431 | 432 | for layer in tuned_layers: 433 | s_W = theano.shared(layer.W) 434 | s_b = theano.shared(layer.b) 435 | batch_train_x = layer.theano_compute(batch_train_x, s_W, s_b) 436 | batch_valid_x = layer.theano_compute(batch_valid_x, s_W, s_b) 437 | Ws.append(s_W) 438 | bs.append(s_b) 439 | l2_cost = l2_cost + (s_W ** 2).sum() 440 | #batch_train_x = theano.printing.Print('x')(batch_train_x) 441 | 442 | # -- the topmost layer is the classifier, so at this point batch_train_x 443 | # represents the softmax classifier output. 444 | train_probs = batch_train_x 445 | train_loss = TT.mean( 446 | TT.nnet.categorical_crossentropy(train_probs, batch_train_y)) 447 | regularized_loss = train_loss + l2_penalty * l2_cost 448 | params = Ws + bs 449 | gparams = TT.grad(regularized_loss, params) 450 | updates = [(p, p - s_lr * gp) for (p, gp) in zip(params, gparams)] 451 | train_fn = theano.function( 452 | [batch_idx, s_lr], regularized_loss, 453 | updates=updates, 454 | allow_input_downcast=True) 455 | 456 | # -- the topmost layer is the classifier, so at this point batch_valid_x 457 | # represents the softmax classifier output. 
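    # -- validation error rate for one mini-batch: the fraction of examples
    #    whose argmax over class probabilities disagrees with the label.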
458 | valid_err_rate = TT.mean( 459 | TT.neq(batch_valid_y, TT.argmax(batch_valid_x, axis=1))) 460 | valid_err_rate_fn = theano.function([batch_idx], valid_err_rate) 461 | 462 | report = {} 463 | report['best_epoch'] = -1 464 | report['best_epoch_valid'] = 1.0 465 | report['best_epoch_avg_train_reg_loss'] = 1.0 466 | report['best_epoch_test'] = 1.0 467 | report['status'] = 'ok' 468 | valid_err_rate = float('inf') 469 | test_err_rate = float('inf') 470 | avg_regularized_loss = float('inf') 471 | 472 | n_train_batches = len(train_x) // batch_size 473 | n_valid_batches = len(valid_x) // batch_size 474 | 475 | for epoch in xrange(max_epochs): 476 | valid_err_rate = float(np.mean([valid_err_rate_fn(i) 477 | for i in range(n_valid_batches)])) 478 | valid_err_rate_std_thresh = 0.5 * np.sqrt(valid_err_rate * 479 | (1 - valid_err_rate) / (n_valid_batches * batch_size)) 480 | 481 | if valid_err_rate < ( 482 | report['best_epoch_valid'] - valid_err_rate_std_thresh): 483 | report['best_epoch'] = epoch 484 | report['best_epoch_test'] = test_err_rate 485 | report['best_epoch_valid'] = valid_err_rate 486 | report['best_epoch_avg_train_reg_loss'] = avg_regularized_loss 487 | best_params = copy.deepcopy(params) 488 | 489 | e_lr = lr 490 | e_lr *= min(1, lr_anneal_start / float(epoch + 1)) 491 | 492 | print('Epoch=%i best epoch %i valid %f test %f ' 493 | ' best_train %f cur_train %f lr %f' % ( 494 | epoch, report['best_epoch'], 495 | report['best_epoch_valid'], 496 | report['best_epoch_test'], 497 | report['best_epoch_avg_train_reg_loss'], 498 | avg_regularized_loss, 499 | e_lr)) 500 | 501 | if epoch > max(min_epochs, 2 * report['best_epoch']): 502 | break 503 | if time_limit is not None and time_module.time() > time_limit: 504 | break 505 | # -- loop comprehension does one epoch of training 506 | avg_regularized_loss = float(np.mean([train_fn(i, e_lr) for i in 507 | range(n_train_batches)])) 508 | if not np.isfinite(avg_regularized_loss): 509 | report['status'] = 'fail' 510 | report['status_info'] = ('avg_regularized_loss %f' % 511 | avg_regularized_loss) 512 | return None, report 513 | 514 | if report['best_epoch'] >= 0: 515 | best_nnet = NNet(list(fixed_layers)) 516 | best_Ws = best_params[:len(Ws)] 517 | best_bs = best_params[len(Ws):] 518 | for tuned, W, b in zip(tuned_layers, best_Ws, best_bs): 519 | best_nnet.layers.append( 520 | tuned.__class__( 521 | W.get_value(), 522 | b.get_value())) 523 | print 'nnet_sgd_finetune: ', report 524 | return best_nnet, report 525 | else: 526 | report['status'] = 'fail' 527 | report['status_info'] = 'noprog' 528 | return None, report 529 | 530 | 531 | -------------------------------------------------------------------------------- /hpnnet/orig_dbn.py: -------------------------------------------------------------------------------- 1 | # THIS IS AN OLD FILE TO REIMPLEMENT, NOT MEANT TO BE IMPORTED 2 | raise ImportError() 3 | 4 | 5 | """Deep Belief Netork as Bandit 6 | """ 7 | import copy 8 | import cPickle 9 | import logging 10 | import os 11 | import subprocess 12 | import sys 13 | import time 14 | logger = logging.getLogger(__name__) 15 | 16 | import numpy 17 | from bson import SON, BSON 18 | 19 | import theano 20 | from theano import tensor 21 | 22 | # scikit-data 23 | from skdata.tasks import classification_train_valid_test 24 | 25 | # XXX use scikits-learn for PCA 26 | import pylearn_pca 27 | 28 | from base import Bandit 29 | from utils import json_call 30 | 31 | try: 32 | RandomStreams = theano.sandbox.cuda.CURAND_RandomStreams 33 | except: 34 | RandomStreams = 
tensor.shared_randomstreams.RandomStreams 35 | 36 | from ht_dist2 import rSON2, one_of, rlist, uniform, lognormal, ceil_lognormal 37 | 38 | 39 | class LogisticRegression(object): 40 | def __init__(self, x, w, b, params=None): 41 | if params is None: 42 | params = [] 43 | self.input = x 44 | self.output = tensor.nnet.softmax(tensor.dot(x, w) + b) 45 | self.l1 = abs(w).sum() 46 | self.l2_sqr = (w**2).sum() 47 | self.argmax = tensor.argmax( 48 | tensor.dot(x, w) + b, 49 | axis=x.ndim - 1) 50 | self.w = w 51 | self.b = b 52 | self.params = params 53 | 54 | @classmethod 55 | def new(cls, input, n_in, n_out, dtype=None, name=None): 56 | if dtype is None: 57 | dtype = input.dtype 58 | if name is None: 59 | name = cls.__name__ 60 | logger.debug('allocating params w, b: %s' % str((n_in, n_out, dtype))) 61 | w = theano.shared( 62 | numpy.zeros((n_in, n_out), dtype=dtype), 63 | name='%s.w' % name) 64 | b = theano.shared( 65 | numpy.zeros((n_out,), dtype=dtype), 66 | name='%s.b' % name) 67 | return cls(input, w, b, params=[w, b]) 68 | 69 | 70 | def nll(self, target): 71 | """Return the negative log-likelihood of the prediction of this model under a given 72 | target distribution. Passing symbolic integers here means 1-hot. 73 | WRITEME 74 | """ 75 | return tensor.nnet.categorical_crossentropy(self.output, target) 76 | 77 | def errors(self, target): 78 | """Return a vector of 0s and 1s, with 1s on every line that was mis-classified. 79 | """ 80 | if target.ndim != self.argmax.ndim: 81 | raise TypeError('target should have the same shape as self.argmax', 82 | ('target', target.type, 'argmax', self.argmax.type)) 83 | if target.dtype.startswith('int'): 84 | return theano.tensor.neq(self.argmax, target) 85 | else: 86 | raise NotImplementedError() 87 | 88 | 89 | def sgd_updates(params, grads, stepsizes): 90 | """Return a list of (pairs) that can be used as updates in theano.function to implement 91 | stochastic gradient descent. 
92 | 93 | :param params: variables to adjust in order to minimize some cost 94 | :type params: a list of variables (theano.function will require shared variables) 95 | :param grads: the gradient on each param (with respect to some cost) 96 | :type grads: list of theano expressions 97 | :param stepsizes: step by this amount times the negative gradient on each iteration 98 | :type stepsizes: [symbolic] scalar or list of one [symbolic] scalar per param 99 | """ 100 | try: 101 | iter(stepsizes) 102 | except Exception: 103 | stepsizes = [stepsizes for p in params] 104 | if len(params) != len(grads): 105 | raise ValueError('params and grads have different lens') 106 | updates = [(p, p - step * gp) for (step, p, gp) in zip(stepsizes, params, grads)] 107 | return updates 108 | 109 | 110 | def geom(lower, upper, round=1): 111 | ll = numpy.log(lower) 112 | lu = numpy.log(upper) 113 | return ceil_lognormal(.5 * (ll + lu), .4 * (lu - ll), round) 114 | 115 | 116 | def dbn_template(dataset_name='skdata.larochelle_etal_2007.Rectangles', 117 | sup_min_epochs=300, 118 | sup_max_epochs=4000): 119 | template = rSON2( 120 | 'preprocessing', one_of( 121 | rSON2( 122 | 'kind', 'raw'), 123 | rSON2( 124 | 'kind', 'zca', 125 | 'energy', uniform(0.5, 1.0))), 126 | 'dataset_name', dataset_name, 127 | 'sup_max_epochs', sup_max_epochs, 128 | 'sup_min_epochs', sup_min_epochs, 129 | 'iseed', one_of(5, 6, 7, 8), 130 | 'batchsize', one_of(20, 100), 131 | 'lr', lognormal(numpy.log(.01), 3), 132 | 'lr_anneal_start', geom(100, 10000), 133 | 'l2_penalty', one_of(0, lognormal(numpy.log(1.0e-6), 2)), 134 | 'next_layer', one_of(None, 135 | rSON2( 136 | 'n_hid', geom(2**7, 2**12, round=16), 137 | 'W_init_dist', one_of('uniform', 'normal'), 138 | 'W_init_algo', one_of('old', 'Xavier'), 139 | 'W_init_algo_old_multiplier', lognormal(0.0, 1.0), 140 | 'cd_epochs', geom(1, 3000), 141 | 'cd_batchsize', 100, 142 | 'cd_sample_v0s', one_of(False, True), 143 | 'cd_lr', lognormal(numpy.log(.01), 2), 144 | 'cd_lr_anneal_start', geom(10, 10000), 145 | 'next_layer', one_of(None, 146 | rSON2( 147 | 'n_hid', geom(2**7, 2**12, round=16), 148 | 'W_init_dist', one_of('uniform', 'normal'), 149 | 'W_init_algo', one_of('old', 'Xavier'), 150 | 'W_init_algo_old_multiplier', lognormal(0.0, 1.0), 151 | 'cd_epochs', geom(1, 2000), 152 | 'cd_batchsize', 100, 153 | 'cd_sample_v0s', one_of(False, True), 154 | 'cd_lr', lognormal(numpy.log(.01), 2), 155 | 'cd_lr_anneal_start', geom(10, 10000), 156 | 'next_layer', one_of(None, 157 | rSON2( 158 | 'n_hid', geom(2**7, 2**12, round=16), 159 | 'W_init_dist', one_of('uniform', 'normal'), 160 | 'W_init_algo', one_of('old', 'Xavier'), 161 | 'W_init_algo_old_multiplier', lognormal(0., 1.), 162 | 'cd_epochs', geom(1, 1500), 163 | 'cd_batchsize', 100, 164 | 'cd_sample_v0s', one_of(False, True), 165 | 'cd_lr', lognormal(numpy.log(.01), 2), 166 | 'cd_lr_anneal_start', geom(10, 10000), 167 | 'next_layer', None, 168 | ))))))) 169 | return template 170 | 171 | 172 | def nnet1_template(dataset_name='skdata.larochelle_etal_2007.Rectangles', 173 | sup_min_epochs=30, # THESE ARE KINDA SMALL FOR SERIOUS RESULTS 174 | sup_max_epochs=400): 175 | template = rSON2( 176 | 'preprocessing', one_of( 177 | rSON2( 178 | 'kind', 'raw'), 179 | rSON2( 180 | 'kind', 'zca', 181 | 'energy', uniform(0.5, 1.0))), 182 | 'dataset_name', dataset_name, 183 | 'sup_max_epochs', sup_max_epochs, 184 | 'sup_min_epochs', sup_min_epochs, 185 | 'iseed', one_of(5, 6, 7, 8), 186 | 'batchsize', one_of(20, 100), 187 | 'lr', lognormal(numpy.log(.01), 3), 188 | 
'lr_anneal_start', geom(100, 10000), 189 | 'l2_penalty', one_of(0, lognormal(numpy.log(1.0e-6), 3)), 190 | 'next_layer', rSON2( 191 | 'n_hid', geom(2**4, 2**10, round=16), 192 | 'W_init_dist', one_of('uniform', 'normal'), 193 | 'W_init_algo', one_of('old', 'Xavier'), 194 | 'W_init_algo_old_multiplier', uniform(.2, 2), 195 | 'cd_epochs', 0, 196 | 'cd_batchsize', 100, 197 | 'cd_sample_v0s', one_of(False, True), 198 | 'cd_lr', lognormal(numpy.log(.01), 3), 199 | 'cd_lr_anneal_start', geom(10, 10000), 200 | 'next_layer', None)) 201 | return template 202 | 203 | 204 | def preprocess_data(config, ctrl): 205 | dataset = json_call(config['dataset_name']) 206 | train, valid, test = classification_train_valid_test(dataset) 207 | X_train, y_train = numpy.asarray(train[0]), numpy.asarray(train[1]) 208 | X_valid, y_valid = numpy.asarray(valid[0]), numpy.asarray(valid[1]) 209 | X_test, y_test = numpy.asarray(test[0]), numpy.asarray(test[1]) 210 | 211 | if config['preprocessing']['kind'] == 'pca': 212 | # compute pca of input (TODO: retrieve only pca_whitened input) 213 | raise NotImplementedError('rewrite since cut and paste') 214 | (eigvals,eigvecs), centered_trainset = pylearn_pca.pca_from_examples( 215 | X=dataset['inputs'][:dataset['n_train']], 216 | max_energy_fraction=config['pca_energy']) 217 | eigmean = dataset['inputs'][0] - centered_trainset[0] 218 | 219 | whitened_inputs = pylearn_pca.pca_whiten((eigvals,eigvecs), 220 | dataset['inputs']-eigmean) 221 | ctrl.info('PCA kept %i of %i components'%(whitened_inputs.shape[1], 222 | dataset['n_inputs'])) 223 | elif config['preprocessing']['kind'] == 'zca': 224 | (eigvals,eigvecs), centered_trainset = pylearn_pca.pca_from_examples( 225 | X=X_train, 226 | max_energy_fraction=config['preprocessing']['energy']) 227 | eigmean = X_train[0] - centered_trainset[0] 228 | 229 | def whiten(X): 230 | X = pylearn_pca.pca_whiten((eigvals,eigvecs), 231 | X - eigmean) 232 | X = pylearn_pca.pca_whiten_inverse((eigvals, eigvecs), 233 | X) + eigmean 234 | X = X.astype('float32') 235 | X_min = X.min() 236 | X_max = X.max() 237 | ctrl.info('ZCA min:%f max:%f' % (X_min, X_max)) 238 | if X_min < 0 or X_max > 1.0: 239 | ctrl.info('ZCA clamping return value to (0, 1) interval') 240 | X = numpy.clip(X, 0, 1, out=X) 241 | return X 242 | 243 | X_train, X_valid, X_test = [whiten(X) 244 | for X in [X_train, X_valid, X_test]] 245 | 246 | elif config['preprocessing']['kind'] == 'normalize': 247 | raise NotImplementedError('rewrite since cut and paste') 248 | n_train=dataset['n_train'] 249 | whitened_inputs = dataset['inputs'] 250 | whitened_inputs = whitened_inputs - whitened_inputs[:n_train].mean(axis=0) 251 | whitened_inputs /= whitened_inputs[:n_train].std(axis=0)+1e-7 252 | elif config['preprocessing']['kind'] == 'raw': 253 | pass 254 | else: 255 | raise ValueError( 256 | 'unrecognized preprocessing', 257 | config['preprocessing']['kind']) 258 | 259 | for Xy in 'X', 'y': 260 | for suffix in 'train', 'valid', 'test': 261 | varname = '%s_%s'%(Xy, suffix) 262 | var = locals()[varname] 263 | ctrl.info('%s shape=%s max=%f min=%f' % ( 264 | varname, 265 | var.shape, 266 | var.max(), 267 | var.min())) 268 | 269 | s_X_train = theano.shared(X_train) 270 | s_y_train = theano.shared(y_train) 271 | s_X_valid = theano.shared(X_valid) 272 | s_y_valid = theano.shared(y_valid) 273 | s_X_test = theano.shared(X_test) 274 | s_y_test = theano.shared(y_test) 275 | 276 | return (dataset, 277 | (s_X_train, s_y_train), 278 | (s_X_valid, s_y_valid), 279 | (s_X_test, s_y_test)) 280 | 281 | 282 | def 
train_rbm(s_rng, s_idx, s_batchsize, s_features, W, vbias, hbias, n_in, 283 | n_hid, batchsize, sample_v0s, 284 | cdlr, n_epochs, n_batches_per_epoch, lr_anneal_start, 285 | givens={}, 286 | time_limit=None): 287 | logger.info('rbm training n_in=%i n_hid=%i batchsize=%i' % ( 288 | n_in, n_hid, batchsize)) 289 | v0m = s_features 290 | if sample_v0s: 291 | v0s = tensor.cast( 292 | s_rng.uniform(size=(batchsize, n_in)) < v0m, 293 | 'float32') 294 | else: 295 | v0s = v0m 296 | 297 | h0m = tensor.nnet.sigmoid(tensor.dot(v0s, W) + hbias) 298 | h0s = tensor.cast(s_rng.uniform(size=(batchsize, n_hid)) < h0m, 'float32') 299 | v1m = tensor.nnet.sigmoid(tensor.dot(h0s, W.T)+vbias) 300 | v1s = tensor.cast(s_rng.uniform(size=(batchsize, n_in)) < v1m, 'float32') 301 | h1m = tensor.nnet.sigmoid(tensor.dot(v1s, W) + hbias) 302 | 303 | s_lr = tensor.scalar(dtype='float32') 304 | 305 | logger.debug('compiling cd1_fn') 306 | cd1_fn = theano.function([s_idx, s_batchsize, s_lr], 307 | [abs(v0m-v1m).mean()], 308 | updates={ 309 | W: W + s_lr * ( 310 | tensor.dot(v0s.T, h0m) - tensor.dot(v1s.T, h1m)), 311 | vbias: vbias + s_lr * ( 312 | (v0s - v1s).sum(axis=0)), 313 | hbias: hbias + s_lr * ( 314 | (h0m - h1m).sum(axis=0)), 315 | }, 316 | givens=givens) 317 | for epoch in xrange(n_epochs): 318 | costs = [] 319 | if time_limit and time.time() > time_limit: 320 | break 321 | e_lr = cdlr * min(1, (float(lr_anneal_start)/(epoch+1))) 322 | for batch_idx in xrange(n_batches_per_epoch): 323 | costs.append(cd1_fn(batch_idx, batchsize, e_lr)) 324 | if not epoch % 10: 325 | logger.info('CD1 epoch:%i avg L1: %f'% (epoch, numpy.mean(costs))) 326 | if costs: 327 | return dict(final_recon_l1=float(numpy.mean(costs)),) 328 | else: 329 | return dict(final_recon_l1=float('nan')) 330 | 331 | _dataset_cache = {} 332 | 333 | class DBN_Base(Bandit): 334 | def dryrun_config(self, *args, **kwargs): 335 | return dict( 336 | lr=.01, 337 | sup_max_epochs=500, 338 | sup_min_epochs=50, 339 | batchsize=10, 340 | preprocessing=dict(kind='zca', energy=0.8), 341 | iseed=5, 342 | n_layers=1, 343 | next_layer = dict( 344 | n_hid=50, 345 | W_init_dist='uniform', 346 | W_init_algo='Xavier', 347 | cd_epochs=100, 348 | cd_batchsize=50, 349 | cd_sample_v0s=True, 350 | cd_lr=0.1, 351 | cd_lr_anneal_start=3, 352 | next_layer = dict( 353 | n_hid=75, 354 | W_init_dist='uniform', 355 | W_init_algo='old', 356 | W_init_algo_old_multiplier=2.2, 357 | cd_epochs=70, 358 | cd_batchsize=10, 359 | cd_sample_v0s=False, 360 | cd_lr=0.01, 361 | cd_lr_anneal_start=30 362 | ), 363 | ), 364 | l2_penalty=0.1, 365 | lr_anneal_start=20, 366 | dataset_name='skdata.larochelle_etal_2007.Rectangles', 367 | ) 368 | 369 | @classmethod 370 | def evaluate(cls, config, ctrl): 371 | time_limit = time.time() + 60 * 60 # 1hr from now 372 | rval = SON(dbn_train_fn_version=1) 373 | 374 | ctrl.info('starting dbn_train_fn') 375 | kv = config.items() 376 | kv.sort() 377 | for k,v in kv: 378 | ctrl.info('key=%s\t%s' %(k,str(v))) 379 | 380 | rng = numpy.random.RandomState(config['iseed']) 381 | s_rng = RandomStreams(int(rng.randint(2**30))) 382 | 383 | dataset, train_Xy, valid_Xy, test_Xy = preprocess_data(config, ctrl) 384 | 385 | # allocate learning function parameters 386 | s_inputs_all = tensor.fmatrix('inputs') 387 | s_labels_all = tensor.ivector('labels') 388 | s_idx = tensor.lscalar('batch_idx') 389 | s_batchsize=tensor.lscalar('batch_size') 390 | s_low = s_idx * s_batchsize 391 | s_high = s_low + s_batchsize 392 | s_inputs = s_inputs_all[s_low:s_high] 393 | s_labels = 
s_labels_all[s_low:s_high] 394 | s_lr = tensor.scalar('lr') 395 | s_features = s_inputs # s_features will be modified in the model-building loop 396 | 397 | weights = [] 398 | vbiases = [] 399 | hbiases = [] 400 | 401 | n_inputs_i = valid_Xy[0].get_value(borrow=True).shape[1] 402 | 403 | rval['cd_reports'] = [] 404 | 405 | try: 406 | layer_config = config['next_layer'] 407 | # allocate model parameters 408 | while layer_config: 409 | i = len(rval['cd_reports']) 410 | n_hid_i = layer_config['n_hid'] 411 | if layer_config['W_init_dist']=='uniform': 412 | W = rng.uniform(low=-1,high=1,size=(n_hid_i, n_inputs_i)).T.astype('float32') 413 | elif layer_config['W_init_dist'] == 'normal': 414 | W = rng.randn(n_hid_i, n_inputs_i).T.astype('float32') 415 | else: 416 | raise ValueError('W_init_dist', layer_config['W_init_dist']) 417 | 418 | if layer_config['W_init_algo'] == 'old': 419 | #N.B. the weights are transposed so that as the number of hidden units changes, 420 | # the first hidden units are always the same vectors. 421 | # this makes it easier to isolate the effect of random initialization 422 | # from the other hyper-parameters under review 423 | W *= layer_config['W_init_algo_old_multiplier'] / numpy.sqrt(n_inputs_i) 424 | elif layer_config['W_init_algo'] == 'Xavier': 425 | W *= numpy.sqrt(6.0 / (n_inputs_i + n_hid_i)) 426 | else: 427 | raise ValueError(layer_config['W_init_algo']) 428 | 429 | layer_idx = len(rval['cd_reports']) 430 | weights.append(theano.shared(W, 'W_%i' % layer_idx)) 431 | hbiases.append(theano.shared(numpy.zeros(n_hid_i, dtype='float32'), 432 | 'h_%i' % layer_idx)) 433 | vbiases.append(theano.shared(numpy.zeros(n_inputs_i, dtype='float32'), 434 | 'v_%i' % layer_idx)) 435 | del W 436 | 437 | # allocate RBM training function for this layer 438 | # this version re-calculates the training set every time 439 | # TODO: cache the training set for each layer 440 | # TODO: consider sparsity? 441 | # TODO: consider momentum? 
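# The block below performs greedy, layer-wise CD-1 pretraining: train_rbm()
# updates W, vbias and hbias in place through Theano shared-variable updates,
# using the current s_features expression (the sigmoid output of the layers
# built so far) as the visible units. Note that cd_lr is divided by
# cd_batchsize, so the learning rate is per-example rather than per-minibatch.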
442 | if layer_config['cd_epochs']: 443 | cd_report = train_rbm( 444 | s_rng, s_idx, s_batchsize, s_features, 445 | W=weights[-1], 446 | vbias=vbiases[-1], 447 | hbias=hbiases[-1], 448 | n_in=n_inputs_i, 449 | n_hid=n_hid_i, 450 | batchsize=layer_config['cd_batchsize'], 451 | sample_v0s=layer_config['cd_sample_v0s'], 452 | cdlr=layer_config['cd_lr'] / float(layer_config['cd_batchsize']), 453 | n_epochs=layer_config['cd_epochs'], 454 | n_batches_per_epoch=dataset.descr['n_train'] // layer_config['cd_batchsize'], 455 | lr_anneal_start=layer_config['cd_lr_anneal_start'], 456 | givens = { 457 | s_inputs_all: tensor.as_tensor_variable(train_Xy[0]) 458 | }, 459 | time_limit=time_limit 460 | ) 461 | else: 462 | cd_report = None 463 | rval['cd_reports'].append(cd_report) 464 | 465 | # update s_features to point to top layer 466 | s_features = tensor.nnet.sigmoid( 467 | tensor.dot(s_features, weights[-1]) + hbiases[-1]) 468 | n_inputs_i = n_hid_i 469 | layer_config = layer_config.get('next_layer', None) 470 | 471 | except (MemoryError,): 472 | rval['abort'] = 'MemoryError' 473 | rval['status'] = 'ok' 474 | rval['loss'] = 1.0 475 | rval['best_epoch_valid'] = 0.0 476 | return rval 477 | 478 | # allocate model 479 | 480 | logreg = LogisticRegression.new(s_features, n_in=n_inputs_i, 481 | n_out=dataset.descr['n_classes']) 482 | traincost = logreg.nll(s_labels).mean() 483 | def ssq(X): 484 | return (X**2).sum() 485 | traincost = traincost + config['l2_penalty'] * ( 486 | sum([ssq(w_i) for w_i in weights]) + ssq(logreg.w)) 487 | # params = weights+hbiases+vbiases+logreg.params 488 | # vbiases are not involved in the supervised network 489 | params = weights + hbiases + logreg.params 490 | train_logreg_fn = theano.function([s_idx, s_lr], 491 | [logreg.nll(s_labels).mean()], 492 | updates=sgd_updates( 493 | params=params, 494 | grads=tensor.grad(traincost, params), 495 | stepsizes=[s_lr] * len(params)), 496 | givens={s_batchsize:config['batchsize'], 497 | s_inputs_all: tensor.as_tensor_variable(train_Xy[0]), 498 | s_labels_all: train_Xy[1]}) 499 | valid_logreg_fn = theano.function([s_idx], 500 | logreg.errors(s_labels).mean(), 501 | givens={s_batchsize:config['batchsize'], 502 | s_inputs_all: tensor.as_tensor_variable(valid_Xy[0]), 503 | s_labels_all: valid_Xy[1]}) 504 | test_logreg_fn = theano.function([s_idx], 505 | logreg.errors(s_labels).mean(), 506 | givens={s_batchsize:config['batchsize'], 507 | s_inputs_all: tensor.as_tensor_variable(test_Xy[0]), 508 | s_labels_all: test_Xy[1]}) 509 | 510 | rval['best_epoch'] = -1 511 | rval['best_epoch_valid'] = -1 512 | rval['best_epoch_train'] = -1 513 | rval['best_epoch_test'] = -1 514 | rval['status'] = 'ok' 515 | valid_rate=-1 516 | test_rate=-1 517 | train_rate=-1 518 | 519 | n_train_batches = dataset.descr['n_train'] // config['batchsize'] 520 | n_valid_batches = dataset.descr['n_valid'] // config['batchsize'] 521 | n_test_batches = dataset.descr['n_test'] // config['batchsize'] 522 | 523 | n_iters = 0 524 | for epoch in xrange(config['sup_max_epochs']): 525 | e_lr = config['lr'] 526 | e_lr *= min(1, config['lr_anneal_start'] / float(n_iters+1)) #anneal learning rate 527 | valid_rate = float(1 - numpy.mean([valid_logreg_fn(i) 528 | for i in range(n_valid_batches)])) 529 | valid_rate_std_thresh = 0.5 * numpy.sqrt(valid_rate * 530 | (1 - valid_rate) / (n_valid_batches * config['batchsize'])) 531 | 532 | if valid_rate > (rval['best_epoch_valid']+valid_rate_std_thresh): 533 | rval['best_epoch'] = epoch 534 | rval['best_epoch_test'] = test_rate 535 | 
rval['best_epoch_valid'] = valid_rate 536 | rval['best_epoch_train'] = train_rate 537 | best_params = copy.deepcopy(params) 538 | logger.info('Epoch=%i best epoch %i valid %f test %f best_epoch_train %f prev_train %f'%( 539 | epoch, rval['best_epoch'], rval['best_epoch_valid'], rval['best_epoch_test'], 540 | rval['best_epoch_train'], train_rate)) 541 | #ctrl.info('Epoch %i train nll: %f'%(epoch, train_rate)) 542 | ctrl.checkpoint(rval) 543 | 544 | if epoch > config['sup_min_epochs'] and epoch > 2*rval['best_epoch']: 545 | break 546 | if time.time() > time_limit: 547 | break 548 | train_rate = float(numpy.mean([train_logreg_fn(i, e_lr) for i in 549 | range(n_train_batches)])) 550 | if not numpy.isfinite(train_rate): 551 | do_test = False 552 | rval['status'] = 'fail' 553 | rval['status_info'] = 'train_rate %f' % train_rate 554 | break 555 | n_iters += 1 556 | 557 | do_test = True 558 | if do_test and rval['status'] == 'ok': 559 | # copy best params back into place 560 | for p, bp in zip(params, best_params): 561 | p.set_value(bp.get_value()) 562 | rval['best_epoch_test'] = 1 - float( 563 | numpy.mean( 564 | [test_logreg_fn(i) for i in range(n_test_batches)])) 565 | rval['loss'] = 1.0 - rval['best_epoch_valid'] 566 | ctrl.info('rval: %s' % str(rval)) 567 | return rval 568 | 569 | @classmethod 570 | def loss(cls, result, config=None): 571 | """Extract the scalar-valued loss from a result document 572 | """ 573 | try: 574 | if numpy.isnan(float(result['loss'])): 575 | return None 576 | else: 577 | return float(result['loss']) 578 | except (KeyError, TypeError): 579 | return None 580 | 581 | @classmethod 582 | def loss_variance(cls, result, config=None): 583 | if config['dataset_name'] not in _dataset_cache: 584 | _dataset_cache[config['dataset_name']] = json_call( 585 | config['dataset_name']) 586 | dataset = _dataset_cache[config['dataset_name']] 587 | n_valid = dataset.descr['n_valid'] 588 | p = cls.loss(result, config) 589 | if p is None: 590 | return None 591 | return p * (1.0 - p) / (n_valid - 1) 592 | 593 | @classmethod 594 | def true_loss(cls, result, config=None): 595 | try: 596 | rval = float(1 - result['best_epoch_test']) 597 | if 0 <= rval <= 1: 598 | return rval 599 | except (KeyError, TypeError): 600 | return None 601 | 602 | @classmethod 603 | def status(cls, result, config=None): 604 | """Extract the job status from a result document 605 | """ 606 | if (result['status'] == 'ok' and 607 | (cls.loss(result) is None 608 | or cls.true_loss(result) is None)): 609 | return 'fail' 610 | else: 611 | return result['status'] 612 | 613 | 614 | def DBN_Convex(): 615 | return DBN_Base( 616 | dbn_template( 617 | dataset_name='skdata.larochelle_etal_2007.Convex')) 618 | 619 | 620 | def DBN_MRBI(): 621 | ds = 'skdata.larochelle_etal_2007.MNIST_RotatedBackgroundImages' 622 | return DBN_Base(dbn_template(dataset_name=ds)) 623 | 624 | 625 | class Dummy_DBN_Base(Bandit): 626 | """ 627 | A DBN_Base stub. 628 | 629 | This class is used in unittests of optimization algorithms to ensure they 630 | can deal with large nested specifications that include lots of distribution 631 | types. 632 | 633 | The evaluate function simply returns a random score. 634 | """ 635 | def __init__(self): 636 | Bandit.__init__(self, template=dbn_template()) 637 | self.rng = numpy.random.RandomState(234) 638 | 639 | def evaluate(self, argd, ctrl): 640 | rval = dict(dbn_train_fn_version=-1) 641 | # XXX: TODO: make up a loss function that depends on argd.
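# One possible sketch for that TODO (an assumption, not part of the original
# code): derive a reproducible pseudo-loss from the configuration instead of
# from self.rng, e.g.
#     seed = hash(repr(sorted(argd.items()))) % (2 ** 30)
#     rval['best_epoch_valid'] = float(numpy.random.RandomState(seed).rand())
# so that re-evaluating the same configuration yields the same score.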
642 | rval['status'] = 'ok' 643 | rval['best_epoch_valid'] = float(self.rng.rand()) 644 | rval['loss'] = 1.0 - rval['best_epoch_valid'] 645 | return rval 646 | 647 | def loss_variance(self, result, config=None): 648 | return 0.01**2 649 | -------------------------------------------------------------------------------- /hpnnet/pylearn_pca.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | There are potentially many approaches to PCA; this file may get there eventually. 4 | 5 | 6 | Elements of this implementation have been borrowed from the MDP toolkit: 7 | mdp/nodes/pca_nodes.py 8 | """ 9 | __author__ = "James Bergstra" 10 | __license__ = "BSD-3" 11 | 12 | #TODO: estimate number of principal components by cross-validation (early stopping) 13 | 14 | #TODO: include the original feature means in the `pca` tuple object so that the full transform 15 | # can be saved, applied to new datasets, and approximately inverted. 16 | 17 | import numpy as np 18 | import scipy.linalg 19 | 20 | try: 21 | import theano 22 | dx, dy = theano.tensor.dmatrix(), theano.tensor.dmatrix() 23 | ddot = theano.function([dx, dy], theano.dot(dx, dy)) 24 | fx, fy = theano.tensor.fmatrix(), theano.tensor.fmatrix() 25 | fdot = theano.function([fx, fy], theano.dot(fx, fy)) 26 | def np_dot(a, b): 27 | if '32' in str(a.dtype): 28 | return fdot(a, b) 29 | else: 30 | return ddot(a, b) 31 | except ImportError: 32 | np_dot = np.dot # Theano unavailable: fall back to numpy's dot 33 | 34 | def pca_from_cov(cov, lower=0, max_components=None, max_energy_fraction=None): 35 | """Return (eigvals, eigvecs) of data with covariance `cov`. 36 | 37 | The returned eigvals will be a np ndarray vector. 38 | The returned eigvecs will be a np ndarray matrix whose *cols* are the eigenvectors. 39 | 40 | This is recommended for retrieving many components from high-dimensional data. 41 | 42 | :param cov: data covariance matrix 43 | :type cov: a np ndarray 44 | 45 | :returns: (eigvals, eigvecs) of decomposition 46 | """ 47 | 48 | w, v = scipy.linalg.eigh(a=cov, lower=lower) 49 | # definition of eigh 50 | # a * v[:,i] = w[i] * vr[:,i] 51 | # v.H * v = identity 52 | 53 | 54 | # total variance (vartot) can be computed at this point: 55 | vartot = w.sum() 56 | 57 | # sort the eigenvals and vecs by decreasing magnitude 58 | a = np.argsort(w)[::-1] 59 | w = w[a] 60 | v = v[:,a] 61 | 62 | if max_components is not None: 63 | w = w[:max_components] 64 | v = v[:, :max_components] 65 | 66 | if max_energy_fraction is not None: 67 | if not (0.0 <= max_energy_fraction <= 1.0): 68 | raise ValueError('illegal value for max_energy_fraction', max_energy_fraction) 69 | if max_energy_fraction < 1.0: 70 | energy = 0 71 | i = 0 72 | while (energy < max_energy_fraction * vartot) and (i < len(w)): 73 | energy += w[i] 74 | i += 1 75 | w = w[:(i-1)] 76 | v = v[:,:(i-1)] 77 | return w,v 78 | 79 | 80 | def pca_from_examples(X, max_components=None, max_energy_fraction=None, 81 | x_centered=False, inplace=False): 82 | """Return ((eigvals, eigvecs), centered_X) of observations `X` (1-per-row) 83 | 84 | This function exists to wrap several algorithms for getting the principal components. 85 | 86 | :param max_components: 87 | Return no more than this many principal components. 88 | 89 | :param max_energy_fraction: 90 | Return [only] enough components to account for this fraction of the energy (aka 91 | variance) in the observations. 92 | 93 | :param x_centered: 94 | True means to consider X as having mean 0 (even if it actually doesn't!)
95 | 96 | :param inplace: 97 | If False, we copy X before using it. Otherwise we modify it. 98 | 99 | :returns: ((eigvals, eigvecs), centered_X) of PCA decomposition 100 | 101 | """ 102 | if not inplace: 103 | X = X.copy() 104 | centered_X = X 105 | if not x_centered: 106 | centered_X -= np.mean(centered_X, axis=0) 107 | cov_X = np_dot(centered_X.T, centered_X) / (len(X) - 1) 108 | evals, evecs = pca_from_cov(cov_X, max_components=max_components, 109 | max_energy_fraction=max_energy_fraction) 110 | return ((evals, evecs), centered_X) 111 | 112 | 113 | def pca_whiten((eigvals, eigvecs), centered_X, eps=1e-14): 114 | """ 115 | Return the projection of X onto its principal components. 116 | 117 | The return value has the same number of rows as X, but the number of columns is the number 118 | of principal components. Columns of the return value have mean 0, variance 1, and are 119 | uncorrelated. 120 | 121 | :param (eigvals, eigvecs): the (w, v) pair returned as the first element of pca_from_examples(X) 122 | 123 | """ 124 | pca_of_X = np_dot(centered_X, eigvecs) 125 | pca_of_X /= np.sqrt(eigvals+eps) 126 | return pca_of_X 127 | 128 | 129 | def pca_whiten_inverse((eigvals, eigvecs), whitened_X, eps=1e-14): 130 | """ 131 | Return an approximate inverse of the `pca_whiten` transform. 132 | 133 | The inverse is not perfect because pca_whitening discards the least-significant components 134 | of the data. 135 | """ 136 | return np_dot(whitened_X * (np.sqrt(eigvals+eps)), eigvecs.T) 137 | 138 | 139 | def pca_whiten2(pca_from_examples_rval, eps=1e-14): 140 | """ 141 | Return the projection of X onto its principal components. 142 | 143 | The return value has the same number of rows as X, but the number of columns is the number 144 | of principal components. Columns of the return value have mean 0, variance 1, and are 145 | uncorrelated. 146 | 147 | .. code-block:: python 148 | 149 | X = data 150 | (evals, evecs), whitened_X = pca_whiten2( 151 | pca_from_examples(X, max_components=10), 152 | eps=1e-3) 153 | 154 | :param pca_from_examples_rval: the ((eigvals, eigvecs), centered_X) 155 | pair returned by e.g. pca_from_examples(X). 156 | 157 | :returns: ((eigvals, eigvecs), whitened_X) 158 | 159 | """ 160 | ((eigvals, eigvecs), centered_X) = pca_from_examples_rval 161 | pca_of_X = np_dot(centered_X, eigvecs) 162 | pca_of_X /= np.sqrt(eigvals+eps) 163 | return ((eigvals, eigvecs), pca_of_X) 164 | 165 | 166 | def zca_whiten((eigvals, eigvecs), centered_X): 167 | """Return the PCA of X but rotated back into the original vector space. 168 | 169 | See also fft_whiten.py 170 | """ 171 | pca_of_X = pca_whiten((eigvals, eigvecs), centered_X) 172 | return np_dot(pca_of_X, eigvecs.T) 173 | 174 | 175 | -------------------------------------------------------------------------------- /hpnnet/pyll_stubs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Singleton objects that serve as placeholders in pyll graphs. 3 | 4 | These are used by e.g. ./nips2011.py 5 | """ 6 | 7 | class train_task(object): 8 | """`train` argument to skdata.LearningAlgo's best_model method 9 | """ 10 | 11 | class valid_task(object): 12 | """`valid` argument to skdata.LearningAlgo's best_model method 13 | """ 14 | 15 | class ctrl(object): 16 | """Hyperopt Ctrl object passed to worker eval_fn.
17 | """ 18 | 19 | 20 | -------------------------------------------------------------------------------- /hpnnet/skdata_learning_algo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Training and construction routines for neural networks. 3 | 4 | """ 5 | 6 | __author__ = "James Bergstra" 7 | __license__ = "BSD-3" 8 | 9 | import numpy as np 10 | from skdata.base import SemanticsDelegator 11 | from hyperopt.utils import use_obj_for_literal_in_memo 12 | from hyperopt.pyll import rec_eval 13 | import pyll_stubs 14 | 15 | 16 | class PyllLearningAlgo(SemanticsDelegator): 17 | def __init__(self, expr, memo, ctrl): 18 | self.expr = expr 19 | self.memo = dict(memo) 20 | self.ctrl = ctrl 21 | self.validation_sets = [] 22 | self.results = { 23 | 'best_model': [], 24 | 'loss': [], 25 | } 26 | 27 | 28 | def best_model_vector_classification(self, train, valid): 29 | # TODO: use validation set if not-None 30 | memo = dict(self.memo) 31 | use_obj_for_literal_in_memo(self.expr, train, pyll_stubs.train_task, memo) 32 | use_obj_for_literal_in_memo(self.expr, valid, pyll_stubs.valid_task, memo) 33 | use_obj_for_literal_in_memo(self.expr, self.ctrl, pyll_stubs.ctrl, memo) 34 | model, report = rec_eval(self.expr, memo=memo) 35 | if model: 36 | model.trained_on = train.name 37 | if valid and valid.name not in self.validation_sets: 38 | self.validation_sets.append(valid.name) 39 | self.results['best_model'].append( 40 | { 41 | 'train_name': train.name, 42 | 'valid_name': valid.name if valid else None, 43 | 'model': model, 44 | 'report': report, 45 | }) 46 | return model 47 | 48 | 49 | def loss_vector_classification(self, model, task): 50 | if model is None: 51 | err_rate = 1.0 52 | self.results['loss'].append( 53 | { 54 | 'err_rate': err_rate, 55 | 'task_name': task.name, 56 | }) 57 | else: 58 | p = model.predict(task.x) 59 | err_rate = np.mean(p != task.y) 60 | 61 | # save as string to save space and maintain 62 | # readability 63 | assert np.max(p) < 10 64 | p_str = ''.join(map(str, p)) 65 | 66 | self.results['loss'].append( 67 | { 68 | 'model_trained_on': model.trained_on, 69 | 'predictions': p_str, 70 | 'err_rate': err_rate, 71 | 'n': len(p), 72 | 'task_name': task.name, 73 | }) 74 | 75 | return err_rate 76 | 77 | 78 | def eval_fn(expr, memo, ctrl, protocol_cls): 79 | import nnet # -- ensure pyll symbols are loaded 80 | import hyperopt 81 | assert 'time' in hyperopt.pyll.scope._impls 82 | protocol = protocol_cls() 83 | algo = PyllLearningAlgo(expr, memo, ctrl) 84 | protocol.protocol(algo) 85 | results = algo.results 86 | valid_losses = [] 87 | true_loss = None 88 | for dct in results['best_model']: 89 | del dct['model'] # -- too big, not worth saving 90 | valid_losses.append(dct['report']['best_epoch_valid']) 91 | 92 | for dct in results['loss']: 93 | if dct['task_name'] == 'test': 94 | true_loss = dct['err_rate'] 95 | 96 | if valid_losses: 97 | rval = { 98 | 'loss': float(np.mean(valid_losses)), 99 | 'status': 'ok', 100 | 'algo_results': results, 101 | } 102 | else: 103 | rval = { 104 | 'status': 'fail', 105 | 'algo_results': results, 106 | } 107 | if true_loss != None: 108 | rval['true_loss'] = true_loss 109 | print 'true_loss: ', true_loss 110 | else: 111 | print 'No true_loss' 112 | 113 | return rval 114 | 115 | -------------------------------------------------------------------------------- /hpnnet/tests/test_nips2011.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | #import numpy as 
np 3 | import hyperopt 4 | #from hyperopt import pyll 5 | from hyperopt.fmin import fmin_pass_expr_memo_ctrl 6 | 7 | from hpnnet.nips2011 import nnet1_preproc_space 8 | #from hpnnet.skdata_learning_algo import PyllLearningAlgo 9 | from hpnnet.skdata_learning_algo import eval_fn 10 | 11 | from skdata.larochelle_etal_2007.view import RectanglesVectorXV 12 | from skdata.larochelle_etal_2007.view \ 13 | import MNIST_RotatedBackgroundImages_VectorXV 14 | 15 | def test_nnet_rectangles(): 16 | 17 | rectangles_eval_fn = partial(eval_fn, 18 | protocol_cls=RectanglesVectorXV) 19 | 20 | fmin_pass_expr_memo_ctrl(rectangles_eval_fn) 21 | 22 | trials = hyperopt.Trials() 23 | 24 | hyperopt.fmin( 25 | rectangles_eval_fn, 26 | space=nnet1_preproc_space(sup_min_epochs=20, sup_max_epochs=40), 27 | max_evals=10, 28 | algo=hyperopt.rand.suggest, 29 | trials=trials, 30 | ) 31 | 32 | def test_nnet_mrbi(): 33 | 34 | rectangles_eval_fn = partial(eval_fn, 35 | protocol_cls=MNIST_RotatedBackgroundImages_VectorXV) 36 | 37 | fmin_pass_expr_memo_ctrl(rectangles_eval_fn) 38 | 39 | trials = hyperopt.Trials() 40 | 41 | hyperopt.fmin( 42 | rectangles_eval_fn, 43 | space=nnet1_preproc_space(sup_min_epochs=20, sup_max_epochs=40), 44 | max_evals=10, 45 | algo=hyperopt.rand.suggest, 46 | trials=trials, 47 | ) 48 | -------------------------------------------------------------------------------- /hpnnet/tests/test_nips2011_dbn.py: -------------------------------------------------------------------------------- 1 | from hpnnet import nips2011_dbn 2 | from hyperopt.pyll.stochastic import sample 3 | 4 | from functools import partial 5 | #import numpy as np 6 | import hyperopt 7 | #from hyperopt import pyll 8 | from hyperopt.fmin import fmin_pass_expr_memo_ctrl 9 | 10 | from hpnnet.skdata_learning_algo import eval_fn 11 | 12 | from skdata.larochelle_etal_2007.view import RectanglesVectorXV 13 | 14 | def test_preproc_space(): 15 | rectangles_eval_fn = partial(eval_fn, 16 | protocol_cls=RectanglesVectorXV) 17 | 18 | fmin_pass_expr_memo_ctrl(rectangles_eval_fn) 19 | 20 | trials = hyperopt.Trials() 21 | space = nips2011_dbn.preproc_space() 22 | 23 | hyperopt.fmin( 24 | rectangles_eval_fn, 25 | space=space, 26 | max_evals=10, 27 | algo=hyperopt.rand.suggest, 28 | trials=trials, 29 | ) 30 | 31 | 32 | -------------------------------------------------------------------------------- /hpnnet/tests/test_nnet.py: -------------------------------------------------------------------------------- 1 | 2 | if 0: 3 | 4 | def test_column_normalize_layer(): 5 | raise NotImplementedError() 6 | 7 | 8 | def test_pca_layer(): 9 | raise NotImplementedError() 10 | 11 | 12 | def test_zca_layer(): 13 | raise NotImplementedError() 14 | 15 | -------------------------------------------------------------------------------- /hpnnet/tests/test_pylearn_pca.py: -------------------------------------------------------------------------------- 1 | import skdata.svhn.view 2 | from hpnnet import pylearn_pca 3 | import matplotlib.pyplot as plt 4 | 5 | SHOW = False 6 | 7 | def test_zca(): 8 | view = skdata.svhn.view.CroppedDigitsStratifiedKFoldView1() 9 | 10 | print view.splits[0].train.x.shape 11 | 12 | N = 1000 13 | 14 | first_x = view.splits[0].train.x[:N].reshape(N, -1) 15 | 16 | eigstuff, centered_x = pylearn_pca.pca_from_examples(first_x, max_energy_fraction=.99) 17 | zca_x = pylearn_pca.zca_whiten(eigstuff, centered_x) 18 | offset = centered_x[0] - first_x[0] 19 | print zca_x[0].min() 20 | print zca_x[1].max() 21 | assert zca_x[0].min() > -.5 22 | assert zca_x[0].max() 
< 2 23 | 24 | for i in range(4): 25 | plt.subplot(4, 2, 2 * i + 1) 26 | print 'first_x', first_x[i].min() 27 | print 'first_x', first_x[i].max() 28 | plt.imshow(first_x[i].reshape(32, 32, 3)) 29 | zca_i = zca_x[i] 30 | # -- the range of zca output is kind of arbitrary, 31 | # it is unit-normal-ish 32 | mi = zca_i.min() 33 | ma = zca_i.max() 34 | print 'range', i, mi, ma 35 | assert mi > -3 36 | assert ma < 3 37 | plt.subplot(4, 2, 2 * i + 2) 38 | plt.imshow((zca_i.reshape(32, 32, 3) - mi) / (ma - mi + 1e-7)) 39 | if SHOW: 40 | plt.show() 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ distribute- and pip-enabled setup.py """ 5 | 6 | import logging 7 | import os 8 | import re 9 | 10 | # ----- overrides ----- 11 | 12 | # set these to anything but None to override the automatic defaults 13 | packages = None 14 | package_name = None 15 | package_data = None 16 | scripts = None 17 | # --------------------- 18 | 19 | 20 | # ----- control flags ----- 21 | 22 | # fallback to setuptools if distribute isn't found 23 | setup_tools_fallback = True 24 | 25 | # don't include subdir named 'tests' in package_data 26 | skip_tests = False 27 | 28 | # print some extra debugging info 29 | debug = True 30 | 31 | # ------------------------- 32 | 33 | if debug: logging.basicConfig(level=logging.DEBUG) 34 | # distribute import and testing 35 | try: 36 | import distribute_setup 37 | distribute_setup.use_setuptools() 38 | logging.debug("distribute_setup.py imported and used") 39 | except ImportError: 40 | # fallback to setuptools? 41 | # distribute_setup.py was not in this directory 42 | if not (setup_tools_fallback): 43 | import setuptools 44 | if not (hasattr(setuptools,'_distribute') and \ 45 | setuptools._distribute): 46 | raise ImportError("distribute was not found and fallback to setuptools was not allowed") 47 | else: 48 | logging.debug("distribute_setup.py not found, defaulted to system distribute") 49 | else: 50 | logging.debug("distribute_setup.py not found, defaulting to system setuptools") 51 | 52 | import setuptools 53 | 54 | def find_scripts(): 55 | return [s for s in setuptools.findall('scripts/') if os.path.splitext(s)[1] != '.pyc'] 56 | 57 | def package_to_path(package): 58 | """ 59 | Convert a package (as found by setuptools.find_packages) 60 | e.g. "foo.bar" to usable path 61 | e.g. 
"foo/bar" 62 | 63 | No idea if this works on windows 64 | """ 65 | return package.replace('.','/') 66 | 67 | def find_subdirectories(package): 68 | """ 69 | Get the subdirectories within a package 70 | This will include resources (non-submodules) and submodules 71 | """ 72 | try: 73 | subdirectories = os.walk(package_to_path(package)).next()[1] 74 | except StopIteration: 75 | subdirectories = [] 76 | return subdirectories 77 | 78 | def subdir_findall(dir, subdir): 79 | """ 80 | Find all files in a subdirectory and return paths relative to dir 81 | 82 | This is similar to (and uses) setuptools.findall 83 | However, the paths returned are in the form needed for package_data 84 | """ 85 | strip_n = len(dir.split('/')) 86 | path = '/'.join((dir, subdir)) 87 | return ['/'.join(s.split('/')[strip_n:]) for s in setuptools.findall(path)] 88 | 89 | def find_package_data(packages): 90 | """ 91 | For a list of packages, find the package_data 92 | 93 | This function scans the subdirectories of a package and considers all 94 | non-submodule subdirectories as resources, including them in 95 | the package_data 96 | 97 | Returns a dictionary suitable for setup(package_data=) 98 | """ 99 | package_data = {} 100 | for package in packages: 101 | package_data[package] = [] 102 | for subdir in find_subdirectories(package): 103 | if '.'.join((package, subdir)) in packages: # skip submodules 104 | logging.debug("skipping submodule %s/%s" % (package, subdir)) 105 | continue 106 | if skip_tests and (subdir == 'tests'): # skip tests 107 | logging.debug("skipping tests %s/%s" % (package, subdir)) 108 | continue 109 | package_data[package] += subdir_findall(package_to_path(package), subdir) 110 | return package_data 111 | 112 | # ----------- Override defaults here ---------------- 113 | if packages is None: packages = setuptools.find_packages() 114 | 115 | if len(packages) == 0: raise Exception("No valid packages found") 116 | 117 | if package_name is None: package_name = packages[0] 118 | 119 | if package_data is None: package_data = find_package_data(packages) 120 | 121 | if scripts is None: scripts = find_scripts() 122 | 123 | setuptools.setup( 124 | name = package_name, 125 | version = '0.0.1', 126 | packages = packages, 127 | scripts = scripts, 128 | url = 'http://github.com/jaberg/hyperopt-nnet/', 129 | author = 'James Bergstra', 130 | author_email = 'anon@anon.com', 131 | description = 'Hyperparameter optimization for neural networks', 132 | long_description = open('README.txt').read(), 133 | classifiers = [ 134 | 'Development Status :: 3 - Alpha', 135 | 'Intended Audience :: Education', 136 | 'Intended Audience :: Science/Research', 137 | 'Intended Audience :: Developers', 138 | 'Environment :: Console', 139 | 'License :: OSI Approved :: BSD License', 140 | 'Operating System :: MacOS :: MacOS X', 141 | 'Operating System :: Microsoft :: Windows', 142 | 'Operating System :: POSIX', 143 | 'Operating System :: Unix', 144 | 'Programming Language :: Python', 145 | 'Topic :: Scientific/Engineering', 146 | 'Topic :: Software Development', 147 | ], 148 | platforms = ['Linux', 'OS-X', 'Windows'], 149 | license = 'BSD (3-clause)', 150 | keywords = 'hyperparameter model selection neural networks machine learning', 151 | package_data = package_data, 152 | include_package_data = True, 153 | install_requires = reversed([ 154 | 'numpy', 155 | 'scipy', 156 | 'nose', 157 | 'Theano']), 158 | ) 159 | --------------------------------------------------------------------------------