├── LICENSE.txt ├── README.md ├── batch_norm.py ├── binary_connect.py ├── cifar10.py ├── mnist.py ├── svhn.py └── svhn_preprocessing.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. 
This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 | <one line to give the program's name and a brief idea of what it does.>
294 | Copyright (C) <year> <name of author>
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 | <signature of Ty Coon>, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **Please check out our latest work,
2 | [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1](http://arxiv.org/abs/1602.02830),
3 | and the associated [github repository](https://github.com/MatthieuCourbariaux/BinaryNet).**
4 |
5 | # BinaryConnect
6 |
7 | ## Motivations
8 |
9 | The goal of this repository is to enable the reproduction of the experiments described in
10 | [BinaryConnect: Training Deep Neural Networks with binary weights during propagations](http://arxiv.org/abs/1511.00363).
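For intuition, the core operation is simple: real-valued weights are kept for the parameter updates, while binarized weights (sampled stochastically or rounded deterministically) are used during the propagations. Below is a minimal NumPy sketch of this scheme, mirroring the binarization function in binary_connect.py (the helper names and the standalone-NumPy setting are illustrative, not part of the training code):

    import numpy as np

    def hard_sigmoid(x):
        # map [-1, 1] to [0, 1], clipping values outside this range
        return np.clip((x + 1.) / 2., 0., 1.)

    def binarize(W, H=1., stochastic=False, rng=np.random):
        p = hard_sigmoid(W / H)  # probability of drawing +H
        if stochastic:
            Wb = rng.binomial(n=1, p=p, size=W.shape)  # sample 0/1
        else:
            Wb = np.round(p)  # deterministic: round to nearest
        return np.where(Wb, H, -H).astype(np.float32)  # 0/1 -> -H/+H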
11 | You may want to check out our subsequent work:
12 | * [Neural Networks with Few Multiplications](http://arxiv.org/abs/1510.03009)
13 | * [BinaryNet: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1](http://arxiv.org/abs/1602.02830)
14 |
15 | ## Requirements
16 |
17 | * Python, Numpy, Scipy
18 | * [Theano](http://deeplearning.net/software/theano/install.html) (Bleeding edge version)
19 | * [Pylearn2](http://deeplearning.net/software/pylearn2/)
20 | * [Lasagne](http://lasagne.readthedocs.org/en/latest/user/installation.html)
21 | * [PyTables](http://www.pytables.org/usersguide/installation.html) (only for the SVHN dataset)
22 | * a fast Nvidia GPU or a large amount of patience
23 |
24 | ## MNIST
25 |
26 |     python mnist.py
27 |
28 | This Python script trains an MLP on MNIST with the stochastic version of BinaryConnect.
29 | It should run for about 30 minutes on a GTX 680 GPU.
30 | The final test error should be around **1.15%**.
31 | Please note that this is NOT the experiment reported in the article (which is in the "master" branch of the repository).
32 |
33 | ## CIFAR-10
34 |
35 |     python cifar10.py
36 |
37 | This Python script trains a CNN on CIFAR-10 with the stochastic version of BinaryConnect.
38 | It should run for about 20 hours on a Titan Black GPU.
39 | The final test error should be around **8.27%**.
40 |
41 | ## SVHN
42 |
43 |     export SVHN_LOCAL_PATH=/Tmp/SVHN/
44 |     python svhn_preprocessing.py
45 |
46 | This Python script (taken from Pylearn2) computes a preprocessed (GCN and LCN) version of the SVHN dataset in a temporary folder (SVHN_LOCAL_PATH).
47 |
48 |     python svhn.py
49 |
50 | This Python script trains a CNN on SVHN with the stochastic version of BinaryConnect.
51 | It should run for about 2 days on a Titan Black GPU.
52 | The final test error should be around **2.15%**.
53 |
54 | ## How to play with it
55 |
56 | The Python scripts mnist.py, cifar10.py and svhn.py contain all the relevant hyperparameters.
57 | It is very straightforward to modify them.
58 | binary_connect.py contains the binarization function (called binarization).
59 |
60 | Have fun!
61 |
--------------------------------------------------------------------------------
/batch_norm.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Preliminary implementation of batch normalization for Lasagne.
5 | Does not include a way to properly compute the normalization factors over the
6 | full training set for testing, but can be used as a drop-in for training and
7 | validation.
8 |
9 | Author: Jan Schlüter
10 | """
11 |
12 | import numpy as np
13 | import lasagne
14 | import theano
15 | import theano.tensor as T
16 |
17 | class BatchNormLayer(lasagne.layers.Layer):
18 |
19 | def __init__(self, incoming, axes=None, epsilon=0.01, alpha=0.5,
20 | nonlinearity=None, **kwargs):
21 | """
22 | Instantiates a layer performing batch normalization of its inputs,
23 | following Ioffe et al. (http://arxiv.org/abs/1502.03167).
24 | 25 | @param incoming: `Layer` instance or expected input shape 26 | @param axes: int or tuple of int denoting the axes to normalize over; 27 | defaults to all axes except for the second if omitted (this will 28 | do the correct thing for dense layers and convolutional layers) 29 | @param epsilon: small constant added to the standard deviation before 30 | dividing by it, to avoid numeric problems 31 | @param alpha: coefficient for the exponential moving average of 32 | batch-wise means and standard deviations computed during training; 33 | the larger, the more it will depend on the last batches seen 34 | @param nonlinearity: nonlinearity to apply to the output (optional) 35 | """ 36 | super(BatchNormLayer, self).__init__(incoming, **kwargs) 37 | if axes is None: 38 | # default: normalize over all but the second axis 39 | axes = (0,) + tuple(range(2, len(self.input_shape))) 40 | elif isinstance(axes, int): 41 | axes = (axes,) 42 | self.axes = axes 43 | self.epsilon = epsilon 44 | self.alpha = alpha 45 | if nonlinearity is None: 46 | nonlinearity = lasagne.nonlinearities.identity 47 | self.nonlinearity = nonlinearity 48 | shape = list(self.input_shape) 49 | broadcast = [False] * len(shape) 50 | for axis in self.axes: 51 | shape[axis] = 1 52 | broadcast[axis] = True 53 | if any(size is None for size in shape): 54 | raise ValueError("BatchNormLayer needs specified input sizes for " 55 | "all dimensions/axes not normalized over.") 56 | dtype = theano.config.floatX 57 | self.mean = self.add_param(lasagne.init.Constant(0), shape, 'mean', 58 | trainable=False, regularizable=False) 59 | self.std = self.add_param(lasagne.init.Constant(1), shape, 'std', 60 | trainable=False, regularizable=False) 61 | self.beta = self.add_param(lasagne.init.Constant(0), shape, 'beta', 62 | trainable=True, regularizable=True) 63 | self.gamma = self.add_param(lasagne.init.Constant(1), shape, 'gamma', 64 | trainable=True, regularizable=False) 65 | 66 | def get_output_for(self, input, deterministic=False, **kwargs): 67 | if deterministic: 68 | # use stored mean and std 69 | mean = self.mean 70 | std = self.std 71 | else: 72 | # use this batch's mean and std 73 | mean = input.mean(self.axes, keepdims=True) 74 | std = input.std(self.axes, keepdims=True) 75 | # and update the stored mean and std: 76 | # we create (memory-aliased) clones of the stored mean and std 77 | running_mean = theano.clone(self.mean, share_inputs=False) 78 | running_std = theano.clone(self.std, share_inputs=False) 79 | # set a default update for them 80 | running_mean.default_update = ((1 - self.alpha) * running_mean + 81 | self.alpha * mean) 82 | running_std.default_update = ((1 - self.alpha) * running_std + 83 | self.alpha * std) 84 | # and include them in the graph so their default updates will be 85 | # applied (although the expressions will be optimized away later) 86 | mean += 0 * running_mean 87 | std += 0 * running_std 88 | std += self.epsilon 89 | mean = T.addbroadcast(mean, *self.axes) 90 | std = T.addbroadcast(std, *self.axes) 91 | beta = T.addbroadcast(self.beta, *self.axes) 92 | gamma = T.addbroadcast(self.gamma, *self.axes) 93 | normalized = (input - mean) * (gamma / std) + beta 94 | return self.nonlinearity(normalized) 95 | 96 | def batch_norm(layer): 97 | """ 98 | Convenience function to apply batch normalization to a given layer's output. 
99 | Will steal the layer's nonlinearity if there is one (effectively introducing
100 | the normalization right before the nonlinearity), and will remove the
101 | layer's bias if there is one (because it would be redundant).
102 |
103 | @param layer: The `Layer` instance to apply the normalization to; note that
104 | it will be irreversibly modified as specified above
105 | @return: A `BatchNormLayer` instance stacked on the given `layer`
106 | """
107 | nonlinearity = getattr(layer, 'nonlinearity', None)
108 | if nonlinearity is not None:
109 | layer.nonlinearity = lasagne.nonlinearities.identity
110 | if hasattr(layer, 'b'):
111 | del layer.params[layer.b]
112 | layer.b = None
113 | return BatchNormLayer(layer, nonlinearity=nonlinearity)
114 |
--------------------------------------------------------------------------------
/binary_connect.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 Matthieu Courbariaux
2 |
3 | # This file is part of BinaryConnect.
4 |
5 | # BinaryConnect is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # BinaryConnect is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with BinaryConnect. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import time
19 |
20 | from collections import OrderedDict
21 |
22 | import numpy as np
23 |
24 | # specifying the gpu to use
25 | # import theano.sandbox.cuda
26 | # theano.sandbox.cuda.use('gpu1')
27 | import theano
28 | import theano.tensor as T
29 |
30 | import lasagne
31 |
32 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
33 |
34 | def hard_sigmoid(x):
35 | return T.clip((x+1.)/2.,0,1)
36 |
37 | # The binarization function
38 | def binarization(W,H,binary=True,deterministic=False,stochastic=False,srng=None):
39 |
40 | # (deterministic == True) <-> test-time <-> inference-time
41 | if not binary or (deterministic and stochastic):
42 | # print("not binary")
43 | Wb = W
44 |
45 | else:
46 |
47 | # [-1,1] -> [0,1]
48 | Wb = hard_sigmoid(W/H)
49 |
50 | # Stochastic BinaryConnect
51 | if stochastic:
52 |
53 | # print("stoch")
54 | Wb = T.cast(srng.binomial(n=1, p=Wb, size=T.shape(Wb)), theano.config.floatX)
55 |
56 | # Deterministic BinaryConnect (round to nearest)
57 | else:
58 | # print("det")
59 | Wb = T.round(Wb)
60 |
61 | # 0 or 1 -> -1 or 1
62 | Wb = T.cast(T.switch(Wb,H,-H), theano.config.floatX)
63 |
64 | return Wb
65 |
66 | # This class extends the Lasagne DenseLayer to support BinaryConnect
67 | class DenseLayer(lasagne.layers.DenseLayer):
68 |
69 | def __init__(self, incoming, num_units,
70 | binary = True, stochastic = True, H=1.,W_LR_scale="Glorot", **kwargs):
71 |
72 | self.binary = binary
73 | self.stochastic = stochastic
74 |
75 | self.H = H
76 | if H == "Glorot":
77 | num_inputs = int(np.prod(incoming.output_shape[1:]))
78 | self.H = np.float32(np.sqrt(1.5/ (num_inputs + num_units)))
79 | # print("H = "+str(self.H))
80 |
81 | self.W_LR_scale = W_LR_scale
82 | if W_LR_scale == "Glorot":
83 | num_inputs = int(np.prod(incoming.output_shape[1:]))
84 | self.W_LR_scale =
np.float32(1./np.sqrt(1.5/ (num_inputs + num_units)))
85 |
86 | self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))
87 |
88 | if self.binary:
89 | super(DenseLayer, self).__init__(incoming, num_units, W=lasagne.init.Uniform((-self.H,self.H)), **kwargs)
90 | # add the binary tag to weights
91 | self.params[self.W]=set(['binary'])
92 |
93 | else:
94 | super(DenseLayer, self).__init__(incoming, num_units, **kwargs)
95 |
96 | def get_output_for(self, input, deterministic=False, **kwargs):
97 |
98 | self.Wb = binarization(self.W,self.H,self.binary,deterministic,self.stochastic,self._srng)
99 | Wr = self.W
100 | self.W = self.Wb
101 |
102 | rvalue = super(DenseLayer, self).get_output_for(input, **kwargs)
103 |
104 | self.W = Wr
105 |
106 | return rvalue
107 |
108 | # This class extends the Lasagne Conv2DLayer to support BinaryConnect
109 | class Conv2DLayer(lasagne.layers.Conv2DLayer):
110 |
111 | def __init__(self, incoming, num_filters, filter_size,
112 | binary = True, stochastic = True, H=1.,W_LR_scale="Glorot", **kwargs):
113 |
114 | self.binary = binary
115 | self.stochastic = stochastic
116 |
117 | self.H = H
118 | if H == "Glorot":
119 | num_inputs = int(np.prod(filter_size)*incoming.output_shape[1])
120 | num_units = int(np.prod(filter_size)*num_filters) # theoretically, I should divide num_units by the pool_shape
121 | self.H = np.float32(np.sqrt(1.5 / (num_inputs + num_units)))
122 | # print("H = "+str(self.H))
123 |
124 | self.W_LR_scale = W_LR_scale
125 | if W_LR_scale == "Glorot":
126 | num_inputs = int(np.prod(filter_size)*incoming.output_shape[1])
127 | num_units = int(np.prod(filter_size)*num_filters) # theoretically, I should divide num_units by the pool_shape
128 | self.W_LR_scale = np.float32(1./np.sqrt(1.5 / (num_inputs + num_units)))
129 | # print("W_LR_scale = "+str(self.W_LR_scale))
130 |
131 | self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))
132 |
133 | if self.binary:
134 | super(Conv2DLayer, self).__init__(incoming, num_filters, filter_size, W=lasagne.init.Uniform((-self.H,self.H)), **kwargs)
135 | # add the binary tag to weights
136 | self.params[self.W]=set(['binary'])
137 | else:
138 | super(Conv2DLayer, self).__init__(incoming, num_filters, filter_size, **kwargs)
139 |
140 | def convolve(self, input, deterministic=False, **kwargs):
141 |
142 | self.Wb = binarization(self.W,self.H,self.binary,deterministic,self.stochastic,self._srng)
143 | Wr = self.W
144 | self.W = self.Wb
145 |
146 | rvalue = super(Conv2DLayer, self).convolve(input, **kwargs)
147 |
148 | self.W = Wr
149 |
150 | return rvalue
151 |
152 | # This function computes the gradient of the binary weights
153 | def compute_grads(loss,network):
154 |
155 | layers = lasagne.layers.get_all_layers(network)
156 | grads = []
157 |
158 | for layer in layers:
159 |
160 | params = layer.get_params(binary=True)
161 | if params:
162 | # print(params[0].name)
163 | grads.append(theano.grad(loss, wrt=layer.Wb))
164 |
165 | return grads
166 |
167 | # This function scales the weight updates and clips the weights after the parameter update
168 | def clipping_scaling(updates,network):
169 |
170 | layers = lasagne.layers.get_all_layers(network)
171 | updates = OrderedDict(updates)
172 |
173 | for layer in layers:
174 |
175 | params = layer.get_params(binary=True)
176 | for param in params:
177 | print("W_LR_scale = "+str(layer.W_LR_scale))
178 | print("H = "+str(layer.H))
179 | updates[param] = param + layer.W_LR_scale*(updates[param] - param)
180 | updates[param] = T.clip(updates[param], -layer.H, layer.H)
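# (the clip keeps the real-valued weights inside [-H, +H]; without it they could
# grow arbitrarily large with no further effect on the binarized weights)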
181 |
182 | return updates
183 |
184 | # Given a dataset and a model, this function trains the model on the dataset for several epochs
185 | # (There is no default train function in Lasagne yet)
186 | def train(train_fn,val_fn,
187 | batch_size,
188 | LR_start,LR_decay,
189 | num_epochs,
190 | X_train,y_train,
191 | X_val,y_val,
192 | X_test,y_test):
193 |
194 | # A function which shuffles a dataset
195 | def shuffle(X,y):
196 |
197 | shuffled_range = range(len(X))
198 | np.random.shuffle(shuffled_range)
199 | # print(shuffled_range[0:10])
200 |
201 | new_X = np.copy(X)
202 | new_y = np.copy(y)
203 |
204 | for i in range(len(X)):
205 |
206 | new_X[i] = X[shuffled_range[i]]
207 | new_y[i] = y[shuffled_range[i]]
208 |
209 | return new_X,new_y
210 |
211 | # This function trains the model for a full epoch (on the whole dataset)
212 | def train_epoch(X,y,LR):
213 |
214 | loss = 0
215 | batches = len(X)/batch_size
216 |
217 | for i in range(batches):
218 | loss += train_fn(X[i*batch_size:(i+1)*batch_size],y[i*batch_size:(i+1)*batch_size],LR)
219 |
220 | loss/=batches
221 |
222 | return loss
223 |
224 | # This function tests the model for a full epoch (on the whole dataset)
225 | def val_epoch(X,y):
226 |
227 | err = 0
228 | loss = 0
229 | batches = len(X)/batch_size
230 |
231 | for i in range(batches):
232 | new_loss, new_err = val_fn(X[i*batch_size:(i+1)*batch_size], y[i*batch_size:(i+1)*batch_size])
233 | err += new_err
234 | loss += new_loss
235 |
236 | err = err / batches * 100
237 | loss /= batches
238 |
239 | return err, loss
240 |
241 | # shuffle the train set
242 | X_train,y_train = shuffle(X_train,y_train)
243 | best_val_err = 100
244 | best_epoch = 1
245 | LR = LR_start
246 |
247 | # We iterate over epochs:
248 | for epoch in range(num_epochs):
249 |
250 | start_time = time.time()
251 |
252 | train_loss = train_epoch(X_train,y_train,LR)
253 | X_train,y_train = shuffle(X_train,y_train)
254 |
255 | val_err, val_loss = val_epoch(X_val,y_val)
256 |
257 | # test if validation error went down
258 | if val_err <= best_val_err:
259 |
260 | best_val_err = val_err
261 | best_epoch = epoch+1
262 |
263 | test_err, test_loss = val_epoch(X_test,y_test)
264 |
265 | epoch_duration = time.time() - start_time
266 |
267 | # Then we print the results for this epoch:
268 | print("Epoch "+str(epoch + 1)+" of "+str(num_epochs)+" took "+str(epoch_duration)+"s")
269 | print(" LR: "+str(LR))
270 | print(" training loss: "+str(train_loss))
271 | print(" validation loss: "+str(val_loss))
272 | print(" validation error rate: "+str(val_err)+"%")
273 | print(" best epoch: "+str(best_epoch))
274 | print(" best validation error rate: "+str(best_val_err)+"%")
275 | print(" test loss: "+str(test_loss))
276 | print(" test error rate: "+str(test_err)+"%")
277 |
278 | # decay the LR
279 | LR *= LR_decay
--------------------------------------------------------------------------------
/cifar10.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 Matthieu Courbariaux
2 |
3 | # This file is part of BinaryConnect.
4 |
5 | # BinaryConnect is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # BinaryConnect is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with BinaryConnect. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | from __future__ import print_function
19 |
20 | import sys
21 | import os
22 | import time
23 |
24 | import numpy as np
25 | np.random.seed(1234) # for reproducibility?
26 |
27 | # specifying the gpu to use
28 | # import theano.sandbox.cuda
29 | # theano.sandbox.cuda.use('gpu1')
30 | import theano
31 | import theano.tensor as T
32 |
33 | import lasagne
34 |
35 | import cPickle as pickle
36 | import gzip
37 |
38 | import batch_norm
39 | import binary_connect
40 |
41 | from pylearn2.datasets.zca_dataset import ZCA_Dataset
42 | from pylearn2.utils import serial
43 |
44 | from collections import OrderedDict
45 |
46 | if __name__ == "__main__":
47 |
48 | # BN parameters
49 | batch_size = 50
50 | print("batch_size = "+str(batch_size))
51 | # alpha is the exponential moving average factor
52 | alpha = .1
53 | print("alpha = "+str(alpha))
54 | epsilon = 1e-4
55 | print("epsilon = "+str(epsilon))
56 |
57 | # Training parameters
58 | num_epochs = 500
59 | print("num_epochs = "+str(num_epochs))
60 |
61 | # BinaryConnect
62 | binary = True
63 | print("binary = "+str(binary))
64 | stochastic = True
65 | print("stochastic = "+str(stochastic))
66 | # (-H,+H) are the two binary values
67 | # H = "Glorot"
68 | H = 1.
69 | print("H = "+str(H))
70 | # W_LR_scale = 1.
71 | W_LR_scale = "Glorot" # "Glorot" means we are using the coefficients from Glorot's paper
72 | print("W_LR_scale = "+str(W_LR_scale))
73 |
74 | # Decaying LR
75 | LR_start = 0.003
76 | print("LR_start = "+str(LR_start))
77 | LR_fin = 0.000002
78 | print("LR_fin = "+str(LR_fin))
79 | LR_decay = (LR_fin/LR_start)**(1./num_epochs)
80 | print("LR_decay = "+str(LR_decay))
81 | # BTW, LR decay might be good for the BN moving average...
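# Note: binary_connect.train multiplies LR by LR_decay once per epoch, so after
# num_epochs epochs the LR is LR_start * LR_decay**num_epochs
# = LR_start * (LR_fin/LR_start) = LR_fin: an exponential glide from LR_start to LR_fin.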
82 | 83 | train_set_size = 45000 84 | print("train_set_size = "+str(train_set_size)) 85 | 86 | print('Loading CIFAR-10 dataset...') 87 | 88 | preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/preprocessor.pkl") 89 | train_set = ZCA_Dataset( 90 | preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"), 91 | preprocessor = preprocessor, 92 | start=0, stop = train_set_size) 93 | valid_set = ZCA_Dataset( 94 | preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"), 95 | preprocessor = preprocessor, 96 | start=45000, stop = 50000) 97 | test_set = ZCA_Dataset( 98 | preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/test.pkl"), 99 | preprocessor = preprocessor) 100 | 101 | # bc01 format 102 | # print train_set.X.shape 103 | train_set.X = train_set.X.reshape(-1,3,32,32) 104 | valid_set.X = valid_set.X.reshape(-1,3,32,32) 105 | test_set.X = test_set.X.reshape(-1,3,32,32) 106 | 107 | # flatten targets 108 | train_set.y = np.hstack(train_set.y) 109 | valid_set.y = np.hstack(valid_set.y) 110 | test_set.y = np.hstack(test_set.y) 111 | 112 | # Onehot the targets 113 | train_set.y = np.float32(np.eye(10)[train_set.y]) 114 | valid_set.y = np.float32(np.eye(10)[valid_set.y]) 115 | test_set.y = np.float32(np.eye(10)[test_set.y]) 116 | 117 | # for hinge loss 118 | train_set.y = 2* train_set.y - 1. 119 | valid_set.y = 2* valid_set.y - 1. 120 | test_set.y = 2* test_set.y - 1. 121 | 122 | print('Building the CNN...') 123 | 124 | # Prepare Theano variables for inputs and targets 125 | input = T.tensor4('inputs') 126 | target = T.matrix('targets') 127 | LR = T.scalar('LR', dtype=theano.config.floatX) 128 | 129 | cnn = lasagne.layers.InputLayer( 130 | shape=(None, 3, 32, 32), 131 | input_var=input) 132 | 133 | # 128C3-128C3-P2 134 | cnn = binary_connect.Conv2DLayer( 135 | cnn, 136 | binary=binary, 137 | stochastic=stochastic, 138 | H=H, 139 | W_LR_scale=W_LR_scale, 140 | num_filters=128, 141 | filter_size=(3, 3), 142 | pad=1, 143 | nonlinearity=lasagne.nonlinearities.identity) 144 | 145 | cnn = batch_norm.BatchNormLayer( 146 | cnn, 147 | epsilon=epsilon, 148 | alpha=alpha, 149 | nonlinearity=lasagne.nonlinearities.rectify) 150 | 151 | cnn = binary_connect.Conv2DLayer( 152 | cnn, 153 | binary=binary, 154 | stochastic=stochastic, 155 | H=H, 156 | W_LR_scale=W_LR_scale, 157 | num_filters=128, 158 | filter_size=(3, 3), 159 | pad=1, 160 | nonlinearity=lasagne.nonlinearities.identity) 161 | 162 | cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2)) 163 | 164 | cnn = batch_norm.BatchNormLayer( 165 | cnn, 166 | epsilon=epsilon, 167 | alpha=alpha, 168 | nonlinearity=lasagne.nonlinearities.rectify) 169 | 170 | # 256C3-256C3-P2 171 | cnn = binary_connect.Conv2DLayer( 172 | cnn, 173 | binary=binary, 174 | stochastic=stochastic, 175 | H=H, 176 | W_LR_scale=W_LR_scale, 177 | num_filters=256, 178 | filter_size=(3, 3), 179 | pad=1, 180 | nonlinearity=lasagne.nonlinearities.identity) 181 | 182 | cnn = batch_norm.BatchNormLayer( 183 | cnn, 184 | epsilon=epsilon, 185 | alpha=alpha, 186 | nonlinearity=lasagne.nonlinearities.rectify) 187 | 188 | cnn = binary_connect.Conv2DLayer( 189 | cnn, 190 | binary=binary, 191 | stochastic=stochastic, 192 | H=H, 193 | W_LR_scale=W_LR_scale, 194 | num_filters=256, 195 | filter_size=(3, 3), 196 | pad=1, 197 | nonlinearity=lasagne.nonlinearities.identity) 198 | 199 | cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2)) 200 | 201 | cnn = 
batch_norm.BatchNormLayer( 202 | cnn, 203 | epsilon=epsilon, 204 | alpha=alpha, 205 | nonlinearity=lasagne.nonlinearities.rectify) 206 | 207 | # 512C3-512C3-P2 208 | cnn = binary_connect.Conv2DLayer( 209 | cnn, 210 | binary=binary, 211 | stochastic=stochastic, 212 | H=H, 213 | W_LR_scale=W_LR_scale, 214 | num_filters=512, 215 | filter_size=(3, 3), 216 | pad=1, 217 | nonlinearity=lasagne.nonlinearities.identity) 218 | 219 | cnn = batch_norm.BatchNormLayer( 220 | cnn, 221 | epsilon=epsilon, 222 | alpha=alpha, 223 | nonlinearity=lasagne.nonlinearities.rectify) 224 | 225 | cnn = binary_connect.Conv2DLayer( 226 | cnn, 227 | binary=binary, 228 | stochastic=stochastic, 229 | H=H, 230 | W_LR_scale=W_LR_scale, 231 | num_filters=512, 232 | filter_size=(3, 3), 233 | pad=1, 234 | nonlinearity=lasagne.nonlinearities.identity) 235 | 236 | cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2)) 237 | 238 | cnn = batch_norm.BatchNormLayer( 239 | cnn, 240 | epsilon=epsilon, 241 | alpha=alpha, 242 | nonlinearity=lasagne.nonlinearities.rectify) 243 | 244 | # print(cnn.output_shape) 245 | 246 | # 1024FP-1024FP-10FP 247 | cnn = binary_connect.DenseLayer( 248 | cnn, 249 | binary=binary, 250 | stochastic=stochastic, 251 | H=H, 252 | W_LR_scale=W_LR_scale, 253 | nonlinearity=lasagne.nonlinearities.identity, 254 | num_units=1024) 255 | 256 | cnn = batch_norm.BatchNormLayer( 257 | cnn, 258 | epsilon=epsilon, 259 | alpha=alpha, 260 | nonlinearity=lasagne.nonlinearities.rectify) 261 | 262 | cnn = binary_connect.DenseLayer( 263 | cnn, 264 | binary=binary, 265 | stochastic=stochastic, 266 | H=H, 267 | W_LR_scale=W_LR_scale, 268 | nonlinearity=lasagne.nonlinearities.identity, 269 | num_units=1024) 270 | 271 | cnn = batch_norm.BatchNormLayer( 272 | cnn, 273 | epsilon=epsilon, 274 | alpha=alpha, 275 | nonlinearity=lasagne.nonlinearities.rectify) 276 | 277 | cnn = binary_connect.DenseLayer( 278 | cnn, 279 | binary=binary, 280 | stochastic=stochastic, 281 | H=H, 282 | W_LR_scale=W_LR_scale, 283 | nonlinearity=lasagne.nonlinearities.identity, 284 | num_units=10) 285 | 286 | cnn = batch_norm.BatchNormLayer( 287 | cnn, 288 | epsilon=epsilon, 289 | alpha=alpha, 290 | nonlinearity=lasagne.nonlinearities.identity) 291 | 292 | train_output = lasagne.layers.get_output(cnn, deterministic=False) 293 | 294 | # squared hinge loss 295 | loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output))) 296 | 297 | if binary: 298 | 299 | # W updates 300 | W = lasagne.layers.get_all_params(cnn, binary=True) 301 | W_grads = binary_connect.compute_grads(loss,cnn) 302 | updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR) 303 | updates = binary_connect.clipping_scaling(updates,cnn) 304 | 305 | # other parameters updates 306 | params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False) 307 | updates = OrderedDict(updates.items() + lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR).items()) 308 | 309 | else: 310 | params = lasagne.layers.get_all_params(cnn, trainable=True) 311 | updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR) 312 | 313 | test_output = lasagne.layers.get_output(cnn, deterministic=True) 314 | test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output))) 315 | test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),dtype=theano.config.floatX) 316 | 317 | # Compile a function performing a training step on a mini-batch (by giving the updates dictionary) 318 | # and returning the corresponding training loss: 319 
| train_fn = theano.function([input, target, LR], loss, updates=updates)
320 |
321 | # Compile a second function computing the validation loss and accuracy:
322 | val_fn = theano.function([input, target], [test_loss, test_err])
323 |
324 | print('Training...')
325 |
326 | binary_connect.train(
327 | train_fn,val_fn,
328 | batch_size,
329 | LR_start,LR_decay,
330 | num_epochs,
331 | train_set.X,train_set.y,
332 | valid_set.X,valid_set.y,
333 | test_set.X,test_set.y)
334 |
335 | # print("display histogram")
336 |
337 | # W = lasagne.layers.get_all_layers(mlp)[2].W.get_value()
338 | # print(W.shape)
339 |
340 | # histogram = np.histogram(W,bins=1000,range=(-1.1,1.1))
341 | # np.savetxt(str(dropout_hidden)+str(binary)+str(stochastic)+str(H)+"_hist0.csv", histogram[0], delimiter=",")
342 | # np.savetxt(str(dropout_hidden)+str(binary)+str(stochastic)+str(H)+"_hist1.csv", histogram[1], delimiter=",")
343 |
344 | # Optionally, you could now dump the network weights to a file like this:
345 | # np.savez('model.npz', lasagne.layers.get_all_param_values(network))
--------------------------------------------------------------------------------
/mnist.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 Matthieu Courbariaux
2 |
3 | # This file is part of BinaryConnect.
4 |
5 | # BinaryConnect is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # BinaryConnect is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with BinaryConnect. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | from __future__ import print_function
19 |
20 | import sys
21 | import os
22 | import time
23 |
24 | import numpy as np
25 | np.random.seed(1234) # for reproducibility
26 |
27 | # specifying the gpu to use
28 | # import theano.sandbox.cuda
29 | # theano.sandbox.cuda.use('gpu1')
30 | import theano
31 | import theano.tensor as T
32 |
33 | import lasagne
34 |
35 | import cPickle as pickle
36 | import gzip
37 |
38 | import batch_norm
39 | import binary_connect
40 |
41 | from pylearn2.datasets.mnist import MNIST
42 | from pylearn2.utils import serial
43 |
44 | from collections import OrderedDict
45 |
46 | if __name__ == "__main__":
47 |
48 | # BN parameters
49 | batch_size = 100
50 | print("batch_size = "+str(batch_size))
51 | # alpha is the exponential moving average factor
52 | alpha = .15
53 | print("alpha = "+str(alpha))
54 | epsilon = 1e-4
55 | print("epsilon = "+str(epsilon))
56 |
57 | # MLP parameters
58 | num_units = 2048
59 | print("num_units = "+str(num_units))
60 | n_hidden_layers = 3
61 | print("n_hidden_layers = "+str(n_hidden_layers))
62 |
63 | # Training parameters
64 | num_epochs = 250
65 | print("num_epochs = "+str(num_epochs))
66 |
67 | # Dropout parameters
68 | dropout_in = 0. # 0. means no dropout
69 | print("dropout_in = "+str(dropout_in))
70 | dropout_hidden = 0.
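# (dropout_in and dropout_hidden are the drop probabilities handed to
# lasagne.layers.DropoutLayer below; 0. disables dropout entirely)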
71 | print("dropout_hidden = "+str(dropout_hidden)) 72 | 73 | # BinaryConnect 74 | binary = True 75 | print("binary = "+str(binary)) 76 | stochastic = True 77 | print("stochastic = "+str(stochastic)) 78 | # (-H,+H) are the two binary values 79 | # H = "Glorot" 80 | H = 1. 81 | print("H = "+str(H)) 82 | # W_LR_scale = 1. 83 | W_LR_scale = "Glorot" # "Glorot" means we are using the coefficients from Glorot's paper 84 | print("W_LR_scale = "+str(W_LR_scale)) 85 | 86 | # Decaying LR 87 | LR_start = .001 88 | print("LR_start = "+str(LR_start)) 89 | LR_fin = 0.000003 90 | print("LR_fin = "+str(LR_fin)) 91 | LR_decay = (LR_fin/LR_start)**(1./num_epochs) 92 | print("LR_decay = "+str(LR_decay)) 93 | # BTW, LR decay might good for the BN moving average... 94 | 95 | print('Loading MNIST dataset...') 96 | 97 | train_set = MNIST(which_set= 'train', start=0, stop = 50000, center = True) 98 | valid_set = MNIST(which_set= 'train', start=50000, stop = 60000, center = True) 99 | test_set = MNIST(which_set= 'test', center = True) 100 | 101 | # bc01 format 102 | # print train_set.X.shape 103 | train_set.X = train_set.X.reshape(-1, 1, 28, 28) 104 | valid_set.X = valid_set.X.reshape(-1, 1, 28, 28) 105 | test_set.X = test_set.X.reshape(-1, 1, 28, 28) 106 | 107 | # flatten targets 108 | train_set.y = np.hstack(train_set.y) 109 | valid_set.y = np.hstack(valid_set.y) 110 | test_set.y = np.hstack(test_set.y) 111 | 112 | # Onehot the targets 113 | train_set.y = np.float32(np.eye(10)[train_set.y]) 114 | valid_set.y = np.float32(np.eye(10)[valid_set.y]) 115 | test_set.y = np.float32(np.eye(10)[test_set.y]) 116 | 117 | # for hinge loss 118 | train_set.y = 2* train_set.y - 1. 119 | valid_set.y = 2* valid_set.y - 1. 120 | test_set.y = 2* test_set.y - 1. 121 | 122 | print('Building the MLP...') 123 | 124 | # Prepare Theano variables for inputs and targets 125 | input = T.tensor4('inputs') 126 | target = T.matrix('targets') 127 | LR = T.scalar('LR', dtype=theano.config.floatX) 128 | 129 | mlp = lasagne.layers.InputLayer( 130 | shape=(None, 1, 28, 28), 131 | input_var=input) 132 | 133 | mlp = lasagne.layers.DropoutLayer( 134 | mlp, 135 | p=dropout_in) 136 | 137 | for k in range(n_hidden_layers): 138 | 139 | mlp = binary_connect.DenseLayer( 140 | mlp, 141 | binary=binary, 142 | stochastic=stochastic, 143 | H=H, 144 | nonlinearity=lasagne.nonlinearities.identity, 145 | num_units=num_units) 146 | 147 | mlp = batch_norm.BatchNormLayer( 148 | mlp, 149 | epsilon=epsilon, 150 | alpha=alpha, 151 | nonlinearity=lasagne.nonlinearities.rectify) 152 | 153 | mlp = lasagne.layers.DropoutLayer( 154 | mlp, 155 | p=dropout_hidden) 156 | 157 | mlp = binary_connect.DenseLayer( 158 | mlp, 159 | binary=binary, 160 | stochastic=stochastic, 161 | H=H, 162 | nonlinearity=lasagne.nonlinearities.identity, 163 | num_units=10) 164 | 165 | mlp = batch_norm.BatchNormLayer( 166 | mlp, 167 | epsilon=epsilon, 168 | alpha=alpha, 169 | nonlinearity=lasagne.nonlinearities.identity) 170 | 171 | train_output = lasagne.layers.get_output(mlp, deterministic=False) 172 | 173 | # squared hinge loss 174 | loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output))) 175 | 176 | if binary: 177 | 178 | # W updates 179 | W = lasagne.layers.get_all_params(mlp, binary=True) 180 | W_grads = binary_connect.compute_grads(loss,mlp) 181 | updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR) 182 | updates = binary_connect.clipping_scaling(updates,mlp) 183 | 184 | # other parameters updates 185 | params = lasagne.layers.get_all_params(mlp, 
186 | updates = OrderedDict(updates.items() + lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR).items())
187 |
188 | else:
189 | params = lasagne.layers.get_all_params(mlp, trainable=True)
190 | updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)
191 |
192 | test_output = lasagne.layers.get_output(mlp, deterministic=True)
193 | test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output)))
194 | test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),dtype=theano.config.floatX)
195 |
196 | # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
197 | # and returning the corresponding training loss:
198 | train_fn = theano.function([input, target, LR], loss, updates=updates)
199 |
200 | # Compile a second function computing the validation loss and accuracy:
201 | val_fn = theano.function([input, target], [test_loss, test_err])
202 |
203 | print('Training...')
204 |
205 | binary_connect.train(
206 | train_fn,val_fn,
207 | batch_size,
208 | LR_start,LR_decay,
209 | num_epochs,
210 | train_set.X,train_set.y,
211 | valid_set.X,valid_set.y,
212 | test_set.X,test_set.y)
213 |
214 | # print("display histogram")
215 |
216 | # W = lasagne.layers.get_all_layers(mlp)[2].W.get_value()
217 | # print(W.shape)
218 |
219 | # histogram = np.histogram(W,bins=1000,range=(-1.1,1.1))
220 | # np.savetxt(str(dropout_hidden)+str(binary)+str(stochastic)+str(H)+"_hist0.csv", histogram[0], delimiter=",")
221 | # np.savetxt(str(dropout_hidden)+str(binary)+str(stochastic)+str(H)+"_hist1.csv", histogram[1], delimiter=",")
222 |
223 | # Optionally, you could now dump the network weights to a file like this:
224 | # np.savez('model.npz', lasagne.layers.get_all_param_values(network))
--------------------------------------------------------------------------------
/svhn.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 Matthieu Courbariaux
2 |
3 | # This file is part of BinaryConnect.
4 |
5 | # BinaryConnect is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # BinaryConnect is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with BinaryConnect. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | from __future__ import print_function
19 |
20 | import sys
21 | import os
22 | import time
23 |
24 | import numpy as np
25 | np.random.seed(1234) # for reproducibility?
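# (the seed fixes the NumPy-level randomness, notably the epoch-wise
# training-set shuffling done inside binary_connect.train)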
26 |
27 | # specifying the gpu to use
28 | # import theano.sandbox.cuda
29 | # theano.sandbox.cuda.use('gpu1')
30 | import theano
31 | import theano.tensor as T
32 |
33 | import lasagne
34 |
35 | import cPickle as pickle
36 | import gzip
37 |
38 | import batch_norm
39 | import binary_connect
40 |
41 | from pylearn2.datasets.svhn import SVHN
42 | from pylearn2.utils import serial
43 |
44 | from collections import OrderedDict
45 |
46 | if __name__ == "__main__":
47 |
48 | # Batch Normalization parameters
49 | batch_size = 50
50 | print("batch_size = "+str(batch_size))
51 | # alpha is the exponential moving average factor
52 | alpha = .1
53 | print("alpha = "+str(alpha))
54 | epsilon = 1e-4
55 | print("epsilon = "+str(epsilon))
56 |
57 | # Training parameters
58 | num_epochs = 200
59 | print("num_epochs = "+str(num_epochs))
60 |
61 | # BinaryConnect
62 | binary = True
63 | print("binary = "+str(binary))
64 | stochastic = True
65 | print("stochastic = "+str(stochastic))
66 | # (-H,+H) are the two binary values
67 | # H = "Glorot"
68 | H = 1.
69 | print("H = "+str(H))
70 | # W_LR_scale = 1.
71 | W_LR_scale = "Glorot" # "Glorot" means we are using the coefficients from Glorot's paper
72 | print("W_LR_scale = "+str(W_LR_scale))
73 |
74 | # Decaying LR
75 | LR_start = 0.01
76 | print("LR_start = "+str(LR_start))
77 | LR_fin = 0.000003
78 | print("LR_fin = "+str(LR_fin))
79 | LR_decay = (LR_fin/LR_start)**(1./num_epochs)
80 | print("LR_decay = "+str(LR_decay))
81 | # BTW, LR decay might be good for the BN moving average...
82 |
83 | print('Loading SVHN dataset')
84 |
85 | train_set = SVHN(
86 | which_set= 'splitted_train',
87 | path= "${SVHN_LOCAL_PATH}",
88 | axes= ['b', 'c', 0, 1])
89 |
90 | valid_set = SVHN(
91 | which_set= 'valid',
92 | path= "${SVHN_LOCAL_PATH}",
93 | axes= ['b', 'c', 0, 1])
94 |
95 | test_set = SVHN(
96 | which_set= 'test',
97 | path= "${SVHN_LOCAL_PATH}",
98 | axes= ['b', 'c', 0, 1])
99 |
100 | # bc01 format
101 | # print train_set.X.shape
102 | train_set.X = np.reshape(train_set.X,(-1,3,32,32))
103 | valid_set.X = np.reshape(valid_set.X,(-1,3,32,32))
104 | test_set.X = np.reshape(test_set.X,(-1,3,32,32))
105 |
106 | # for hinge loss (targets are already onehot)
107 | train_set.y = np.subtract(np.multiply(2,train_set.y),1.)
108 | valid_set.y = np.subtract(np.multiply(2,valid_set.y),1.)
109 | test_set.y = np.subtract(np.multiply(2,test_set.y),1.)
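# (the one-hot {0,1} targets become {-1,+1} codes, e.g. [0,0,1,...] -> [-1,-1,+1,...],
# which is the form the squared hinge loss below expects: it penalizes target*output < 1)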
110 | 
111 |     print('Building the CNN...')
112 | 
113 |     # Prepare Theano variables for inputs and targets
114 |     input = T.tensor4('inputs')
115 |     target = T.matrix('targets')
116 |     LR = T.scalar('LR', dtype=theano.config.floatX)
117 | 
118 |     cnn = lasagne.layers.InputLayer(
119 |         shape=(None, 3, 32, 32),
120 |         input_var=input)
121 | 
122 |     # 64C3-64C3-P2
123 |     cnn = binary_connect.Conv2DLayer(
124 |         cnn,
125 |         binary=binary,
126 |         stochastic=stochastic,
127 |         H=H,
128 |         W_LR_scale=W_LR_scale,
129 |         num_filters=64,
130 |         filter_size=(3, 3),
131 |         pad=1,
132 |         nonlinearity=lasagne.nonlinearities.identity)
133 | 
134 |     cnn = batch_norm.BatchNormLayer(
135 |         cnn,
136 |         epsilon=epsilon,
137 |         alpha=alpha,
138 |         nonlinearity=lasagne.nonlinearities.rectify)
139 | 
140 |     cnn = binary_connect.Conv2DLayer(
141 |         cnn,
142 |         binary=binary,
143 |         stochastic=stochastic,
144 |         H=H,
145 |         W_LR_scale=W_LR_scale,
146 |         num_filters=64,
147 |         filter_size=(3, 3),
148 |         pad=1,
149 |         nonlinearity=lasagne.nonlinearities.identity)
150 | 
151 |     cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
152 | 
153 |     cnn = batch_norm.BatchNormLayer(
154 |         cnn,
155 |         epsilon=epsilon,
156 |         alpha=alpha,
157 |         nonlinearity=lasagne.nonlinearities.rectify)
158 | 
159 |     # 128C3-128C3-P2
160 |     cnn = binary_connect.Conv2DLayer(
161 |         cnn,
162 |         binary=binary,
163 |         stochastic=stochastic,
164 |         H=H,
165 |         W_LR_scale=W_LR_scale,
166 |         num_filters=128,
167 |         filter_size=(3, 3),
168 |         pad=1,
169 |         nonlinearity=lasagne.nonlinearities.identity)
170 | 
171 |     cnn = batch_norm.BatchNormLayer(
172 |         cnn,
173 |         epsilon=epsilon,
174 |         alpha=alpha,
175 |         nonlinearity=lasagne.nonlinearities.rectify)
176 | 
177 |     cnn = binary_connect.Conv2DLayer(
178 |         cnn,
179 |         binary=binary,
180 |         stochastic=stochastic,
181 |         H=H,
182 |         W_LR_scale=W_LR_scale,
183 |         num_filters=128,
184 |         filter_size=(3, 3),
185 |         pad=1,
186 |         nonlinearity=lasagne.nonlinearities.identity)
187 | 
188 |     cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
189 | 
190 |     cnn = batch_norm.BatchNormLayer(
191 |         cnn,
192 |         epsilon=epsilon,
193 |         alpha=alpha,
194 |         nonlinearity=lasagne.nonlinearities.rectify)
195 | 
196 |     # 256C3-256C3-P2
197 |     cnn = binary_connect.Conv2DLayer(
198 |         cnn,
199 |         binary=binary,
200 |         stochastic=stochastic,
201 |         H=H,
202 |         W_LR_scale=W_LR_scale,
203 |         num_filters=256,
204 |         filter_size=(3, 3),
205 |         pad=1,
206 |         nonlinearity=lasagne.nonlinearities.identity)
207 | 
208 |     cnn = batch_norm.BatchNormLayer(
209 |         cnn,
210 |         epsilon=epsilon,
211 |         alpha=alpha,
212 |         nonlinearity=lasagne.nonlinearities.rectify)
213 | 
214 |     cnn = binary_connect.Conv2DLayer(
215 |         cnn,
216 |         binary=binary,
217 |         stochastic=stochastic,
218 |         H=H,
219 |         W_LR_scale=W_LR_scale,
220 |         num_filters=256,
221 |         filter_size=(3, 3),
222 |         pad=1,
223 |         nonlinearity=lasagne.nonlinearities.identity)
224 | 
225 |     cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
226 | 
227 |     cnn = batch_norm.BatchNormLayer(
228 |         cnn,
229 |         epsilon=epsilon,
230 |         alpha=alpha,
231 |         nonlinearity=lasagne.nonlinearities.rectify)
232 | 
233 |     # print(cnn.output_shape)
234 | 
235 |     # 1024FP-1024FP-10FP
236 |     cnn = binary_connect.DenseLayer(
237 |         cnn,
238 |         binary=binary,
239 |         stochastic=stochastic,
240 |         H=H,
241 |         W_LR_scale=W_LR_scale,
242 |         nonlinearity=lasagne.nonlinearities.identity,
243 |         num_units=1024)
244 | 
245 |     cnn = batch_norm.BatchNormLayer(
246 |         cnn,
247 |         epsilon=epsilon,
248 |         alpha=alpha,
249 |         nonlinearity=lasagne.nonlinearities.rectify)
250 | 
251 |     cnn = binary_connect.DenseLayer(
252 |         cnn,
253 |         binary=binary,
254 |         stochastic=stochastic,
255 |         H=H,
256 |         W_LR_scale=W_LR_scale,
257 |         nonlinearity=lasagne.nonlinearities.identity,
258 |         num_units=1024)
259 | 
260 |     cnn = batch_norm.BatchNormLayer(
261 |         cnn,
262 |         epsilon=epsilon,
263 |         alpha=alpha,
264 |         nonlinearity=lasagne.nonlinearities.rectify)
265 | 
266 |     cnn = binary_connect.DenseLayer(
267 |         cnn,
268 |         binary=binary,
269 |         stochastic=stochastic,
270 |         H=H,
271 |         W_LR_scale=W_LR_scale,
272 |         nonlinearity=lasagne.nonlinearities.identity,
273 |         num_units=10)
274 | 
275 |     cnn = batch_norm.BatchNormLayer(
276 |         cnn,
277 |         epsilon=epsilon,
278 |         alpha=alpha,
279 |         nonlinearity=lasagne.nonlinearities.identity)
280 | 
281 |     train_output = lasagne.layers.get_output(cnn, deterministic=False)
282 | 
283 |     # squared hinge loss
284 |     loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output)))
285 | 
286 |     if binary:
287 | 
288 |         # W updates
289 |         W = lasagne.layers.get_all_params(cnn, binary=True)
290 |         W_grads = binary_connect.compute_grads(loss,cnn)
291 |         updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
292 |         updates = binary_connect.clipping_scaling(updates,cnn)
293 | 
294 |         # updates for the other parameters
295 |         params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
296 |         updates = OrderedDict(updates.items() + lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR).items())
297 | 
298 |     else:
299 |         params = lasagne.layers.get_all_params(cnn, trainable=True)
300 |         updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)
301 | 
302 |     test_output = lasagne.layers.get_output(cnn, deterministic=True)
303 |     test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output)))
304 |     test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),dtype=theano.config.floatX)
305 | 
306 |     # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
307 |     # and returning the corresponding training loss:
308 |     train_fn = theano.function([input, target, LR], loss, updates=updates)
309 | 
310 |     # Compile a second function computing the validation loss and accuracy:
311 |     val_fn = theano.function([input, target], [test_loss, test_err])
312 | 
313 |     print('Training...')
314 | 
315 |     binary_connect.train(
316 |         train_fn,val_fn,
317 |         batch_size,
318 |         LR_start,LR_decay,
319 |         num_epochs,
320 |         train_set.X,train_set.y,
321 |         valid_set.X,valid_set.y,
322 |         test_set.X,test_set.y)
323 | 
324 |     # print("display histogram")
325 | 
326 |     # W = lasagne.layers.get_all_layers(cnn)[2].W.get_value()
327 |     # print(W.shape)
328 | 
329 |     # histogram = np.histogram(W,bins=1000,range=(-1.1,1.1))
330 |     # np.savetxt(str(dropout_hidden)+str(binary)+str(stochastic)+str(H)+"_hist0.csv", histogram[0], delimiter=",")
331 |     # np.savetxt(str(dropout_hidden)+str(binary)+str(stochastic)+str(H)+"_hist1.csv", histogram[1], delimiter=",")
332 | 
333 |     # Optionally, you could now dump the network weights to a file like this:
334 |     # np.savez('model.npz', *lasagne.layers.get_all_param_values(cnn))
--------------------------------------------------------------------------------
/svhn_preprocessing.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import shutil
4 | from theano import config
5 | from pylearn2.datasets import preprocessing
6 | from pylearn2.datasets.svhn import SVHN
7 | from pylearn2.utils.string_utils import preprocess
8 | 
9 | orig_path = preprocess('${PYLEARN2_DATA_PATH}/SVHN/format2')
10 | try:
11 |     local_path = preprocess('${SVHN_LOCAL_PATH}')
12 | except ValueError:
13 |     raise ValueError("You need to define the SVHN_LOCAL_PATH environment "
14 |                      "variable.")
15 | 
16 | train_name = 'h5/splitted_train_32x32.h5'
17 | valid_name = 'h5/valid_32x32.h5'
18 | test_name = 'h5/test_32x32.h5'
19 | 
20 | # copy the data files if they don't already exist locally
21 | if not os.path.isdir(os.path.join(local_path, 'h5')):
22 |     os.makedirs(os.path.join(local_path, 'h5'))
23 | 
24 | for d_set in [train_name, valid_name, test_name]:
25 |     if not os.path.isfile(os.path.join(local_path, d_set)):
26 |         logging.info("Copying data from {0} to {1}".format(os.path.join(orig_path, d_set), os.path.join(local_path, d_set)))
27 |         shutil.copyfile(os.path.join(orig_path, d_set),
28 |                         os.path.join(local_path, d_set))
29 | 
30 | def check_dtype(data):
31 |     if str(data.X.dtype) != config.floatX:
32 |         logging.warning("The dataset is saved as {}, changing theano's floatX "\
33 |                         "to the same dtype".format(data.X.dtype))
34 |         config.floatX = str(data.X.dtype)
35 | 
36 | # Load train data
37 | train = SVHN('splitted_train', path=local_path)
38 | check_dtype(train)
39 | 
40 | # prepare the preprocessing pipeline
41 | pipeline = preprocessing.Pipeline()
42 | # without batch_size there is a high chance of hitting a memory error
43 | # or a PyTables crash
44 | pipeline.items.append(preprocessing.GlobalContrastNormalization(batch_size=5000))
45 | pipeline.items.append(preprocessing.LeCunLCN((32,32)))
46 | 
47 | # apply the preprocessing pipeline to the training set
48 | train.apply_preprocessor(pipeline, can_fit=True)
49 | del train
50 | 
51 | # load and preprocess valid
52 | valid = SVHN('valid', path=local_path)
53 | check_dtype(valid)
54 | valid.apply_preprocessor(pipeline, can_fit=False)
55 | 
56 | # load and preprocess test
57 | test = SVHN('test', path=local_path)
58 | check_dtype(test)
59 | test.apply_preprocessor(pipeline, can_fit=False)
60 | 
--------------------------------------------------------------------------------
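# Example workflow (assumed usage, not spelled out in the original scripts):
# both svhn_preprocessing.py and svhn.py read their paths from environment
# variables, so a run might look like this (paths are placeholders):
#
#     export PYLEARN2_DATA_PATH=/data/pylearn2   # must contain SVHN/format2
#     export SVHN_LOCAL_PATH=/data/svhn_local    # local working copy
#     python svhn_preprocessing.py               # GCN + LeCun LCN, run once
#     python svhn.py                             # train the BinaryConnect CNN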