├── LICENSE.txt ├── README.md ├── format.py ├── layer.py ├── main.py ├── model.py ├── trainer.py └── utilities ├── filter_plot.py ├── goliat2_script.py ├── goliat3_script.py ├── goliat4_script.py ├── results_extractor.py └── svhn_preprocessing.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deep-learning-multipliers 2 | 3 | ## Requirements 4 | 5 | * Theano 0.6 (Bleeding edge version) 6 | * Pylearn2 0.1 7 | * PyTables (for the SVHN dataset) 8 | * a CUDA capable GPU 9 | 10 | ## Goal 11 | 12 | This code was written to allow anyone to easily reproduce the results 13 | of the article "Deep learning with low precision multipliers", available at http://arxiv.org/abs/1412.7024 . 14 | The article in question assesses whether it is possible to train Deep Neural Networks with low precision multipliers. 15 | 16 | Note that this code only simulates the impact of low precision multipliers. 17 | It does not exploit it in any way. 18 | If you are looking for fast low precision GPU kernels, NervanaSystems made some available https://github.com/NervanaSystems/nervanagpu . 19 | 20 | ## How to run it 21 | 22 | ### Command line 23 | 24 | python main.py [task] [format] [initial range] [propagations bit-width] 25 | [parameters updates bit-width] [ranges updates frequency] 26 | [maximum overflow rate] [number of epochs of ranges initialization] 27 | 28 | ### Task 29 | 30 | There are 4 different tasks: the permutation invariant MNIST (PI_MNIST), 31 | MNIST, CIFAR10 and SVHN. 32 | A set of hyperparameters is associated with each of those tasks 33 | (They are stored in model.py). 34 | For the SVHN dataset, 35 | you need to set an environment variable: 36 | 37 | SVHN_LOCAL_PATH=/tmp/SVHN/ 38 | 39 | You then need to pre-process it with the script 40 | utilities/svhn_preprocessing.py (script taken from pylearn2). 41 | 42 | ### Format 43 | 44 | There are 4 different formats: floating point (FLP), 45 | half floating point (HFLP), 46 | fixed point (FXP) and dynamic fixed point (DFXP). 47 | 48 | ### Initial range 49 | 50 | Initial range is only useful for FXP and DFXP. 51 | It is the initial position of the radix point 52 | for the fixed point formats. 53 | 5 works most of the time. 54 | 55 | ### Propagations and parameters updates bit-widths 56 | 57 | Only useful for FXP and DFXP. 58 | Those are the bit-widths of respectively the 59 | propagations and the parameters updates. 60 | Note that the sign is not counted in the bit-width. 61 | 62 | ### Ranges update frequency 63 | 64 | Range update frequency is only useful for DFXP. 65 | It is the number of batches between two ranges updates. 66 | 67 | ### Maximum overflow rate 68 | 69 | Only useful for DFXP. 70 | It is the amount of overflow tolerated before modifying the range. 71 | 72 | ### Number of epochs of range initialization 73 | 74 | Only useful for DFXP. 75 | This is the number of epochs we train with high precision 76 | to find the initial scaling factors. 77 | Once they are found, 78 | the parameters are reinitialized, and the DFXP training can begin. 79 | 80 | ### Examples 81 | 82 | python main.py PI_MNIST FLP 83 | python main.py SVHN FXP 5 19 19 84 | python main.py CIFAR10 DFXP 5 9 11 100 0.0001 2 85 | 86 | -------------------------------------------------------------------------------- /format.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Matthieu Courbariaux 2 | 3 | # This file is part of deep-learning-multipliers. 4 | 5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | 10 | # deep-learning-multipliers is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | 15 | # You should have received a copy of the GNU General Public License 16 | # along with deep-learning-multipliers. If not, see . 17 | 18 | import gzip 19 | import cPickle 20 | import numpy as np 21 | import os 22 | import os.path 23 | import sys 24 | import theano 25 | import theano.tensor as T 26 | import time 27 | 28 | from theano.scalar.basic import UnaryScalarOp, same_out_nocomplex 29 | from theano.tensor.elemwise import Elemwise 30 | 31 | def apply_format(format, X, NOB, NOIB): 32 | 33 | if format == "FXP" or format == "DFXP": 34 | return fixed_point(X,NOB, NOIB) 35 | 36 | elif format == "FLP": 37 | return X 38 | 39 | elif format == "HFLP": 40 | return float16(X) 41 | 42 | # float16 function 43 | # we are using the nvidia cuda function (only works on GPU) 44 | class Float16(UnaryScalarOp): 45 | 46 | def impl(self, x): 47 | return numpy.float32(numpy.float16(x)) 48 | 49 | def c_code(self, node, name, (x,), (z,), sub): 50 | return "%(z)s = __half2float(__float2half_rn(%(x)s));" % locals() 51 | float16_scalar = Float16(same_out_nocomplex, name='float16') 52 | float16 = Elemwise(float16_scalar) 53 | 54 | # this function simulate the precision and the range of a fixed point 55 | # while working with floats 56 | # NOB = Number Of Bits = bit-width 57 | # NOIB = Number Of Integer Bits = position of the radix point = range 58 | def fixed_point(X,NOB, NOIB): 59 | 60 | power = T.cast(2.**(NOB - NOIB), theano.config.floatX) # float ! 61 | max = T.cast((2.**NOB)-1, theano.config.floatX) 62 | value = X*power 63 | value = T.round(value) # rounding 64 | value = T.clip(value, -max, max) # saturation arithmetic 65 | value = value/power 66 | return value 67 | 68 | # compute the new range of the dynamic fixed point representation 69 | def new_range(overflow, overflow_1, max_overflow): 70 | 71 | # the goal is to update the range of the vector 72 | # we know the overflow rates associated with range (overflow) 73 | # and range-1 (overflow_1) 74 | # if (overflow > max_overflow): increment range 75 | # else if (overflow_1 < max_overflow): decrement range 76 | return T.switch(T.gt(overflow, max_overflow), 1, 77 | T.switch(T.gt(overflow_1, max_overflow), 0, - 1)) 78 | 79 | # Overflow rate of a vector knowing its NOIB and NOB 80 | def overflow(vector, NOB, NOIB): 81 | 82 | # compute the max value of the fixed point representation (i.e. the overflow value) 83 | max = ((2.**NOB)-1)/(2.**(NOB - NOIB)) 84 | 85 | # compute the overflow rate of the vector 86 | overflow = T.mean(T.switch(T.ge(T.abs_(vector), max), 1., 0.)) 87 | 88 | return overflow 89 | -------------------------------------------------------------------------------- /layer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Matthieu Courbariaux 2 | 3 | # This file is part of deep-learning-multipliers. 4 | 5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | 10 | # deep-learning-multipliers is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | 15 | # You should have received a copy of the GNU General Public License 16 | # along with deep-learning-multipliers. If not, see . 17 | 18 | import gzip 19 | import cPickle 20 | import numpy as np 21 | import os 22 | import os.path 23 | import sys 24 | import theano 25 | import theano.tensor as T 26 | from theano import pp 27 | import time 28 | import scipy.stats 29 | from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs 30 | from theano.sandbox.cuda.basic_ops import gpu_contiguous 31 | from pylearn2.sandbox.cuda_convnet.pool import MaxPool 32 | 33 | from format import apply_format, overflow, new_range 34 | 35 | class dropout_layer(object): 36 | 37 | def __init__(self, rng, p, scale, max_col_norm, format, 38 | comp_precision, update_precision, initial_range, max_overflow, w_LR_scale = 1., b_LR_scale = 1.): 39 | 40 | print " p = " + str(p) 41 | print " scale = " + str(scale) 42 | print " w_LR_scale = " + str(w_LR_scale) 43 | print " b_LR_scale = " + str(b_LR_scale) 44 | print " max_col_norm = " + str(max_col_norm) 45 | print " format = " + str(format) 46 | 47 | # save the parameters 48 | self.p = p 49 | self.scale = scale 50 | self.w_LR_scale = w_LR_scale 51 | self.b_LR_scale = b_LR_scale 52 | self.rng = rng 53 | self.max_col_norm = max_col_norm 54 | self.format = format 55 | 56 | # create shared variables 57 | self.comp_precision = theano.shared(value=comp_precision, name='comp_precision') 58 | self.update_precision = theano.shared(value=update_precision, name='update_precision') 59 | self.max_overflow = theano.shared(value=max_overflow, name='max_overflow') 60 | 61 | # create shared variables for the fixed point range 62 | self.z_range = theano.shared(value=initial_range, name='z_range') 63 | self.dEdz_range = theano.shared(value=initial_range, name='dEdz_range') 64 | self.y_range = theano.shared(value=initial_range, name='y_range') 65 | self.dEdy_range = theano.shared(value=initial_range, name='dEdy_range') 66 | self.w_range = theano.shared(value=initial_range, name='w_range') 67 | self.b_range = theano.shared(value=initial_range, name='b_range') 68 | self.dEdw_range = theano.shared(value=initial_range, name='dEdw_range') 69 | self.dEdb_range = theano.shared(value=initial_range, name='dEdb_range') 70 | self.update_w_range = theano.shared(value=initial_range, name='update_w_range') 71 | self.update_b_range = theano.shared(value=initial_range, name='update_b_range') 72 | 73 | # overflow counters for current range (needed to know when to augment the range) 74 | self.z_overflow = theano.shared(value=0., name='z_overflow') 75 | self.dEdz_overflow = theano.shared(value=0., name='dEdz_overflow') 76 | self.y_overflow = theano.shared(value=0., name='y_overflow') 77 | self.dEdy_overflow = theano.shared(value=0., name='dEdy_overflow') 78 | self.w_overflow = theano.shared(value=0., name='w_overflow') 79 | self.b_overflow = theano.shared(value=0., name='b_overflow') 80 | self.dEdw_overflow = theano.shared(value=0., name='dEdw_overflow') 81 | self.dEdb_overflow = theano.shared(value=0., name='dEdb_overflow') 82 | self.update_w_overflow = theano.shared(value=0., name='update_w_overflow') 83 | self.update_b_overflow = theano.shared(value=0., name='update_b_overflow') 84 | 85 | # overflow counter for current range-1 (needed to know when to reduce the range) 86 | self.z_overflow_1 = theano.shared(value=0., name='z_overflow_1') 87 | self.dEdz_overflow_1 = theano.shared(value=0., name='dEdz_overflow_1') 88 | self.y_overflow_1 = theano.shared(value=0., name='y_overflow_1') 89 | self.dEdy_overflow_1 = theano.shared(value=0., name='dEdy_overflow_1') 90 | self.w_overflow_1 = theano.shared(value=0., name='w_overflow_1') 91 | self.b_overflow_1 = theano.shared(value=0., name='b_overflow_1') 92 | self.dEdw_overflow_1 = theano.shared(value=0., name='dEdw_overflow_1') 93 | self.dEdb_overflow_1 = theano.shared(value=0., name='dEdb_overflow_1') 94 | self.update_w_overflow_1 = theano.shared(value=0., name='update_w_overflow_1') 95 | self.update_b_overflow_1 = theano.shared(value=0., name='update_b_overflow_1') 96 | 97 | def fprop(self, input): 98 | 99 | # we reduce the precision of parameters for the computations 100 | self.w_comp = apply_format(self.format, self.W, self.comp_precision, self.w_range) 101 | self.b_comp = apply_format(self.format, self.b, self.comp_precision, self.b_range) 102 | 103 | # scaled weighted sum 104 | self.z = apply_format(self.format, T.dot(input, self.w_comp * self.scale) + self.b_comp*self.scale, self.comp_precision, self.z_range) 105 | 106 | # activation 107 | self.y = apply_format(self.format, self.activation(self.z), self.comp_precision, self.y_range) 108 | 109 | # return the output 110 | return self.y 111 | 112 | def dropout_fprop(self, input): 113 | 114 | # we reduce the precision of parameters for the computations 115 | self.fixed_W = apply_format(self.format, self.W, self.comp_precision, self.w_range) 116 | self.fixed_b = apply_format(self.format, self.b, self.comp_precision, self.b_range) 117 | 118 | # create the dropout mask 119 | # The cast is important because 120 | # int * float32 = float64 which pulls things off the gpu 121 | srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999)) 122 | self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)), theano.config.floatX) 123 | 124 | # apply the mask 125 | self.fixed_x = input * self.mask 126 | 127 | # weighted sum 128 | self.z = T.dot(self.fixed_x, self.fixed_W) + self.fixed_b 129 | self.fixed_z = apply_format(self.format, self.z, self.comp_precision, self.z_range) 130 | 131 | # activation 132 | self.y = self.activation(self.fixed_z) 133 | self.fixed_y = apply_format(self.format, self.y, self.comp_precision, self.y_range) 134 | 135 | # return the output 136 | return self.fixed_y 137 | 138 | def activation(self): 139 | 140 | raise NotImplementedError("Subclass must implement abstract method") 141 | 142 | def activation_bprop(self): 143 | 144 | raise NotImplementedError("Subclass must implement abstract method") 145 | 146 | def bprop(self, dEdy): 147 | 148 | self.fixed_dEdy = apply_format(self.format, dEdy, self.comp_precision, self.dEdy_range) 149 | 150 | # activation 151 | self.activation_bprop() 152 | 153 | # compute gradients of parameters 154 | self.fixed_dEdW = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_W], known_grads={self.z:self.fixed_dEdz})[0], self.comp_precision, self.dEdw_range) 155 | self.fixed_dEdb = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_b], known_grads={self.z:self.fixed_dEdz})[0], self.comp_precision, self.dEdb_range) 156 | 157 | # weighted sum 158 | dEdx = T.grad(cost = None, wrt=[self.fixed_x], known_grads={self.z:self.fixed_dEdz})[0] 159 | 160 | # apply mask 161 | dEdx = self.mask * dEdx 162 | 163 | return dEdx 164 | 165 | def parameter_updates(self, LR, M): 166 | 167 | # compute updates 168 | new_update_W = apply_format(self.format, M * self.update_W - LR * self.w_LR_scale * self.fixed_dEdW, self.comp_precision, self.update_w_range) 169 | new_update_b = apply_format(self.format, M * self.update_b - LR * self.b_LR_scale * self.fixed_dEdb, self.comp_precision, self.update_b_range) 170 | 171 | # compute new parameters. Note that we use a better precision than the other operations 172 | new_W = apply_format(self.format, self.W + new_update_W, self.update_precision, self.w_range) 173 | new_b = apply_format(self.format, self.b + new_update_b, self.update_precision, self.b_range) 174 | 175 | # L2 column constraint on W 176 | col_norms = T.sqrt(T.sum(T.sqr(new_W), axis=0)) 177 | # col_norms = T.max(new_W, axis=0) 178 | desired_norms = T.clip(col_norms, 0, self.max_col_norm) # clip = saturate below min and beyond max 179 | new_W = apply_format(self.format, new_W * (desired_norms / (1e-7 + col_norms)), self.update_precision, self.w_range) 180 | # for some reason, works better than 181 | # new_W = new_W * (desired_norms / col_norms) 182 | # It may be a kind of regularization 183 | 184 | # return the updates of shared variables 185 | updates = [] 186 | updates.append((self.W, new_W)) 187 | updates.append((self.b, new_b)) 188 | updates.append((self.update_W, new_update_W)) 189 | updates.append((self.update_b, new_update_b)) 190 | 191 | return updates 192 | 193 | def overflow_updates(self): 194 | 195 | updates = [] 196 | 197 | # update overflow counters for the dynamic fixed point 198 | updates.append((self.z_overflow, self.z_overflow + overflow(self.fixed_z, self.comp_precision, self.z_range))) 199 | updates.append((self.dEdz_overflow, self.dEdz_overflow + overflow(self.fixed_dEdz, self.comp_precision, self.dEdz_range))) 200 | updates.append((self.y_overflow, self.y_overflow + overflow(self.fixed_y, self.comp_precision, self.y_range))) 201 | updates.append((self.dEdy_overflow, self.dEdy_overflow + overflow(self.fixed_dEdy, self.comp_precision, self.dEdy_range))) 202 | updates.append((self.w_overflow, self.w_overflow + overflow(self.W, self.update_precision, self.w_range))) 203 | updates.append((self.b_overflow, self.b_overflow + overflow(self.b, self.update_precision, self.b_range))) 204 | updates.append((self.dEdw_overflow, self.dEdw_overflow + overflow(self.fixed_dEdW, self.comp_precision, self.dEdw_range))) 205 | updates.append((self.dEdb_overflow, self.dEdb_overflow + overflow(self.fixed_dEdb, self.comp_precision, self.dEdb_range))) 206 | updates.append((self.update_w_overflow, self.update_w_overflow + overflow(self.update_W, self.comp_precision, self.update_w_range))) 207 | updates.append((self.update_b_overflow, self.update_b_overflow + overflow(self.update_b, self.comp_precision, self.update_b_range))) 208 | 209 | updates.append((self.z_overflow_1, self.z_overflow_1 + overflow(self.fixed_z, self.comp_precision, self.z_range-1))) 210 | updates.append((self.dEdz_overflow_1, self.dEdz_overflow_1 + overflow(self.fixed_dEdz, self.comp_precision, self.dEdz_range-1))) 211 | updates.append((self.y_overflow_1, self.y_overflow_1 + overflow(self.fixed_y, self.comp_precision, self.y_range-1))) 212 | updates.append((self.dEdy_overflow_1, self.dEdy_overflow_1 + overflow(self.fixed_dEdy, self.comp_precision, self.dEdy_range-1))) 213 | updates.append((self.w_overflow_1, self.w_overflow_1 + overflow(self.W, self.update_precision, self.w_range-1))) 214 | updates.append((self.b_overflow_1, self.b_overflow_1 + overflow(self.b, self.update_precision, self.b_range-1))) 215 | updates.append((self.dEdw_overflow_1, self.dEdw_overflow_1 + overflow(self.fixed_dEdW, self.comp_precision, self.dEdw_range-1))) 216 | updates.append((self.dEdb_overflow_1, self.dEdb_overflow_1 + overflow(self.fixed_dEdb, self.comp_precision, self.dEdb_range-1))) 217 | updates.append((self.update_w_overflow_1, self.update_w_overflow_1 + overflow(self.update_W, self.comp_precision, self.update_w_range-1))) 218 | updates.append((self.update_b_overflow_1, self.update_b_overflow_1 + overflow(self.update_b, self.comp_precision, self.update_b_range-1))) 219 | 220 | return updates 221 | 222 | def range_updates(self,batch_count): 223 | 224 | updates = [] 225 | 226 | # update the ranges according to the overflow counters 227 | updates.append((self.z_range, self.z_range+new_range(self.z_overflow/batch_count,self.z_overflow_1/batch_count, self.max_overflow))) 228 | updates.append((self.dEdz_range, self.dEdz_range+new_range(self.dEdz_overflow/batch_count, self.dEdz_overflow_1/batch_count, self.max_overflow))) 229 | updates.append((self.y_range, self.y_range+new_range(self.y_overflow/batch_count, self.y_overflow_1/batch_count, self.max_overflow))) 230 | updates.append((self.dEdy_range, self.dEdy_range+new_range(self.dEdy_overflow/batch_count, self.dEdy_overflow_1/batch_count, self.max_overflow))) 231 | updates.append((self.w_range, self.w_range+new_range(self.w_overflow/batch_count, self.w_overflow_1/batch_count, self.max_overflow))) 232 | updates.append((self.b_range, self.b_range+new_range(self.b_overflow/batch_count, self.b_overflow_1/batch_count, self.max_overflow))) 233 | updates.append((self.dEdw_range, self.dEdw_range+new_range(self.dEdw_overflow/batch_count, self.dEdw_overflow_1/batch_count, self.max_overflow))) 234 | updates.append((self.dEdb_range, self.dEdb_range+new_range(self.dEdb_overflow/batch_count, self.dEdb_overflow_1/batch_count, self.max_overflow))) 235 | updates.append((self.update_w_range, self.update_w_range+new_range(self.update_w_overflow/batch_count, self.update_w_overflow_1/batch_count, self.max_overflow))) 236 | updates.append((self.update_b_range, self.update_b_range+new_range(self.update_b_overflow/batch_count, self.update_b_overflow_1/batch_count, self.max_overflow))) 237 | 238 | # reset the overflow counters 239 | updates.append((self.z_overflow, 0.)) 240 | updates.append((self.dEdz_overflow, 0.)) 241 | updates.append((self.y_overflow, 0.)) 242 | updates.append((self.dEdy_overflow, 0.)) 243 | updates.append((self.w_overflow, 0.)) 244 | updates.append((self.b_overflow, 0.)) 245 | updates.append((self.dEdw_overflow, 0.)) 246 | updates.append((self.dEdb_overflow, 0.)) 247 | updates.append((self.update_w_overflow, 0.)) 248 | updates.append((self.update_b_overflow, 0.)) 249 | 250 | updates.append((self.z_overflow_1, 0.)) 251 | updates.append((self.dEdz_overflow_1, 0.)) 252 | updates.append((self.y_overflow_1, 0.)) 253 | updates.append((self.dEdy_overflow_1, 0.)) 254 | updates.append((self.w_overflow_1, 0.)) 255 | updates.append((self.b_overflow_1, 0.)) 256 | updates.append((self.dEdw_overflow_1, 0.)) 257 | updates.append((self.dEdb_overflow_1, 0.)) 258 | updates.append((self.update_w_overflow_1, 0.)) 259 | updates.append((self.update_b_overflow_1, 0.)) 260 | 261 | return updates 262 | 263 | def print_range(self): 264 | 265 | print ' z NOIB = %i' %(self.z_range.get_value()) 266 | print ' y NOIB = %i' %(self.y_range.get_value()) 267 | print ' w NOIB = %i' %(self.w_range.get_value()) 268 | print ' b NOIB = %i' %(self.b_range.get_value()) 269 | print ' dEdz NOIB = %i' %(self.dEdz_range.get_value()) 270 | print ' dEdy NOIB = %i' %(self.dEdy_range.get_value()) 271 | print ' dEdw NOIB = %i' %(self.dEdw_range.get_value()) 272 | print ' dEdb NOIB = %i' %(self.dEdb_range.get_value()) 273 | print ' update w NOIB = %i' %(self.update_w_range.get_value()) 274 | print ' update b NOIB = %i' %(self.update_b_range.get_value()) 275 | 276 | class MaxoutLayer(dropout_layer): 277 | 278 | def __init__(self, rng, n_inputs, n_units, n_pieces, p, scale, max_col_norm, format, 279 | comp_precision, update_precision, initial_range, max_overflow): 280 | 281 | self.n_pieces=n_pieces 282 | self.n_inputs = n_inputs 283 | self.n_units = n_units 284 | 285 | print " n_pieces = " + str(n_pieces) 286 | print " n_inputs = " + str(n_inputs) 287 | print " n_units = " + str(n_units) 288 | 289 | # call mother class constructor 290 | dropout_layer.__init__(self, rng, p, scale, max_col_norm, format, 291 | comp_precision, update_precision, initial_range, max_overflow) 292 | 293 | # initial values of parameters 294 | low=-np.sqrt(6. / (n_inputs + n_units*n_pieces)) 295 | high=np.sqrt(6. / (n_inputs + n_units*n_pieces)) 296 | W_values = np.asarray(self.rng.uniform(low=low,high=high,size=(n_inputs, n_units*n_pieces)),dtype=theano.config.floatX) 297 | b_values = np.zeros((n_units*n_pieces), dtype=theano.config.floatX) 298 | 299 | # creation of shared symbolic variables 300 | # shared variables are the state of the built function 301 | # in practice, we put them in the GPU memory 302 | self.W = theano.shared(value=W_values, name='W') 303 | self.b = theano.shared(value=b_values, name='b') 304 | 305 | # momentum 306 | self.update_W = theano.shared(value=np.zeros((n_inputs, n_units*n_pieces), dtype=theano.config.floatX), name='update_W') 307 | self.update_b = theano.shared(value=b_values, name='update_b') 308 | 309 | # activation function 310 | def activation(self,z): 311 | 312 | y = T.reshape(z,(T.shape(z)[0], self.n_units, self.n_pieces)) 313 | 314 | # maxout 315 | y = T.max(y,axis=2) 316 | 317 | y = T.reshape(y,(T.shape(z)[0],self.n_units)) 318 | 319 | return y 320 | 321 | def activation_bprop(self): 322 | 323 | self.fixed_dEdz = apply_format(self.format, 324 | T.grad(cost = None, wrt=[self.fixed_z], known_grads={self.y:self.fixed_dEdy})[0], 325 | self.comp_precision, self.dEdz_range) 326 | 327 | class SoftmaxLayer(dropout_layer): 328 | 329 | def __init__(self, rng, n_inputs, n_units, p, scale, max_col_norm, format, 330 | comp_precision, update_precision, initial_range, max_overflow): 331 | 332 | self.n_inputs = n_inputs 333 | self.n_units = n_units 334 | 335 | print " n_inputs = " + str(n_inputs) 336 | print " n_units = " + str(n_units) 337 | 338 | # call mother class constructor 339 | dropout_layer.__init__(self, rng, p, scale, max_col_norm, format, 340 | comp_precision, update_precision, initial_range, max_overflow) 341 | 342 | # initial values of parameters 343 | W_values = np.zeros((n_inputs, n_units), dtype=theano.config.floatX) 344 | b_values = np.zeros(n_units, dtype=theano.config.floatX) 345 | 346 | # creation of shared symbolic variables 347 | self.W = theano.shared(value=W_values, name='W') 348 | self.b = theano.shared(value=b_values, name='b') 349 | 350 | # momentum 351 | self.update_W = theano.shared(value=W_values, name='update_W') 352 | self.update_b = theano.shared(value=b_values, name='update_b') 353 | 354 | # activation function 355 | def activation(self,z): 356 | 357 | return T.nnet.softmax(z) 358 | 359 | def activation_bprop(self): 360 | 361 | self.fixed_dEdz = apply_format(self.format, self.fixed_dEdy, 362 | self.comp_precision, self.dEdz_range) 363 | 364 | class Maxout_conv_layer(dropout_layer): 365 | 366 | def __init__(self, rng, image_shape, zero_pad, output_shape, filter_shape, filter_stride, n_pieces, pool_shape, pool_stride, p, scale, max_col_norm, format, 367 | comp_precision, update_precision, initial_range, max_overflow, w_LR_scale=1., b_LR_scale=1., partial_sum = 1): 368 | 369 | # call mother class constructor 370 | dropout_layer.__init__(self, rng, p, scale, max_col_norm, format, comp_precision, update_precision, initial_range, max_overflow, w_LR_scale, b_LR_scale) 371 | 372 | print ' output_shape = ' +str(output_shape) 373 | print ' image_shape = ' +str(image_shape) 374 | 375 | # add n zero on both side of the input 376 | # 0 <-> valid convolution, result is smaller 377 | # filter_size -1 <-> full convolution, result is bigger ! 378 | # valid convolution makes more sense to me. I use it to reduce the size of feature maps without using max pool. 379 | print ' zero_pad = ' +str(zero_pad) 380 | 381 | # number of output feature maps, number of inputs feature maps, x, y 382 | # number of inputs feature maps is important for the weights 383 | print ' filter_shape = ' +str(filter_shape) 384 | print ' filter_stride = ' +str(filter_stride) 385 | print ' n_pieces = ' +str(n_pieces) 386 | print ' pool_shape = ' +str(pool_shape) 387 | print ' pool_stride = ' +str(pool_stride) 388 | print ' partial_sum = ' +str(partial_sum) 389 | 390 | # save the parameters 391 | self.output_shape = output_shape 392 | self.image_shape = image_shape 393 | self.zero_pad = zero_pad 394 | self.filter_shape = (filter_shape[0]*n_pieces,filter_shape[1],filter_shape[2],filter_shape[3]) 395 | self.filter_stride = filter_stride 396 | self.n_pieces = n_pieces 397 | self.pool_shape = pool_shape 398 | self.pool_stride = pool_stride 399 | self.partial_sum = partial_sum 400 | 401 | # range of init 402 | fan_in = np.prod(self.filter_shape[1:]) 403 | fan_out = (self.filter_shape[0] * np.prod(self.filter_shape[2:]) / self.n_pieces / np.prod(self.pool_shape)) 404 | 405 | # initialize weights with random weights 406 | W_bound = np.sqrt(6. / (fan_in + fan_out)) 407 | self.W = theano.shared( 408 | np.asarray(rng.uniform(low=-W_bound, high=W_bound, size=self.filter_shape), 409 | dtype=theano.config.floatX)) 410 | 411 | # the bias is a 1D tensor -- one bias per output feature map 412 | b_values = np.zeros((self.filter_shape[0],), dtype=theano.config.floatX) 413 | self.b = theano.shared(value=b_values) 414 | 415 | self.update_W = theano.shared(value=np.zeros(self.filter_shape, dtype=theano.config.floatX), name='update_W') 416 | self.update_b = theano.shared(value=np.zeros((self.filter_shape[0],), dtype=theano.config.floatX), name='update_b') 417 | 418 | # activation function 419 | def activation(self,conv_out): 420 | 421 | conv_out = T.reshape(conv_out,(T.shape(conv_out)[0], T.shape(conv_out)[1]//self.n_pieces, self.n_pieces,T.shape(conv_out)[2],T.shape(conv_out)[3] )) 422 | return T.max( conv_out,axis=2) 423 | 424 | def fprop(self, input): 425 | 426 | # we reduce the precision of parameters for the computations 427 | self.w_comp = apply_format(self.format, self.W, self.comp_precision, self.w_range) 428 | self.b_comp = apply_format(self.format, self.b, self.comp_precision, self.b_range) 429 | 430 | input = input.reshape(self.image_shape) 431 | 432 | # convolution 433 | input_shuffled = input.dimshuffle(1, 2, 3, 0) # bc01 to c01b 434 | filters_shuffled = self.w_comp.dimshuffle(1, 2, 3, 0) *self.scale # bc01 to c01b 435 | conv_op = FilterActs(stride=self.filter_stride, partial_sum=self.partial_sum,pad = self.zero_pad) 436 | contiguous_input = gpu_contiguous(input_shuffled) 437 | contiguous_filters = gpu_contiguous(filters_shuffled) 438 | conv_out_shuffled = conv_op(contiguous_input, contiguous_filters) 439 | 440 | # downsample each feature map individually, using maxpooling 441 | # pooled_out = downsample.max_pool_2d(input=conv_out, 442 | # ds=poolsize, ignore_border=True) 443 | pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride) 444 | pooled_out_shuffled = pool_op(conv_out_shuffled) 445 | pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2) # c01b to bc01 446 | 447 | # bias 448 | pooled_out = apply_format(self.format, pooled_out + self.b_comp.dimshuffle('x', 0, 'x', 'x')*self.scale, self.comp_precision, self.z_range) 449 | 450 | # activation 451 | pooled_out = self.activation(pooled_out) 452 | pooled_out = apply_format(self.format, pooled_out.flatten(2), self.comp_precision, self.y_range) 453 | 454 | return pooled_out 455 | 456 | def dropout_fprop(self, input): 457 | 458 | # we reduce the precision of parameters for the computations 459 | self.fixed_W = apply_format(self.format, self.W, self.comp_precision, self.w_range) 460 | self.fixed_b = apply_format(self.format, self.b, self.comp_precision, self.b_range) 461 | 462 | # create the dropout mask 463 | # The cast is important because 464 | # int * float32 = float64 which pulls things off the gpu 465 | 466 | srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999)) 467 | self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)), theano.config.floatX) 468 | input = input * self.mask 469 | 470 | self.fixed_x = input.reshape(self.image_shape) 471 | 472 | # convolution 473 | input_shuffled = self.fixed_x.dimshuffle(1, 2, 3, 0) # bc01 to c01b 474 | filters_shuffled = self.fixed_W.dimshuffle(1, 2, 3, 0) # bc01 to c01b 475 | conv_op = FilterActs(stride=self.filter_stride, partial_sum=self.partial_sum,pad = self.zero_pad) # augment partial sum -> use less memory but slower 476 | contiguous_input = gpu_contiguous(input_shuffled) 477 | contiguous_filters = gpu_contiguous(filters_shuffled) 478 | conv_out_shuffled = conv_op(contiguous_input, contiguous_filters) 479 | 480 | self.z = conv_out_shuffled.dimshuffle(3, 0, 1, 2) # c01b to bc01 481 | self.fixed_z = apply_format(self.format, self.z, self.comp_precision, self.z_range) 482 | 483 | conv_out_shuffled = self.fixed_z.dimshuffle(1, 2, 3, 0) # bc01 to c01b 484 | conv_out_shuffled = gpu_contiguous(conv_out_shuffled) 485 | 486 | # downsample each feature map individually, using maxpooling 487 | # pooled_out = downsample.max_pool_2d(input=conv_out, 488 | # ds=poolsize, ignore_border=True) 489 | pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride) 490 | pooled_out_shuffled = pool_op(conv_out_shuffled) 491 | pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2) # c01b to bc01 492 | 493 | # bias 494 | self.u = pooled_out + self.fixed_b.dimshuffle('x', 0, 'x', 'x') 495 | self.fixed_u = apply_format(self.format, self.u, self.comp_precision, self.z_range) 496 | 497 | # activation 498 | self.y = self.activation(self.fixed_u).flatten(2) 499 | self.fixed_y = apply_format(self.format, self.y, self.comp_precision, self.y_range) 500 | 501 | return self.fixed_y 502 | 503 | def bprop(self, dEdy): 504 | 505 | self.fixed_dEdy = apply_format(self.format, dEdy.reshape(self.output_shape), self.comp_precision, self.dEdy_range) 506 | 507 | fixed_dEdu = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_u], known_grads={self.y:self.fixed_dEdy})[0], self.comp_precision,self.dEdz_range) 508 | 509 | self.fixed_dEdb = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_b], known_grads={self.u:fixed_dEdu})[0], self.comp_precision,self.dEdb_range) 510 | 511 | self.fixed_dEdz = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_z], known_grads={self.u:fixed_dEdu})[0], self.comp_precision, self.dEdz_range) 512 | 513 | self.fixed_dEdW = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_W], known_grads={self.z:self.fixed_dEdz})[0], self.comp_precision,self.dEdw_range) 514 | 515 | dEdx = T.grad(cost = None, wrt=[self.fixed_x], known_grads={self.z:self.fixed_dEdz})[0] 516 | 517 | dEdx = T.reshape(self.mask,T.shape(dEdx)) * dEdx 518 | 519 | return dEdx -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Matthieu Courbariaux 2 | 3 | # This file is part of deep-learning-multipliers. 4 | 5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | 10 | # deep-learning-multipliers is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | 15 | # You should have received a copy of the GNU General Public License 16 | # along with deep-learning-multipliers. If not, see . 17 | 18 | import gzip 19 | import cPickle 20 | import numpy as np 21 | import os 22 | import os.path 23 | import sys 24 | import time 25 | 26 | from trainer import Trainer 27 | from model import PI_MNIST_model, MNIST_model, CIFAR10_SVHN_model 28 | 29 | from pylearn2.datasets.mnist import MNIST 30 | from pylearn2.datasets.zca_dataset import ZCA_Dataset 31 | from pylearn2.datasets.svhn import SVHN 32 | from pylearn2.utils import serial 33 | 34 | def onehot(x,numclasses=None): 35 | 36 | if x.shape==(): 37 | x = x[None] 38 | if numclasses is None: 39 | numclasses = np.max(x) + 1 40 | result = np.zeros(list(x.shape) + [numclasses], dtype="int") 41 | z = np.zeros(x.shape) 42 | for c in range(numclasses): 43 | z *= 0 44 | z[np.where(x==c)] = 1 45 | result[...,c] += z 46 | 47 | result = np.reshape(result,(np.shape(result)[0], np.shape(result)[result.ndim-1])) 48 | return result 49 | 50 | # MAIN 51 | 52 | if __name__ == "__main__": 53 | 54 | print 'Beginning of the program' 55 | start_time = time.clock() 56 | 57 | print 'Loading the dataset' 58 | 59 | dataset = sys.argv[1] 60 | 61 | if dataset == "PI_MNIST" or dataset == "MNIST": 62 | 63 | train_set = MNIST(which_set= 'train',start=0, stop = 50000)#, center = True) 64 | valid_set = MNIST(which_set= 'train',start=50000, stop = 60000)#, center = True) 65 | test_set = MNIST(which_set= 'test')#, center = True) 66 | 67 | # for both datasets, onehot the target 68 | train_set.y = np.float32(onehot(train_set.y)) 69 | valid_set.y = np.float32(onehot(valid_set.y)) 70 | test_set.y = np.float32(onehot(test_set.y)) 71 | 72 | elif dataset == "CIFAR10": 73 | 74 | preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/preprocessor.pkl") 75 | train_set = ZCA_Dataset( 76 | preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"), 77 | preprocessor = preprocessor, 78 | start=0, stop = 45000) 79 | valid_set = ZCA_Dataset( 80 | preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"), 81 | preprocessor = preprocessor, 82 | start=45000, stop = 50000) 83 | test_set = ZCA_Dataset( 84 | preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/test.pkl"), 85 | preprocessor = preprocessor) 86 | 87 | # for both datasets, onehot the target 88 | train_set.y = np.float32(onehot(train_set.y)) 89 | valid_set.y = np.float32(onehot(valid_set.y)) 90 | test_set.y = np.float32(onehot(test_set.y)) 91 | 92 | elif dataset == "SVHN": 93 | 94 | train_set = SVHN( 95 | which_set= 'splitted_train', 96 | path= "${SVHN_LOCAL_PATH}", 97 | axes= ['b', 'c', 0, 1]) 98 | 99 | valid_set = SVHN( 100 | which_set= 'valid', 101 | path= "${SVHN_LOCAL_PATH}", 102 | axes= ['b', 'c', 0, 1]) 103 | 104 | test_set = SVHN( 105 | which_set= 'test', 106 | path= "${SVHN_LOCAL_PATH}", 107 | axes= ['b', 'c', 0, 1]) 108 | 109 | print 'Creating the model' 110 | 111 | # storing format hyperparameters 112 | format = sys.argv[2] 113 | 114 | initial_range = 0 115 | comp_precision = 0 116 | update_precision = 0 117 | range_update_frequency = 0 118 | max_overflow = 0 119 | range_init_epoch = 0 120 | 121 | if format == "FXP" or format == "DFXP": 122 | initial_range = int(sys.argv[3]) 123 | comp_precision = int(sys.argv[4]) 124 | update_precision = int(sys.argv[5]) 125 | 126 | if format == "DFXP": 127 | range_update_frequency = int(sys.argv[6]) 128 | max_overflow = float(sys.argv[7]) 129 | range_init_epoch = int(sys.argv[8]) 130 | 131 | if dataset == "PI_MNIST": 132 | 133 | rng = np.random.RandomState(1234) 134 | LR_start = 0.05 135 | batch_size = 100 136 | gpu_batches = 500 137 | n_epoch = 800 138 | 139 | model = PI_MNIST_model(rng = rng, batch_size = batch_size, 140 | n_input = 784, n_output = 10, n_hidden = 240, n_pieces = 5, n_hidden_layers = 2, 141 | p_input = 0.8, scale_input = 1., p_hidden = 0.5, scale_hidden = 0.5, 142 | max_col_norm = 1.9365, format = format, 143 | comp_precision = comp_precision, update_precision = update_precision, 144 | initial_range = initial_range, max_overflow = max_overflow) 145 | 146 | trainer = Trainer(rng = rng, load_path = None, save_path = None, 147 | train_set = train_set, valid_set = valid_set, test_set = test_set, 148 | model = model, 149 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/4, M_fin = 0.7, 150 | batch_size = batch_size, gpu_batches = gpu_batches, 151 | n_epoch = n_epoch, 152 | shuffle_batches = False, shuffle_examples = True, 153 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch) 154 | 155 | elif dataset == "MNIST": 156 | 157 | rng = np.random.RandomState(1234) 158 | LR_start = 0.02 159 | batch_size = 128 160 | gpu_batches = 391 # 391 -> 50000, 196 -> 25000, 79 -> 10000 161 | n_epoch = 800 162 | 163 | model = MNIST_model(rng = rng, batch_size = batch_size, format = format, 164 | comp_precision = comp_precision, update_precision = update_precision, 165 | initial_range = initial_range, max_overflow = max_overflow) 166 | 167 | trainer = Trainer(rng = rng, load_path = None, save_path = None, 168 | train_set = train_set, valid_set = valid_set, test_set = test_set, 169 | model = model, 170 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/4, M_fin = 0.7, 171 | batch_size = batch_size, gpu_batches = gpu_batches, 172 | n_epoch = n_epoch, 173 | shuffle_batches = False, shuffle_examples = True, 174 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch) 175 | 176 | elif dataset == "CIFAR10": 177 | 178 | rng = np.random.RandomState(1234) 179 | LR_start = 0.02 180 | batch_size = 128 181 | gpu_batches = 79 # 391 -> 50000, 196 -> 25000, 79 -> 10000 182 | n_epoch = 400 183 | 184 | model = CIFAR10_SVHN_model(rng = rng, batch_size = batch_size, format = format, 185 | comp_precision = comp_precision, update_precision = update_precision, 186 | initial_range = initial_range, max_overflow = max_overflow) 187 | 188 | trainer = Trainer(rng = rng, load_path = None, save_path = None, 189 | train_set = train_set, valid_set = valid_set, test_set = test_set, 190 | model = model, 191 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/2, M_fin = 0.7, 192 | batch_size = batch_size, gpu_batches = gpu_batches, 193 | n_epoch = n_epoch, 194 | shuffle_batches = False, shuffle_examples = True, 195 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch) 196 | 197 | elif dataset == "SVHN": 198 | 199 | rng = np.random.RandomState(1234) 200 | LR_start = 0.05 201 | batch_size = 128 202 | gpu_batches = 79 # 391 -> 50000, 196 -> 25000, 79 -> 10000 203 | n_epoch = 200 204 | 205 | model = CIFAR10_SVHN_model(rng = rng, batch_size = batch_size, format = format, 206 | comp_precision = comp_precision, update_precision = update_precision, 207 | initial_range = initial_range, max_overflow = max_overflow) 208 | 209 | trainer = Trainer(rng = rng, load_path = None, save_path = None, 210 | train_set = train_set, valid_set = valid_set, test_set = test_set, 211 | model = model, 212 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/2, M_fin = 0.7, 213 | batch_size = batch_size, gpu_batches = gpu_batches, 214 | n_epoch = n_epoch, 215 | shuffle_batches = True, shuffle_examples = False, 216 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch) 217 | 218 | print 'Building' 219 | 220 | trainer.build() 221 | 222 | print 'Training' 223 | 224 | trainer.train() 225 | 226 | end_time = time.clock() 227 | print 'The code ran for %i seconds'%(end_time - start_time) 228 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Matthieu Courbariaux 2 | 3 | # This file is part of deep-learning-multipliers. 4 | 5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | 10 | # deep-learning-multipliers is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | 15 | # You should have received a copy of the GNU General Public License 16 | # along with deep-learning-multipliers. If not, see . 17 | 18 | import gzip 19 | import cPickle 20 | import numpy as np 21 | import os 22 | import os.path 23 | import sys 24 | import theano 25 | import theano.tensor as T 26 | import time 27 | 28 | from layer import Maxout_conv_layer, SoftmaxLayer, MaxoutLayer 29 | 30 | 31 | class deep_dropout_network(object): 32 | 33 | layer = [] 34 | 35 | def __init__(self, rng, batch_size, n_hidden_layers, comp_precision, update_precision, 36 | initial_range, max_overflow, format): 37 | 38 | print ' Overall description:' 39 | print ' Batch size = %i' %(batch_size) 40 | print ' Number of layers = %i' %(n_hidden_layers) 41 | print ' Computation precision = %i bits' %(comp_precision) 42 | print ' Update precision = %i bits' %(update_precision) 43 | print ' Initial range = %i bits' %(initial_range) 44 | print ' Maximum overflow rate = %f %%' %(max_overflow*100) 45 | print " Format = " + format 46 | 47 | self.rng = rng 48 | self.batch_size = batch_size 49 | self.n_hidden_layers = n_hidden_layers 50 | self.comp_precision = comp_precision 51 | self.update_precision = update_precision 52 | self.initial_range = initial_range 53 | self.max_overflow = max_overflow 54 | self.format = format 55 | 56 | def fprop(self, x): 57 | 58 | y = self.layer[0].fprop(x) 59 | 60 | for k in range(1,self.n_hidden_layers+1): 61 | 62 | y = self.layer[k].fprop(y) 63 | 64 | return y 65 | 66 | def dropout_fprop(self, x): 67 | 68 | y = self.layer[0].dropout_fprop(x) 69 | 70 | for k in range(1,self.n_hidden_layers+1): 71 | 72 | y = self.layer[k].dropout_fprop(y) 73 | 74 | return y 75 | 76 | # when you use fixed point, you cannot use T.grad directly -> bprop modifications. 77 | def bprop(self, y, t): 78 | 79 | # there is a simplification between softmax derivative and nll derivative 80 | dEdy = (y-t)/T.cast(T.shape(y)[1],dtype=theano.config.floatX) # /2. # actually, it is dEdz and not dEdy 81 | 82 | # bprop 83 | for k in range(self.n_hidden_layers,-1,-1): 84 | dEdy = self.layer[k].bprop(dEdy) 85 | 86 | # you give it the input and the target and it gives you the updates 87 | def parameter_updates(self, LR, M): 88 | 89 | # updates 90 | parameter_updates = self.layer[0].parameter_updates(LR, M) 91 | for k in range(1,self.n_hidden_layers+1): 92 | parameter_updates = parameter_updates + self.layer[k].parameter_updates(LR, M) 93 | 94 | return parameter_updates 95 | 96 | # function that updates the ranges of all fixed point vectors 97 | def range_updates(self,batch_count): 98 | 99 | range_updates = self.layer[0].range_updates(batch_count) 100 | for k in range(1,self.n_hidden_layers+1): 101 | range_updates = range_updates + self.layer[k].range_updates(batch_count) 102 | 103 | return range_updates 104 | 105 | # function that updates the ranges of all fixed point vectors 106 | def overflow_updates(self): 107 | 108 | overflow_updates = self.layer[0].overflow_updates() 109 | for k in range(1,self.n_hidden_layers+1): 110 | overflow_updates = overflow_updates + self.layer[k].overflow_updates() 111 | 112 | return overflow_updates 113 | 114 | # train function 115 | def updates(self, x, t, LR, M): 116 | 117 | y = self.dropout_fprop(x) 118 | self.bprop(y,t) 119 | updates = self.parameter_updates(LR,M) 120 | 121 | if self.format == "DFXP": 122 | updates += self.overflow_updates() 123 | 124 | return updates 125 | 126 | def errors(self, x, t): 127 | 128 | y = self.fprop(x) 129 | 130 | # error function 131 | errors = T.sum(T.neq(T.argmax(y, axis=1), T.argmax(t, axis=1))) 132 | 133 | return errors 134 | 135 | def save_params(self): 136 | 137 | self.W_save = [] 138 | self.b_save = [] 139 | 140 | for k in xrange(self.n_hidden_layers+1): 141 | self.W_save.append(self.layer[k].W.get_value(borrow=False)) 142 | self.b_save.append(self.layer[k].b.get_value(borrow=False)) 143 | 144 | def load_params(self): 145 | 146 | # read an load all the parameters 147 | for k in xrange(self.n_hidden_layers+1): 148 | self.layer[k].W.set_value(self.W_save[k]) 149 | self.layer[k].b.set_value(self.b_save[k]) 150 | 151 | def save_params_file(self, path): 152 | 153 | # Open the file and overwrite current contents 154 | save_file = open(path, 'wb') 155 | 156 | # write all the parameters in the file 157 | for k in xrange(self.n_hidden_layers+1): 158 | cPickle.dump(self.layer[k].W.get_value(), save_file, -1) 159 | cPickle.dump(self.layer[k].b.get_value(), save_file, -1) 160 | 161 | # close the file 162 | save_file.close() 163 | 164 | def load_params_file(self, path): 165 | 166 | # Open the file 167 | save_file = open(path) 168 | 169 | # read an load all the parameters 170 | for k in xrange(self.n_hidden_layers+1): 171 | self.layer[k].W.set_value(cPickle.load(save_file)) 172 | self.layer[k].b.set_value(cPickle.load(save_file)) 173 | 174 | # close the file 175 | save_file.close() 176 | 177 | 178 | def print_range(self): 179 | 180 | for k in xrange(self.n_hidden_layers+1): 181 | print ' Layer %i range:'%(k) 182 | self.layer[k].print_range() 183 | 184 | def set_comp_precision(self, comp_precision): 185 | 186 | for k in xrange(self.n_hidden_layers+1): 187 | self.layer[k].comp_precision.set_value(comp_precision) 188 | 189 | def get_comp_precision(self): 190 | 191 | return self.layer[0].comp_precision.get_value() 192 | 193 | def set_update_precision(self, update_precision): 194 | 195 | for k in xrange(self.n_hidden_layers+1): 196 | self.layer[k].update_precision.set_value(update_precision) 197 | 198 | def get_update_precision(self): 199 | 200 | return self.layer[0].update_precision.get_value() 201 | 202 | def set_max_overflow(self, max_overflow): 203 | 204 | for k in xrange(self.n_hidden_layers+1): 205 | self.layer[k].max_overflow.set_value(max_overflow) 206 | 207 | def get_max_overflow(self): 208 | 209 | return self.layer[0].max_overflow.get_value() 210 | 211 | class PI_MNIST_model(deep_dropout_network): 212 | 213 | def __init__(self, rng, batch_size, n_input, n_output, n_hidden, n_pieces, n_hidden_layers, 214 | p_input, scale_input, p_hidden, scale_hidden, max_col_norm, format, 215 | comp_precision, update_precision, initial_range, max_overflow): 216 | 217 | deep_dropout_network.__init__(self, rng, batch_size, n_hidden_layers, comp_precision, update_precision, 218 | initial_range, max_overflow, format) 219 | 220 | print ' n_input = %i' %(n_input) 221 | print ' n_output = %i' %(n_output) 222 | print ' n_hidden = %i' %(n_hidden) 223 | print ' n_pieces = %i' %(n_pieces) 224 | print ' p_input = %f' %(p_input) 225 | print ' scale_input = %f' %(scale_input) 226 | print ' p_hidden = %f' %(p_hidden) 227 | print ' scale_hidden = %f' %(scale_hidden) 228 | print ' max_col_norm = %f' %(max_col_norm) 229 | 230 | # save the parameters 231 | self.n_input = n_input 232 | self.n_output = n_output 233 | self.n_hidden = n_hidden 234 | self.n_pieces = n_pieces 235 | self.p_input = p_input 236 | self.scale_input = scale_input 237 | self.p_hidden = p_hidden 238 | self.scale_hidden = scale_hidden 239 | self.max_col_norm = max_col_norm 240 | 241 | # Create MLP layers 242 | if self.n_hidden_layers == 0 : 243 | 244 | print " Softmax layer:" 245 | 246 | self.layer.append(SoftmaxLayer(rng = self.rng, n_inputs=self.n_input, n_units=self.n_output, 247 | p = self.p_input, scale = self.scale_input, max_col_norm = self.max_col_norm, format = self.format, 248 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow)) 249 | 250 | else : 251 | 252 | print " Maxout layer 1:" 253 | 254 | self.layer.append(MaxoutLayer(rng = self.rng, n_inputs = self.n_input, n_units = self.n_hidden, n_pieces = self.n_pieces, 255 | p = self.p_input, scale = self.scale_input, max_col_norm = self.max_col_norm, format = self.format, 256 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow)) 257 | 258 | for k in range(1,self.n_hidden_layers): 259 | 260 | print " Maxout layer "+str(k+1)+":" 261 | self.layer.append(MaxoutLayer(rng = self.rng, n_inputs = self.n_hidden, n_units = self.n_hidden, n_pieces = self.n_pieces, 262 | p = self.p_hidden, scale = self.scale_hidden, max_col_norm = self.max_col_norm, format = self.format, 263 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow)) 264 | 265 | print " Softmax layer:" 266 | 267 | self.layer.append(SoftmaxLayer(rng = self.rng, n_inputs= self.n_hidden, n_units= self.n_output, 268 | p = self.p_hidden, scale = self.scale_hidden, max_col_norm = self.max_col_norm, format = self.format, 269 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow)) 270 | 271 | class MNIST_model(deep_dropout_network): 272 | 273 | def __init__(self, rng, batch_size, comp_precision, update_precision, initial_range, max_overflow, format): 274 | 275 | deep_dropout_network.__init__(self, rng, batch_size, 3, comp_precision, update_precision, 276 | initial_range, max_overflow, format) 277 | 278 | print " Convolution layer 1:" 279 | 280 | self.layer.append(Maxout_conv_layer( 281 | rng, 282 | image_shape=(batch_size, 1, 28, 28), 283 | zero_pad = 0, 284 | output_shape=(batch_size, 48, 10, 10), 285 | filter_shape=(48, 1, 8, 8), 286 | filter_stride = 1, 287 | n_pieces = 2, 288 | pool_shape=(4, 4), 289 | pool_stride = 2, 290 | p = 0.8, 291 | scale = 1., 292 | max_col_norm = 0.9, 293 | format = format, 294 | comp_precision = comp_precision, 295 | update_precision = update_precision, 296 | initial_range = initial_range, 297 | max_overflow = max_overflow 298 | )) 299 | 300 | 301 | print " Convolution layer 2:" 302 | 303 | self.layer.append(Maxout_conv_layer( 304 | rng, 305 | image_shape=(batch_size, 48, 10, 10), 306 | zero_pad = 3, # add n zero on both side of the input 307 | output_shape=(batch_size, 48, 4, 4), 308 | filter_shape=(48, 48, 8, 8), 309 | filter_stride = 1, 310 | n_pieces = 2, 311 | pool_shape=(4, 4), 312 | pool_stride =2, 313 | p = 0.5, 314 | scale = 0.5, 315 | max_col_norm = 1.9365, 316 | format = format, 317 | comp_precision = comp_precision, 318 | update_precision = update_precision, 319 | initial_range = initial_range, 320 | max_overflow = max_overflow 321 | )) 322 | 323 | 324 | print " Convolution layer 3:" 325 | 326 | self.layer.append(Maxout_conv_layer( 327 | rng, 328 | image_shape=(batch_size, 48, 4, 4), 329 | zero_pad = 3, # add n zero on both side of the input 330 | output_shape=(batch_size, 24, 3, 3), 331 | filter_shape=(24, 48, 5, 5), 332 | filter_stride = 1, 333 | n_pieces = 4, 334 | pool_shape=(2, 2), 335 | pool_stride =2, 336 | p = 0.5, 337 | scale = 0.5, 338 | max_col_norm = 1.9365, 339 | format = format, 340 | comp_precision = comp_precision, 341 | update_precision = update_precision, 342 | initial_range = initial_range, 343 | max_overflow = max_overflow 344 | )) 345 | 346 | print " Softmax layer:" 347 | 348 | self.layer.append(SoftmaxLayer( 349 | rng = rng, 350 | n_inputs= 24*3*3, 351 | n_units = 10, 352 | p = 0.5, 353 | scale = 0.5, 354 | max_col_norm =1.9365, 355 | format = format, 356 | comp_precision = comp_precision, 357 | update_precision = update_precision, 358 | initial_range = initial_range, 359 | max_overflow = max_overflow 360 | )) 361 | 362 | class CIFAR10_SVHN_model(deep_dropout_network): 363 | 364 | def __init__(self, rng, batch_size, comp_precision, update_precision, initial_range, max_overflow, format): 365 | 366 | deep_dropout_network.__init__(self, rng, batch_size, 4, comp_precision, update_precision, 367 | initial_range, max_overflow, format) 368 | 369 | print " Convolution layer 1:" 370 | 371 | self.layer.append(Maxout_conv_layer( 372 | rng, 373 | image_shape=(batch_size, 3, 32, 32), 374 | zero_pad = 2, 375 | output_shape=(batch_size, 64, 16, 16), # 64 does fit in memory 376 | filter_shape=(64, 3, 5, 5), 377 | filter_stride = 1, 378 | n_pieces = 2, 379 | pool_shape=(3, 3), 380 | pool_stride = 2, 381 | p = 0.8, 382 | scale = 1., 383 | max_col_norm = 0.9, 384 | format = format, 385 | comp_precision = comp_precision, 386 | update_precision = update_precision, 387 | initial_range = initial_range, 388 | max_overflow = max_overflow, 389 | w_LR_scale = 0.2, 390 | b_LR_scale = 0.2, 391 | # partial_sum = 32 # total number = 33*33 392 | )) 393 | 394 | 395 | print " Convolution layer 2:" 396 | 397 | self.layer.append(Maxout_conv_layer( 398 | rng, 399 | image_shape=(batch_size, 64, 16, 16), 400 | zero_pad = 2, # add n zero on both side of the input 401 | output_shape=(batch_size, 128, 8, 8), 402 | filter_shape=(128, 64, 5, 5), 403 | filter_stride = 1, 404 | n_pieces = 2, 405 | pool_shape=(3, 3), 406 | pool_stride =2, 407 | p = 0.5, 408 | scale = 0.5, 409 | max_col_norm = 1.9365, 410 | format = format, 411 | comp_precision = comp_precision, 412 | update_precision = update_precision, 413 | initial_range = initial_range, 414 | max_overflow = max_overflow, 415 | w_LR_scale = 0.2, 416 | b_LR_scale = 0.2, 417 | # partial_sum = 16 # total number = 15*15 418 | )) 419 | 420 | 421 | print " Convolution layer 3:" 422 | 423 | self.layer.append(Maxout_conv_layer( 424 | rng, 425 | image_shape=(batch_size, 128, 8, 8), 426 | zero_pad = 2, # add n zero on both side of the input 427 | output_shape=(batch_size, 128, 4, 4), 428 | filter_shape=(128, 128, 5, 5), 429 | filter_stride = 1, 430 | n_pieces = 2, 431 | pool_shape=(3, 3), 432 | pool_stride =2, 433 | p = 0.5, 434 | scale = 0.5, 435 | max_col_norm = 1.9365, 436 | format = format, 437 | comp_precision = comp_precision, 438 | update_precision = update_precision, 439 | initial_range = initial_range, 440 | max_overflow = max_overflow, 441 | w_LR_scale = 0.2, 442 | b_LR_scale = 0.2, 443 | # partial_sum = 8 # total number = 9*9 444 | )) 445 | 446 | print " Maxout layer:" 447 | 448 | self.layer.append(MaxoutLayer( 449 | rng = rng, 450 | n_inputs= 128*4*4, 451 | n_units = 400, 452 | n_pieces = 5, 453 | p = 0.5, 454 | scale = 0.5, 455 | max_col_norm = 1.9365, 456 | format = format, 457 | comp_precision = comp_precision, 458 | update_precision = update_precision, 459 | initial_range = initial_range, 460 | max_overflow = max_overflow 461 | )) 462 | 463 | print " Softmax layer:" 464 | 465 | self.layer.append(SoftmaxLayer( 466 | rng = rng, 467 | n_inputs= 400, 468 | n_units = 10, 469 | p = 0.5, 470 | scale = 0.5, 471 | max_col_norm = 1.9365, 472 | format = format, 473 | comp_precision = comp_precision, 474 | update_precision = update_precision, 475 | initial_range = initial_range, 476 | max_overflow = max_overflow 477 | )) 478 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Matthieu Courbariaux 2 | 3 | # This file is part of deep-learning-multipliers. 4 | 5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | 10 | # deep-learning-multipliers is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | 15 | # You should have received a copy of the GNU General Public License 16 | # along with deep-learning-multipliers. If not, see . 17 | 18 | import gzip 19 | import cPickle 20 | import numpy as np 21 | import os 22 | import os.path 23 | import sys 24 | import theano 25 | import theano.tensor as T 26 | import time 27 | 28 | # TRAINING 29 | 30 | class Trainer(object): 31 | 32 | def __init__(self, 33 | rng, save_path, load_path, 34 | train_set, valid_set, test_set, 35 | model, 36 | LR_start, LR_sat, LR_fin, M_start, M_sat, M_fin, 37 | batch_size, gpu_batches, 38 | n_epoch, 39 | format, range_update_frequency, range_init_epoch, 40 | shuffle_batches, shuffle_examples): 41 | 42 | print ' Training algorithm:' 43 | print ' Learning rate = %f' %(LR_start) 44 | print ' Learning rate saturation = %i' %(LR_sat) 45 | print ' Final learning rate = %f' %(LR_fin) 46 | print ' Momentum = %f' %(M_start) 47 | print ' Momentum saturation = %i' %(M_sat) 48 | print ' Final momentum = %f' %(M_fin) 49 | print ' Batch size = %i' %(batch_size) 50 | print ' gpu_batches = %i' %(gpu_batches) 51 | print ' Number of epochs = %i' %(n_epoch) 52 | print ' shuffle_batches = %i' %(shuffle_batches) 53 | print ' shuffle_examples = %i' %(shuffle_examples) 54 | print ' Format = '+ format 55 | print ' Range update frequency = %i' %(range_update_frequency) 56 | print ' Range init epochs = %i' %(range_init_epoch) 57 | 58 | # save the dataset 59 | self.rng = rng 60 | self.shuffle_batches = shuffle_batches 61 | self.shuffle_examples = shuffle_examples 62 | self.load_path = load_path 63 | self.save_path = save_path 64 | self.train_set = train_set 65 | self.valid_set = valid_set 66 | self.test_set = test_set 67 | 68 | # save the model 69 | self.model = model 70 | 71 | # save the parameters 72 | self.LR_start = LR_start 73 | self.LR_sat = LR_sat 74 | self.LR_fin = LR_fin 75 | self.M_start = M_start 76 | self.M_sat = M_sat 77 | self.M_fin = M_fin 78 | self.batch_size = batch_size 79 | self.gpu_batches = gpu_batches 80 | self.n_epoch = n_epoch 81 | self.format = format 82 | self.range_update_frequency = range_update_frequency 83 | self.range_init_epoch = range_init_epoch 84 | 85 | # put a part of the dataset on gpu 86 | self.shared_x = theano.shared( 87 | np.asarray(self.train_set.X[0:self.batch_size*self.gpu_batches], dtype=theano.config.floatX)) 88 | self.shared_y = theano.shared( 89 | np.asarray(self.train_set.y[0:self.batch_size*self.gpu_batches], dtype=theano.config.floatX)) 90 | 91 | def shuffle(self, set): 92 | 93 | # on the CPU for the moment. 94 | X = np.copy(set.X) 95 | y = np.copy(set.y) 96 | 97 | shuffled_index = range(set.X.shape[0]) 98 | self.rng.shuffle(shuffled_index) 99 | 100 | for i in range(set.X.shape[0]): 101 | set.X[i] = X[shuffled_index[i]] 102 | set.y[i] = y[shuffled_index[i]] 103 | 104 | def init_range(self): 105 | 106 | # save the precisions and the random parameters of the model 107 | comp_precision = self.model.get_comp_precision() 108 | update_precision = self.model.get_update_precision() 109 | self.model.save_params() 110 | 111 | # set a good precision 112 | self.model.set_comp_precision(31) 113 | self.model.set_update_precision(31) 114 | 115 | # train n epochs to adjust the initial range 116 | for k in range(self.range_init_epoch): 117 | self.train_epoch(self.train_set) 118 | 119 | # set back the precision and the random parameters 120 | self.model.set_comp_precision(comp_precision) 121 | self.model.set_update_precision(update_precision) 122 | self.model.load_params() 123 | 124 | def init(self): 125 | 126 | if self.load_path != None: 127 | self.model.load_params_file(self.load_path) 128 | 129 | self.LR = self.LR_start 130 | self.LR_step = (self.LR_fin-self.LR_start)/self.LR_sat 131 | self.M = self.M_start 132 | self.M_step = (self.M_fin-self.M_start)/self.M_sat 133 | 134 | self.epoch = 0 135 | self.best_epoch = self.epoch 136 | 137 | # test it on the validation set 138 | self.validation_ER = self.test_epoch(self.valid_set) 139 | # test it on the test set 140 | self.test_ER = self.test_epoch(self.test_set) 141 | 142 | self.best_validation_ER = self.validation_ER 143 | self.best_test_ER = self.test_ER 144 | 145 | if self.format == "DFXP" : 146 | self.init_range() 147 | 148 | def update(self): 149 | 150 | # start by shuffling train set 151 | if self.shuffle_examples == True: 152 | self.shuffle(self.train_set) 153 | 154 | self.epoch += 1 155 | 156 | # train the model on all training examples 157 | self.train_epoch(self.train_set) 158 | 159 | # test it on the validation set 160 | self.validation_ER = self.test_epoch(self.valid_set) 161 | 162 | # test it on the test set 163 | self.test_ER = self.test_epoch(self.test_set) 164 | 165 | # update LR and M as well during the first phase 166 | self.update_LR() 167 | self.update_M() 168 | 169 | # save the best parameters 170 | if self.validation_ER < self.best_validation_ER: 171 | self.best_validation_ER = self.validation_ER 172 | self.best_test_ER = self.test_ER 173 | self.best_epoch = self.epoch 174 | if self.save_path != None: 175 | self.model.save_params_file(self.save_path) 176 | 177 | def load_shared_dataset(self, set, start,size): 178 | 179 | self.shared_x.set_value( 180 | set.X[self.batch_size*start:self.batch_size*(size+start)]) 181 | self.shared_y.set_value( 182 | set.y[self.batch_size*start:self.batch_size*(size+start)]) 183 | 184 | def train_epoch(self, set): 185 | 186 | # number of batch in the dataset 187 | n_batches = np.int(np.floor(set.X.shape[0]/self.batch_size)) 188 | # number of group of batches (in the memory of the GPU) 189 | n_gpu_batches = np.int(np.floor(n_batches/self.gpu_batches)) 190 | 191 | # number of batches in the last group 192 | if self.gpu_batches<=n_batches: 193 | n_remaining_batches = n_batches%self.gpu_batches 194 | else: 195 | n_remaining_batches = n_batches 196 | 197 | # batch counter for the range update frequency 198 | k = 0 199 | 200 | shuffled_range_i = range(n_gpu_batches) 201 | 202 | if self.shuffle_batches==True: 203 | self.rng.shuffle(shuffled_range_i) 204 | 205 | for i in shuffled_range_i: 206 | 207 | self.load_shared_dataset(set, 208 | start=i*self.gpu_batches, 209 | size=self.gpu_batches) 210 | 211 | shuffled_range_j = range(self.gpu_batches) 212 | 213 | if self.shuffle_batches==True: 214 | self.rng.shuffle(shuffled_range_j) 215 | 216 | for j in shuffled_range_j: 217 | 218 | self.train_batch(j, self.LR, self.M) 219 | 220 | # update the dynamic ranges every range_update_frequency epoch 221 | if self.format == "DFXP" : 222 | k+=1 223 | if k==self.range_update_frequency: 224 | self.update_range(k) 225 | k=0 226 | 227 | # load the last incomplete gpu batch of batches 228 | if n_remaining_batches > 0: 229 | 230 | self.load_shared_dataset(set, 231 | start=n_gpu_batches*self.gpu_batches, 232 | size=n_remaining_batches) 233 | 234 | shuffled_range_j = range(n_remaining_batches) 235 | if self.shuffle_batches==True: 236 | self.rng.shuffle(shuffled_range_j) 237 | 238 | for j in shuffled_range_j: 239 | 240 | self.train_batch(j, self.LR, self.M) 241 | 242 | # update the dynamic ranges every range_update_frequency epoch 243 | if self.format == "DFXP" : 244 | k+=1 245 | if k==self.range_update_frequency: 246 | self.update_range(k) 247 | k=0 248 | 249 | def test_epoch(self, set): 250 | 251 | n_batches = np.int(np.floor(set.X.shape[0]/self.batch_size)) 252 | n_gpu_batches = np.int(np.floor(n_batches/self.gpu_batches)) 253 | 254 | if self.gpu_batches<=n_batches: 255 | n_remaining_batches = n_batches%self.gpu_batches 256 | else: 257 | n_remaining_batches = n_batches 258 | 259 | error_rate = 0. 260 | 261 | for i in range(n_gpu_batches): 262 | 263 | self.load_shared_dataset(set, 264 | start=i*self.gpu_batches, 265 | size=self.gpu_batches) 266 | 267 | for j in range(self.gpu_batches): 268 | 269 | error_rate += self.test_batch(j) 270 | 271 | # load the last incomplete gpu batch of batches 272 | if n_remaining_batches > 0: 273 | 274 | self.load_shared_dataset(set, 275 | start=n_gpu_batches*self.gpu_batches, 276 | size=n_remaining_batches) 277 | 278 | for j in range(n_remaining_batches): 279 | 280 | error_rate += self.test_batch(j) 281 | 282 | error_rate /= (n_batches*self.batch_size) 283 | error_rate *= 100. 284 | 285 | return error_rate 286 | 287 | def update_LR(self): 288 | 289 | if self.LR > self.LR_fin: 290 | self.LR += self.LR_step 291 | else: 292 | self.LR = self.LR_fin 293 | 294 | def update_M(self): 295 | 296 | if self.M < self.M_fin: 297 | self.M += self.M_step 298 | else: 299 | self.M = self.M_fin 300 | 301 | def monitor(self): 302 | 303 | print ' epoch %i:' %(self.epoch) 304 | print ' learning rate %f' %(self.LR) 305 | print ' momentum %f' %(self.M) 306 | print ' validation error rate %f%%' %(self.validation_ER) 307 | print ' test error rate %f%%' %(self.test_ER) 308 | print ' epoch associated to best validation error %i' %(self.best_epoch) 309 | print ' best validation error rate %f%%' %(self.best_validation_ER) 310 | print ' test error rate associated to best validation error %f%%' %(self.best_test_ER) 311 | 312 | if self.format == "DFXP": 313 | self.model.print_range() 314 | 315 | def train(self): 316 | 317 | self.init() 318 | self.monitor() 319 | 320 | for epoch in range(self.n_epoch): 321 | 322 | self.update() 323 | self.monitor() 324 | 325 | def build(self): 326 | 327 | # input and output variables 328 | x = T.matrix('x') 329 | y = T.matrix('y') 330 | index = T.lscalar() 331 | batch_count = T.lscalar() 332 | LR = T.scalar('LR', dtype=theano.config.floatX) 333 | M = T.scalar('M', dtype=theano.config.floatX) 334 | 335 | # before the build, you work with symbolic variables 336 | # after the build, you work with numeric variables 337 | 338 | self.train_batch = theano.function(inputs=[index,LR,M], updates=self.model.updates(x,y,LR,M),givens={ 339 | x: self.shared_x[index * self.batch_size:(index + 1) * self.batch_size], 340 | y: self.shared_y[index * self.batch_size:(index + 1) * self.batch_size]}, 341 | name = "train_batch", on_unused_input='warn') 342 | 343 | self.test_batch = theano.function(inputs=[index],outputs=self.model.errors(x,y),givens={ 344 | x: self.shared_x[index * self.batch_size:(index + 1) * self.batch_size], 345 | y: self.shared_y[index * self.batch_size:(index + 1) * self.batch_size]}, 346 | name = "test_batch") 347 | 348 | if self.format == "DFXP" : 349 | self.update_range = theano.function(inputs=[batch_count],updates=self.model.range_updates(batch_count), name = "update_range") 350 | -------------------------------------------------------------------------------- /utilities/filter_plot.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy 3 | 4 | def scale_to_unit_interval(ndar, eps=1e-8): 5 | """ Scales all values in the ndarray ndar to be between 0 and 1 """ 6 | ndar = ndar.copy() 7 | ndar -= ndar.min() 8 | ndar *= 1.0 / (ndar.max() + eps) 9 | return ndar 10 | 11 | 12 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), 13 | scale_rows_to_unit_interval=True, 14 | output_pixel_vals=True): 15 | """ 16 | Transform an array with one flattened image per row, into an array in 17 | which images are reshaped and layed out like tiles on a floor. 18 | 19 | This function is useful for visualizing datasets whose rows are images, 20 | and also columns of matrices for transforming those rows 21 | (such as the first layer of a neural net). 22 | 23 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can 24 | be 2-D ndarrays or None; 25 | :param X: a 2-D array in which every row is a flattened image. 26 | 27 | :type img_shape: tuple; (height, width) 28 | :param img_shape: the original shape of each image 29 | 30 | :type tile_shape: tuple; (rows, cols) 31 | :param tile_shape: the number of images to tile (rows, cols) 32 | 33 | :param output_pixel_vals: if output should be pixel values (i.e. int8 34 | values) or floats 35 | 36 | :param scale_rows_to_unit_interval: if the values need to be scaled before 37 | being plotted to [0,1] or not 38 | 39 | 40 | :returns: array suitable for viewing as an image. 41 | (See:`Image.fromarray`.) 42 | :rtype: a 2-d array with same dtype as X. 43 | 44 | """ 45 | 46 | assert len(img_shape) == 2 47 | assert len(tile_shape) == 2 48 | assert len(tile_spacing) == 2 49 | 50 | # The expression below can be re-written in a more C style as 51 | # follows : 52 | # 53 | # out_shape = [0,0] 54 | # out_shape[0] = (img_shape[0] + tile_spacing[0]) * tile_shape[0] - 55 | # tile_spacing[0] 56 | # out_shape[1] = (img_shape[1] + tile_spacing[1]) * tile_shape[1] - 57 | # tile_spacing[1] 58 | out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp 59 | in zip(img_shape, tile_shape, tile_spacing)] 60 | 61 | if isinstance(X, tuple): 62 | assert len(X) == 4 63 | # Create an output numpy ndarray to store the image 64 | if output_pixel_vals: 65 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8') 66 | else: 67 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype) 68 | 69 | #colors default to 0, alpha defaults to 1 (opaque) 70 | if output_pixel_vals: 71 | channel_defaults = [0, 0, 0, 255] 72 | else: 73 | channel_defaults = [0., 0., 0., 1.] 74 | 75 | for i in xrange(4): 76 | if X[i] is None: 77 | # if channel is None, fill it with zeros of the correct 78 | # dtype 79 | out_array[:, :, i] = numpy.zeros(out_shape, 80 | dtype='uint8' if output_pixel_vals else out_array.dtype 81 | ) + channel_defaults[i] 82 | else: 83 | # use a recurrent call to compute the channel and store it 84 | # in the output 85 | out_array[:, :, i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals) 86 | return out_array 87 | 88 | else: 89 | # if we are dealing with only one channel 90 | H, W = img_shape 91 | Hs, Ws = tile_spacing 92 | 93 | # generate a matrix to store the output 94 | out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype) 95 | 96 | 97 | for tile_row in xrange(tile_shape[0]): 98 | for tile_col in xrange(tile_shape[1]): 99 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 100 | if scale_rows_to_unit_interval: 101 | # if we should scale values to be between 0 and 1 102 | # do this by calling the `scale_to_unit_interval` 103 | # function 104 | this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)) 105 | else: 106 | this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape) 107 | # add the slice to the corresponding position in the 108 | # output array 109 | out_array[ 110 | tile_row * (H+Hs): tile_row * (H + Hs) + H, 111 | tile_col * (W+Ws): tile_col * (W + Ws) + W 112 | ] \ 113 | = this_img * (255 if output_pixel_vals else 1) 114 | return out_array -------------------------------------------------------------------------------- /utilities/goliat2_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | # gpu 5 | nb_gpu = 4 6 | delay_between_lauches = 5*60 # 5 minutes 7 | time_per_job = 7.5*60*60 # CIFAR10 9h in float32 with 1.3 more neurones, partial sum =1, no range updates, 200 epochs 8 | gpu_id = 0 9 | 10 | # hyper parameters lists 11 | comp_NOB = [9,11] 12 | up_NOB = [11,13] 13 | NOIB = [8] 14 | dynamic_range = [1] 15 | 16 | for i in comp_NOB: 17 | for j in up_NOB: 18 | for k in NOIB: 19 | for l in dynamic_range: 20 | 21 | command = "THEANO_FLAGS='device=gpu"+str(gpu_id)+"' python main.py "+str(i)+" "+str(j)+" "+str(k)+" "+str(l)+" &> "+str(i)+"_"+str(j)+"_"+str(k)+"_"+str(l)+".txt &" 22 | os.system(command) 23 | print command 24 | 25 | if gpu_id == nb_gpu - 1: 26 | 27 | gpu_id = 0 28 | time.sleep(time_per_job) 29 | print " " 30 | 31 | else: 32 | 33 | gpu_id = gpu_id+1 34 | time.sleep(delay_between_lauches) 35 | 36 | -------------------------------------------------------------------------------- /utilities/goliat3_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | # gpu 5 | nb_gpu = 4 6 | delay_between_lauches = 5*60 # 5 minutes 7 | time_per_job = 7.5*60*60 # CIFAR10 9h in float32 with 1.3 more neurones, partial sum =1, no range updates, 200 epochs 8 | gpu_id = 0 9 | 10 | # hyper parameters lists 11 | comp_NOB = [19,21] 12 | up_NOB = [17,19] 13 | NOIB = [5] 14 | dynamic_range = [0] 15 | 16 | for i in comp_NOB: 17 | for j in up_NOB: 18 | for k in NOIB: 19 | for l in dynamic_range: 20 | 21 | command = "THEANO_FLAGS='device=gpu"+str(gpu_id)+"' python main.py "+str(i)+" "+str(j)+" "+str(k)+" "+str(l)+" &> "+str(i)+"_"+str(j)+"_"+str(k)+"_"+str(l)+".txt &" 22 | os.system(command) 23 | print command 24 | 25 | if gpu_id == nb_gpu - 1: 26 | 27 | gpu_id = 0 28 | time.sleep(time_per_job) 29 | print " " 30 | 31 | else: 32 | 33 | gpu_id = gpu_id+1 34 | time.sleep(delay_between_lauches) 35 | 36 | -------------------------------------------------------------------------------- /utilities/goliat4_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | # gpu 5 | nb_gpu = 4 6 | delay_between_lauches = 5*60 # 2 minutes 7 | time_per_job = 7*60*60 # 9 in float32 for 1.3 more neurones, partial sum, no range updates, for 200 epochs 8 | gpu_id = 0 9 | 10 | # hyper parameters lists 11 | comp_NOB = [31] 12 | up_NOB = [31] 13 | NOIB = [3,5,6] 14 | dynamic_range = [0] 15 | 16 | for i in comp_NOB: 17 | for j in up_NOB: 18 | for k in NOIB: 19 | for l in dynamic_range: 20 | 21 | command = "THEANO_FLAGS='device=gpu"+str(gpu_id)+"' python main.py "+str(i)+" "+str(j)+" "+str(k)+" "+str(l)+" &> "+str(i)+"_"+str(j)+"_"+str(k)+"_"+str(l)+".txt &" 22 | os.system(command) 23 | print command 24 | 25 | if gpu_id == nb_gpu - 1: 26 | 27 | gpu_id = 0 28 | time.sleep(time_per_job) 29 | print " " 30 | 31 | else: 32 | 33 | if gpu_id == 0: 34 | gpu_id = gpu_id+2 # gpu1 does not work :( 35 | else: 36 | gpu_id = gpu_id+1 37 | 38 | time.sleep(delay_between_lauches) -------------------------------------------------------------------------------- /utilities/results_extractor.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | import re 4 | 5 | comp_precision = [19,21,23] 6 | update_precision = [17,19] 7 | initial_range = [5] 8 | dynamic_range = [0] 9 | 10 | csv_file = open('X_X_5_0.csv', 'w') 11 | csv_writer = csv.writer(csv_file, lineterminator = '\n') 12 | csv_writer.writerow(["comp_precision","update_precision", "initial_range","dynamic_range","validation_error","test_error"]) 13 | 14 | for j in comp_precision: 15 | for k in update_precision: 16 | for l in initial_range: 17 | for m in dynamic_range: 18 | 19 | name = str(j) + "_" + str(k) + "_" + str(l) + "_" + str(m) + ".txt" 20 | f = open(name, 'r').readlines() 21 | 22 | length = len(f) 23 | 24 | print f[length-3] 25 | validation_error = float(re.findall("\d+.\d+", f[length-3])[0])/100. 26 | 27 | print f[length-2] 28 | test_error = float(re.findall("\d+.\d+", f[length-2])[0])/100. 29 | 30 | # print f[length-3-44-1] 31 | # validation_error = float(re.findall("\d+.\d+", f[length-3-44-1])[0])/100. 32 | 33 | # print f[length-2-44-1] 34 | # test_error = float(re.findall("\d+.\d+", f[length-2-44-1])[0])/100. 35 | 36 | csv_writer.writerow([j,k,l,m,validation_error,test_error]) -------------------------------------------------------------------------------- /utilities/svhn_preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import shutil 4 | from theano import config 5 | from pylearn2.datasets import preprocessing 6 | from pylearn2.datasets.svhn import SVHN 7 | from pylearn2.utils.string_utils import preprocess 8 | 9 | orig_path = preprocess('${PYLEARN2_DATA_PATH}/SVHN/format2') 10 | try: 11 | local_path = preprocess('${SVHN_LOCAL_PATH}') 12 | except ValueError: 13 | raise ValueError("You need to define SVHN_LOCAL_PATH environment " 14 | "variable.") 15 | 16 | train_name ='h5/splitted_train_32x32.h5' 17 | valid_name = 'h5/valid_32x32.h5' 18 | test_name = 'h5/test_32x32.h5' 19 | 20 | # copy data if don't exist 21 | if not os.path.isdir(os.path.join(local_path, 'h5')): 22 | os.makedirs(os.path.join(local_path, 'h5')) 23 | 24 | for d_set in [train_name, valid_name, test_name]: 25 | if not os.path.isfile(os.path.join(local_path, d_set)): 26 | logging.info("Copying data from {0} to {1}".format(os.path.join(local_path, d_set), local_path)) 27 | shutil.copyfile(os.path.join(orig_path, d_set), 28 | os.path.join(local_path, d_set)) 29 | 30 | def check_dtype(data): 31 | if str(data.X.dtype) != config.floatX: 32 | logging.warning("The dataset is saved as {}, changing theano's floatX "\ 33 | "to the same dtype".format(data.X.dtype)) 34 | config.floatX = str(data.X.dtype) 35 | 36 | # Load train data 37 | train = SVHN('splitted_train', path=local_path) 38 | check_dtype(train) 39 | 40 | # prepare preprocessing 41 | pipeline = preprocessing.Pipeline() 42 | # without batch_size there is a high chance that you might encounter memory error 43 | # or pytables crashes 44 | pipeline.items.append(preprocessing.GlobalContrastNormalization(batch_size=5000)) 45 | pipeline.items.append(preprocessing.LeCunLCN((32,32))) 46 | 47 | # apply the preprocessings to train 48 | train.apply_preprocessor(pipeline, can_fit=True) 49 | del train 50 | 51 | # load and preprocess valid 52 | valid = SVHN('valid', path=local_path) 53 | check_dtype(valid) 54 | valid.apply_preprocessor(pipeline, can_fit=False) 55 | 56 | # load and preprocess test 57 | test = SVHN('test', path=local_path) 58 | check_dtype(test) 59 | test.apply_preprocessor(pipeline, can_fit=False) 60 | --------------------------------------------------------------------------------