├── utilities
│   ├── goliat2_script.py
│   ├── goliat3_script.py
│   ├── goliat4_script.py
│   ├── results_extractor.py
│   ├── svhn_preprocessing.py
│   └── filter_plot.py
├── README.md
├── format.py
├── main.py
├── trainer.py
├── LICENSE.txt
├── model.py
└── layer.py
/utilities/goliat2_script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | # gpu
5 | nb_gpu = 4
6 | delay_between_launches = 5*60 # 5 minutes
7 | time_per_job = 7.5*60*60 # CIFAR10 takes ~9h in float32 with 1.3x more neurons, partial sum = 1, no range updates, 200 epochs
8 | gpu_id = 0
9 |
10 | # hyper parameters lists
11 | comp_NOB = [9,11]
12 | up_NOB = [11,13]
13 | NOIB = [8]
14 | dynamic_range = [1]
15 |
16 | for i in comp_NOB:
17 | for j in up_NOB:
18 | for k in NOIB:
19 | for l in dynamic_range:
20 |
21 | command = "THEANO_FLAGS='device=gpu"+str(gpu_id)+"' python main.py "+str(i)+" "+str(j)+" "+str(k)+" "+str(l)+" &> "+str(i)+"_"+str(j)+"_"+str(k)+"_"+str(l)+".txt &"
22 | os.system(command)
23 | print command
24 |
25 | if gpu_id == nb_gpu - 1:
26 |
27 | gpu_id = 0
28 | time.sleep(time_per_job)
29 | print " "
30 |
31 | else:
32 |
33 | gpu_id = gpu_id+1
34 | time.sleep(delay_between_launches)
35 |
36 |
--------------------------------------------------------------------------------
/utilities/goliat3_script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | # gpu
5 | nb_gpu = 4
6 | delay_between_launches = 5*60 # 5 minutes
7 | time_per_job = 7.5*60*60 # CIFAR10 takes ~9h in float32 with 1.3x more neurons, partial sum = 1, no range updates, 200 epochs
8 | gpu_id = 0
9 |
10 | # hyper parameters lists
11 | comp_NOB = [19,21]
12 | up_NOB = [17,19]
13 | NOIB = [5]
14 | dynamic_range = [0]
15 |
16 | for i in comp_NOB:
17 | for j in up_NOB:
18 | for k in NOIB:
19 | for l in dynamic_range:
20 |
21 | command = "THEANO_FLAGS='device=gpu"+str(gpu_id)+"' python main.py "+str(i)+" "+str(j)+" "+str(k)+" "+str(l)+" &> "+str(i)+"_"+str(j)+"_"+str(k)+"_"+str(l)+".txt &"
22 | os.system(command)
23 | print command
24 |
25 | if gpu_id == nb_gpu - 1:
26 |
27 | gpu_id = 0
28 | time.sleep(time_per_job)
29 | print " "
30 |
31 | else:
32 |
33 | gpu_id = gpu_id+1
34 | time.sleep(delay_between_launches)
35 |
36 |
--------------------------------------------------------------------------------
/utilities/goliat4_script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | # gpu
5 | nb_gpu = 4
6 | delay_between_launches = 5*60 # 5 minutes
7 | time_per_job = 7*60*60 # ~9h in float32 with 1.3x more neurons, partial sum, no range updates, 200 epochs
8 | gpu_id = 0
9 |
10 | # hyper parameters lists
11 | comp_NOB = [31]
12 | up_NOB = [31]
13 | NOIB = [3,5,6]
14 | dynamic_range = [0]
15 |
16 | for i in comp_NOB:
17 | for j in up_NOB:
18 | for k in NOIB:
19 | for l in dynamic_range:
20 |
21 | command = "THEANO_FLAGS='device=gpu"+str(gpu_id)+"' python main.py "+str(i)+" "+str(j)+" "+str(k)+" "+str(l)+" &> "+str(i)+"_"+str(j)+"_"+str(k)+"_"+str(l)+".txt &"
22 | os.system(command)
23 | print command
24 |
25 | if gpu_id == nb_gpu - 1:
26 |
27 | gpu_id = 0
28 | time.sleep(time_per_job)
29 | print " "
30 |
31 | else:
32 |
33 | if gpu_id == 0:
34 | gpu_id = gpu_id+2 # gpu1 does not work :(
35 | else:
36 | gpu_id = gpu_id+1
37 |
38 | time.sleep(delay_between_launches)
--------------------------------------------------------------------------------
/utilities/results_extractor.py:
--------------------------------------------------------------------------------
1 |
2 | import csv
3 | import re
4 |
5 | comp_precision = [19,21,23]
6 | update_precision = [17,19]
7 | initial_range = [5]
8 | dynamic_range = [0]
9 |
10 | csv_file = open('X_X_5_0.csv', 'w')
11 | csv_writer = csv.writer(csv_file, lineterminator = '\n')
12 | csv_writer.writerow(["comp_precision","update_precision", "initial_range","dynamic_range","validation_error","test_error"])
13 |
14 | for j in comp_precision:
15 | for k in update_precision:
16 | for l in initial_range:
17 | for m in dynamic_range:
18 |
19 | name = str(j) + "_" + str(k) + "_" + str(l) + "_" + str(m) + ".txt"
20 | f = open(name, 'r').readlines()
21 |
22 | length = len(f)
23 |
24 | print f[length-3]
25 | validation_error = float(re.findall(r"\d+\.\d+", f[length-3])[0])/100.
26 |
27 | print f[length-2]
28 | test_error = float(re.findall(r"\d+\.\d+", f[length-2])[0])/100.
29 |
30 | # print f[length-3-44-1]
31 | # validation_error = float(re.findall("\d+.\d+", f[length-3-44-1])[0])/100.
32 |
33 | # print f[length-2-44-1]
34 | # test_error = float(re.findall("\d+.\d+", f[length-2-44-1])[0])/100.
35 |
36 | csv_writer.writerow([j,k,l,m,validation_error,test_error])
--------------------------------------------------------------------------------
/utilities/svhn_preprocessing.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import shutil
4 | from theano import config
5 | from pylearn2.datasets import preprocessing
6 | from pylearn2.datasets.svhn import SVHN
7 | from pylearn2.utils.string_utils import preprocess
8 |
9 | orig_path = preprocess('${PYLEARN2_DATA_PATH}/SVHN/format2')
10 | try:
11 | local_path = preprocess('${SVHN_LOCAL_PATH}')
12 | except ValueError:
13 | raise ValueError("You need to define SVHN_LOCAL_PATH environment "
14 | "variable.")
15 |
16 | train_name ='h5/splitted_train_32x32.h5'
17 | valid_name = 'h5/valid_32x32.h5'
18 | test_name = 'h5/test_32x32.h5'
19 |
20 | # copy the data if it does not exist yet
21 | if not os.path.isdir(os.path.join(local_path, 'h5')):
22 | os.makedirs(os.path.join(local_path, 'h5'))
23 |
24 | for d_set in [train_name, valid_name, test_name]:
25 | if not os.path.isfile(os.path.join(local_path, d_set)):
26 | logging.info("Copying data from {0} to {1}".format(os.path.join(orig_path, d_set), os.path.join(local_path, d_set)))
27 | shutil.copyfile(os.path.join(orig_path, d_set),
28 | os.path.join(local_path, d_set))
29 |
30 | def check_dtype(data):
31 | if str(data.X.dtype) != config.floatX:
32 | logging.warning("The dataset is saved as {}, changing theano's floatX "\
33 | "to the same dtype".format(data.X.dtype))
34 | config.floatX = str(data.X.dtype)
35 |
36 | # Load train data
37 | train = SVHN('splitted_train', path=local_path)
38 | check_dtype(train)
39 |
40 | # prepare preprocessing
41 | pipeline = preprocessing.Pipeline()
42 | # without batch_size there is a high chance that you might encounter a memory error
43 | # or a PyTables crash
44 | pipeline.items.append(preprocessing.GlobalContrastNormalization(batch_size=5000))
45 | pipeline.items.append(preprocessing.LeCunLCN((32,32)))
46 |
47 | # apply the preprocessings to train
48 | train.apply_preprocessor(pipeline, can_fit=True)
49 | del train
50 |
51 | # load and preprocess valid
52 | valid = SVHN('valid', path=local_path)
53 | check_dtype(valid)
54 | valid.apply_preprocessor(pipeline, can_fit=False)
55 |
56 | # load and preprocess test
57 | test = SVHN('test', path=local_path)
58 | check_dtype(test)
59 | test.apply_preprocessor(pipeline, can_fit=False)
60 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # deep-learning-multipliers
2 |
3 | ## Requirements
4 |
5 | * Theano 0.6 (Bleeding edge version)
6 | * Pylearn2 0.1
7 | * PyTables (for the SVHN dataset)
8 | * a CUDA capable GPU
9 |
10 | ## Goal
11 |
12 | This code was written to allow anyone to easily reproduce the results
13 | of the article "Deep learning with low precision multipliers", available at http://arxiv.org/abs/1412.7024.
14 | The article assesses whether it is possible to train deep neural networks with low-precision multipliers.
15 |
16 | Note that this code only simulates the impact of low-precision multipliers.
17 | It does not exploit them in any way.
18 | If you are looking for fast low-precision GPU kernels, NervanaSystems has made some available at https://github.com/NervanaSystems/nervanagpu.
19 |
20 | ## How to run it
21 |
22 | ### Command line
23 |
24 | python main.py [task] [format] [initial range] [propagations bit-width]
25 | [parameter updates bit-width] [range update frequency]
26 | [maximum overflow rate] [number of epochs of range initialization]
27 |
28 | ### Task
29 |
30 | There are 4 different tasks: the permutation invariant MNIST (PI_MNIST),
31 | MNIST, CIFAR10 and SVHN.
32 | A set of hyperparameters is associated with each of those tasks
33 | (they are stored in model.py).
34 | For the SVHN dataset,
35 | you need to set an environment variable:
36 |
37 | SVHN_LOCAL_PATH=/tmp/SVHN/
38 |
39 | You then need to pre-process it with the script
40 | utilities/svhn_preprocessing.py (script taken from pylearn2).
41 |
42 | ### Format
43 |
44 | There are 4 different formats: floating point (FLP),
45 | half floating point (HFLP),
46 | fixed point (FXP) and dynamic fixed point (DFXP).
47 |
48 | ### Initial range
49 |
50 | Initial range is only useful for FXP and DFXP.
51 | It is the initial position of the radix point
52 | for the fixed point formats.
53 | 5 works most of the time.
54 |
55 | ### Propagations and parameter updates bit-widths
56 |
57 | Only useful for FXP and DFXP.
58 | These are the bit-widths of the propagations
59 | and of the parameter updates, respectively.
60 | Note that the sign bit is not counted in the bit-width.
61 |
62 | ### Range update frequency
63 |
64 | The range update frequency is only useful for DFXP.
65 | It is the number of batches between two range updates.
66 |
67 | ### Maximum overflow rate
68 |
69 | Only useful for DFXP.
70 | It is the overflow rate tolerated before the range is modified.
71 |
72 | ### Number of epochs of range initialization
73 |
74 | Only useful for DFXP.
75 | This is the number of epochs we train with high precision
76 | to find the initial scaling factors.
77 | Once they are found,
78 | the parameters are reinitialized, and the DFXP training can begin.
79 |
80 | ### Examples
81 |
82 | python main.py PI_MNIST FLP
83 | python main.py SVHN FXP 5 19 19
84 | python main.py CIFAR10 DFXP 5 9 11 100 0.0001 2
85 |
86 |
--------------------------------------------------------------------------------
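A minimal end-to-end sketch of the SVHN setup described in the README above, using the README's suggested path and one of its example runs:

    export SVHN_LOCAL_PATH=/tmp/SVHN/
    python utilities/svhn_preprocessing.py
    python main.py SVHN FXP 5 19 19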
/format.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Matthieu Courbariaux
2 |
3 | # This file is part of deep-learning-multipliers.
4 |
5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # deep-learning-multipliers is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with deep-learning-multipliers. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import gzip
19 | import cPickle
20 | import numpy as np
21 | import os
22 | import os.path
23 | import sys
24 | import theano
25 | import theano.tensor as T
26 | import time
27 |
28 | from theano.scalar.basic import UnaryScalarOp, same_out_nocomplex
29 | from theano.tensor.elemwise import Elemwise
30 |
31 | def apply_format(format, X, NOB, NOIB):
32 |
33 | if format == "FXP" or format == "DFXP":
34 | return fixed_point(X,NOB, NOIB)
35 |
36 | elif format == "FLP":
37 | return X
38 |
39 | elif format == "HFLP":
40 | return float16(X)
41 |
42 | # float16 function
43 | # we are using the nvidia cuda function (only works on GPU)
44 | class Float16(UnaryScalarOp):
45 |
46 | def impl(self, x):
47 | return np.float32(np.float16(x))
48 |
49 | def c_code(self, node, name, (x,), (z,), sub):
50 | return "%(z)s = __half2float(__float2half_rn(%(x)s));" % locals()
51 | float16_scalar = Float16(same_out_nocomplex, name='float16')
52 | float16 = Elemwise(float16_scalar)
53 |
54 | # this function simulates the precision and the range of a fixed-point
55 | # representation while working with floats
56 | # NOB = Number Of Bits = bit-width
57 | # NOIB = Number Of Integer Bits = position of the radix point = range
58 | def fixed_point(X,NOB, NOIB):
59 |
60 | power = T.cast(2.**(NOB - NOIB), theano.config.floatX) # float !
61 | max = T.cast((2.**NOB)-1, theano.config.floatX)
62 | value = X*power
63 | value = T.round(value) # rounding
64 | value = T.clip(value, -max, max) # saturation arithmetic
65 | value = value/power
66 | return value
67 |
68 | # compute the new range of the dynamic fixed point representation
69 | def new_range(overflow, overflow_1, max_overflow):
70 |
71 | # the goal is to update the range of the vector
72 | # we know the overflow rates associated with range (overflow)
73 | # and range-1 (overflow_1)
74 | # if (overflow > max_overflow): increment the range
75 | # else if (overflow_1 <= max_overflow): decrement the range, otherwise keep it
76 | return T.switch(T.gt(overflow, max_overflow), 1,
77 | T.switch(T.gt(overflow_1, max_overflow), 0, - 1))
78 |
79 | # Overflow rate of a vector knowing its NOIB and NOB
80 | def overflow(vector, NOB, NOIB):
81 |
82 | # compute the max value of the fixed point representation (i.e. the overflow value)
83 | max = ((2.**NOB)-1)/(2.**(NOB - NOIB))
84 |
85 | # compute the overflow rate of the vector
86 | overflow = T.mean(T.switch(T.ge(T.abs_(vector), max), 1., 0.))
87 |
88 | return overflow
89 |
--------------------------------------------------------------------------------
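To make the fixed-point simulation above concrete, here is a small standalone numpy sketch (illustration only; it assumes the same conventions as fixed_point() and overflow() in format.py: NOB excludes the sign bit and NOIB is the number of integer bits):

    import numpy as np

    def fixed_point_np(x, NOB, NOIB):
        # same rounding/saturation scheme as fixed_point() in format.py
        power = 2. ** (NOB - NOIB)                  # 2^(number of fractional bits)
        max_val = (2. ** NOB) - 1.                  # largest magnitude in scaled units
        return np.clip(np.round(x * power), -max_val, max_val) / power

    def overflow_np(x, NOB, NOIB):
        # fraction of entries that saturate, as in overflow() in format.py
        max_val = ((2. ** NOB) - 1.) / (2. ** (NOB - NOIB))
        return np.mean(np.abs(x) >= max_val)

    x = np.array([0.1234, 1.5, -3.9, 20.0])
    print(fixed_point_np(x, NOB=7, NOIB=3))   # [ 0.125  1.5  -3.875  7.9375]
    print(overflow_np(x, NOB=7, NOIB=3))      # 0.25, compared against max_overflow by new_range()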
/utilities/filter_plot.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy
3 |
4 | def scale_to_unit_interval(ndar, eps=1e-8):
5 | """ Scales all values in the ndarray ndar to be between 0 and 1 """
6 | ndar = ndar.copy()
7 | ndar -= ndar.min()
8 | ndar *= 1.0 / (ndar.max() + eps)
9 | return ndar
10 |
11 |
12 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
13 | scale_rows_to_unit_interval=True,
14 | output_pixel_vals=True):
15 | """
16 | Transform an array with one flattened image per row, into an array in
17 | which images are reshaped and laid out like tiles on a floor.
18 |
19 | This function is useful for visualizing datasets whose rows are images,
20 | and also columns of matrices for transforming those rows
21 | (such as the first layer of a neural net).
22 |
23 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
24 | be 2-D ndarrays or None;
25 | :param X: a 2-D array in which every row is a flattened image.
26 |
27 | :type img_shape: tuple; (height, width)
28 | :param img_shape: the original shape of each image
29 |
30 | :type tile_shape: tuple; (rows, cols)
31 | :param tile_shape: the number of images to tile (rows, cols)
32 |
33 | :param output_pixel_vals: if output should be pixel values (i.e. int8
34 | values) or floats
35 |
36 | :param scale_rows_to_unit_interval: if the values need to be scaled before
37 | being plotted to [0,1] or not
38 |
39 |
40 | :returns: array suitable for viewing as an image.
41 | (See:`Image.fromarray`.)
42 | :rtype: a 2-d array with same dtype as X.
43 |
44 | """
45 |
46 | assert len(img_shape) == 2
47 | assert len(tile_shape) == 2
48 | assert len(tile_spacing) == 2
49 |
50 | # The expression below can be re-written in a more C style as
51 | # follows :
52 | #
53 | # out_shape = [0,0]
54 | # out_shape[0] = (img_shape[0] + tile_spacing[0]) * tile_shape[0] -
55 | # tile_spacing[0]
56 | # out_shape[1] = (img_shape[1] + tile_spacing[1]) * tile_shape[1] -
57 | # tile_spacing[1]
58 | out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
59 | in zip(img_shape, tile_shape, tile_spacing)]
60 |
61 | if isinstance(X, tuple):
62 | assert len(X) == 4
63 | # Create an output numpy ndarray to store the image
64 | if output_pixel_vals:
65 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
66 | else:
67 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)
68 |
69 | #colors default to 0, alpha defaults to 1 (opaque)
70 | if output_pixel_vals:
71 | channel_defaults = [0, 0, 0, 255]
72 | else:
73 | channel_defaults = [0., 0., 0., 1.]
74 |
75 | for i in xrange(4):
76 | if X[i] is None:
77 | # if channel is None, fill it with zeros of the correct
78 | # dtype
79 | out_array[:, :, i] = numpy.zeros(out_shape,
80 | dtype='uint8' if output_pixel_vals else out_array.dtype
81 | ) + channel_defaults[i]
82 | else:
83 | # use a recurrent call to compute the channel and store it
84 | # in the output
85 | out_array[:, :, i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals)
86 | return out_array
87 |
88 | else:
89 | # if we are dealing with only one channel
90 | H, W = img_shape
91 | Hs, Ws = tile_spacing
92 |
93 | # generate a matrix to store the output
94 | out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)
95 |
96 |
97 | for tile_row in xrange(tile_shape[0]):
98 | for tile_col in xrange(tile_shape[1]):
99 | if tile_row * tile_shape[1] + tile_col < X.shape[0]:
100 | if scale_rows_to_unit_interval:
101 | # if we should scale values to be between 0 and 1
102 | # do this by calling the `scale_to_unit_interval`
103 | # function
104 | this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape))
105 | else:
106 | this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
107 | # add the slice to the corresponding position in the
108 | # output array
109 | out_array[
110 | tile_row * (H+Hs): tile_row * (H + Hs) + H,
111 | tile_col * (W+Ws): tile_col * (W + Ws) + W
112 | ] \
113 | = this_img * (255 if output_pixel_vals else 1)
114 | return out_array
--------------------------------------------------------------------------------
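A hypothetical usage sketch for tile_raster_images() above (the import assumes utilities/ is on the Python path; PIL is only needed if you want to save the result):

    import numpy
    from filter_plot import tile_raster_images

    # 100 random 28x28 "filters", one flattened image per row
    X = numpy.random.rand(100, 28 * 28)

    # tile them into a 10 x 10 mosaic with 1-pixel spacing
    tiled = tile_raster_images(X, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))

    print(tiled.shape)   # (289, 289), dtype uint8
    # with PIL: Image.fromarray(tiled).save('filters.png')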
/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Matthieu Courbariaux
2 |
3 | # This file is part of deep-learning-multipliers.
4 |
5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # deep-learning-multipliers is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with deep-learning-multipliers. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import gzip
19 | import cPickle
20 | import numpy as np
21 | import os
22 | import os.path
23 | import sys
24 | import time
25 |
26 | from trainer import Trainer
27 | from model import PI_MNIST_model, MNIST_model, CIFAR10_SVHN_model
28 |
29 | from pylearn2.datasets.mnist import MNIST
30 | from pylearn2.datasets.zca_dataset import ZCA_Dataset
31 | from pylearn2.datasets.svhn import SVHN
32 | from pylearn2.utils import serial
33 |
34 | def onehot(x,numclasses=None):
35 |
36 | if x.shape==():
37 | x = x[None]
38 | if numclasses is None:
39 | numclasses = np.max(x) + 1
40 | result = np.zeros(list(x.shape) + [numclasses], dtype="int")
41 | z = np.zeros(x.shape)
42 | for c in range(numclasses):
43 | z *= 0
44 | z[np.where(x==c)] = 1
45 | result[...,c] += z
46 |
47 | result = np.reshape(result,(np.shape(result)[0], np.shape(result)[result.ndim-1]))
48 | return result
49 |
50 | # MAIN
51 |
52 | if __name__ == "__main__":
53 |
54 | print 'Beginning of the program'
55 | start_time = time.clock()
56 |
57 | print 'Loading the dataset'
58 |
59 | dataset = sys.argv[1]
60 |
61 | if dataset == "PI_MNIST" or dataset == "MNIST":
62 |
63 | train_set = MNIST(which_set= 'train',start=0, stop = 50000)#, center = True)
64 | valid_set = MNIST(which_set= 'train',start=50000, stop = 60000)#, center = True)
65 | test_set = MNIST(which_set= 'test')#, center = True)
66 |
67 | # for both datasets, onehot the target
68 | train_set.y = np.float32(onehot(train_set.y))
69 | valid_set.y = np.float32(onehot(valid_set.y))
70 | test_set.y = np.float32(onehot(test_set.y))
71 |
72 | elif dataset == "CIFAR10":
73 |
74 | preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/preprocessor.pkl")
75 | train_set = ZCA_Dataset(
76 | preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"),
77 | preprocessor = preprocessor,
78 | start=0, stop = 45000)
79 | valid_set = ZCA_Dataset(
80 | preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"),
81 | preprocessor = preprocessor,
82 | start=45000, stop = 50000)
83 | test_set = ZCA_Dataset(
84 | preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/test.pkl"),
85 | preprocessor = preprocessor)
86 |
87 | # onehot the target
88 | train_set.y = np.float32(onehot(train_set.y))
89 | valid_set.y = np.float32(onehot(valid_set.y))
90 | test_set.y = np.float32(onehot(test_set.y))
91 |
92 | elif dataset == "SVHN":
93 |
94 | train_set = SVHN(
95 | which_set= 'splitted_train',
96 | path= "${SVHN_LOCAL_PATH}",
97 | axes= ['b', 'c', 0, 1])
98 |
99 | valid_set = SVHN(
100 | which_set= 'valid',
101 | path= "${SVHN_LOCAL_PATH}",
102 | axes= ['b', 'c', 0, 1])
103 |
104 | test_set = SVHN(
105 | which_set= 'test',
106 | path= "${SVHN_LOCAL_PATH}",
107 | axes= ['b', 'c', 0, 1])
108 |
109 | print 'Creating the model'
110 |
111 | # storing format hyperparameters
112 | format = sys.argv[2]
113 |
114 | initial_range = 0
115 | comp_precision = 0
116 | update_precision = 0
117 | range_update_frequency = 0
118 | max_overflow = 0
119 | range_init_epoch = 0
120 |
121 | if format == "FXP" or format == "DFXP":
122 | initial_range = int(sys.argv[3])
123 | comp_precision = int(sys.argv[4])
124 | update_precision = int(sys.argv[5])
125 |
126 | if format == "DFXP":
127 | range_update_frequency = int(sys.argv[6])
128 | max_overflow = float(sys.argv[7])
129 | range_init_epoch = int(sys.argv[8])
130 |
131 | if dataset == "PI_MNIST":
132 |
133 | rng = np.random.RandomState(1234)
134 | LR_start = 0.05
135 | batch_size = 100
136 | gpu_batches = 500
137 | n_epoch = 800
138 |
139 | model = PI_MNIST_model(rng = rng, batch_size = batch_size,
140 | n_input = 784, n_output = 10, n_hidden = 240, n_pieces = 5, n_hidden_layers = 2,
141 | p_input = 0.8, scale_input = 1., p_hidden = 0.5, scale_hidden = 0.5,
142 | max_col_norm = 1.9365, format = format,
143 | comp_precision = comp_precision, update_precision = update_precision,
144 | initial_range = initial_range, max_overflow = max_overflow)
145 |
146 | trainer = Trainer(rng = rng, load_path = None, save_path = None,
147 | train_set = train_set, valid_set = valid_set, test_set = test_set,
148 | model = model,
149 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/4, M_fin = 0.7,
150 | batch_size = batch_size, gpu_batches = gpu_batches,
151 | n_epoch = n_epoch,
152 | shuffle_batches = False, shuffle_examples = True,
153 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch)
154 |
155 | elif dataset == "MNIST":
156 |
157 | rng = np.random.RandomState(1234)
158 | LR_start = 0.02
159 | batch_size = 128
160 | gpu_batches = 391 # 391 -> 50000, 196 -> 25000, 79 -> 10000
161 | n_epoch = 800
162 |
163 | model = MNIST_model(rng = rng, batch_size = batch_size, format = format,
164 | comp_precision = comp_precision, update_precision = update_precision,
165 | initial_range = initial_range, max_overflow = max_overflow)
166 |
167 | trainer = Trainer(rng = rng, load_path = None, save_path = None,
168 | train_set = train_set, valid_set = valid_set, test_set = test_set,
169 | model = model,
170 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/4, M_fin = 0.7,
171 | batch_size = batch_size, gpu_batches = gpu_batches,
172 | n_epoch = n_epoch,
173 | shuffle_batches = False, shuffle_examples = True,
174 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch)
175 |
176 | elif dataset == "CIFAR10":
177 |
178 | rng = np.random.RandomState(1234)
179 | LR_start = 0.02
180 | batch_size = 128
181 | gpu_batches = 79 # 391 -> 50000, 196 -> 25000, 79 -> 10000
182 | n_epoch = 400
183 |
184 | model = CIFAR10_SVHN_model(rng = rng, batch_size = batch_size, format = format,
185 | comp_precision = comp_precision, update_precision = update_precision,
186 | initial_range = initial_range, max_overflow = max_overflow)
187 |
188 | trainer = Trainer(rng = rng, load_path = None, save_path = None,
189 | train_set = train_set, valid_set = valid_set, test_set = test_set,
190 | model = model,
191 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/2, M_fin = 0.7,
192 | batch_size = batch_size, gpu_batches = gpu_batches,
193 | n_epoch = n_epoch,
194 | shuffle_batches = False, shuffle_examples = True,
195 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch)
196 |
197 | elif dataset == "SVHN":
198 |
199 | rng = np.random.RandomState(1234)
200 | LR_start = 0.05
201 | batch_size = 128
202 | gpu_batches = 79 # 391 -> 50000, 196 -> 25000, 79 -> 10000
203 | n_epoch = 200
204 |
205 | model = CIFAR10_SVHN_model(rng = rng, batch_size = batch_size, format = format,
206 | comp_precision = comp_precision, update_precision = update_precision,
207 | initial_range = initial_range, max_overflow = max_overflow)
208 |
209 | trainer = Trainer(rng = rng, load_path = None, save_path = None,
210 | train_set = train_set, valid_set = valid_set, test_set = test_set,
211 | model = model,
212 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/2, M_fin = 0.7,
213 | batch_size = batch_size, gpu_batches = gpu_batches,
214 | n_epoch = n_epoch,
215 | shuffle_batches = True, shuffle_examples = False,
216 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch)
217 |
218 | print 'Building'
219 |
220 | trainer.build()
221 |
222 | print 'Training'
223 |
224 | trainer.train()
225 |
226 | end_time = time.clock()
227 | print 'The code ran for %i seconds'%(end_time - start_time)
228 |
--------------------------------------------------------------------------------
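For reference, what onehot() in main.py computes amounts to indexing the identity matrix with the labels; a small numpy sketch (assuming integer labels in 0..numclasses-1, not the repository's code):

    import numpy as np

    y = np.array([3, 0, 2])               # integer class labels
    onehot_y = np.eye(4, dtype="int")[y]  # each label selects a row of the identity matrix
    print(onehot_y)                       # [[0 0 0 1] [1 0 0 0] [0 0 1 0]]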
/trainer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Matthieu Courbariaux
2 |
3 | # This file is part of deep-learning-multipliers.
4 |
5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # deep-learning-multipliers is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with deep-learning-multipliers. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import gzip
19 | import cPickle
20 | import numpy as np
21 | import os
22 | import os.path
23 | import sys
24 | import theano
25 | import theano.tensor as T
26 | import time
27 |
28 | # TRAINING
29 |
30 | class Trainer(object):
31 |
32 | def __init__(self,
33 | rng, save_path, load_path,
34 | train_set, valid_set, test_set,
35 | model,
36 | LR_start, LR_sat, LR_fin, M_start, M_sat, M_fin,
37 | batch_size, gpu_batches,
38 | n_epoch,
39 | format, range_update_frequency, range_init_epoch,
40 | shuffle_batches, shuffle_examples):
41 |
42 | print ' Training algorithm:'
43 | print ' Learning rate = %f' %(LR_start)
44 | print ' Learning rate saturation = %i' %(LR_sat)
45 | print ' Final learning rate = %f' %(LR_fin)
46 | print ' Momentum = %f' %(M_start)
47 | print ' Momentum saturation = %i' %(M_sat)
48 | print ' Final momentum = %f' %(M_fin)
49 | print ' Batch size = %i' %(batch_size)
50 | print ' gpu_batches = %i' %(gpu_batches)
51 | print ' Number of epochs = %i' %(n_epoch)
52 | print ' shuffle_batches = %i' %(shuffle_batches)
53 | print ' shuffle_examples = %i' %(shuffle_examples)
54 | print ' Format = '+ format
55 | print ' Range update frequency = %i' %(range_update_frequency)
56 | print ' Range init epochs = %i' %(range_init_epoch)
57 |
58 | # save the dataset
59 | self.rng = rng
60 | self.shuffle_batches = shuffle_batches
61 | self.shuffle_examples = shuffle_examples
62 | self.load_path = load_path
63 | self.save_path = save_path
64 | self.train_set = train_set
65 | self.valid_set = valid_set
66 | self.test_set = test_set
67 |
68 | # save the model
69 | self.model = model
70 |
71 | # save the parameters
72 | self.LR_start = LR_start
73 | self.LR_sat = LR_sat
74 | self.LR_fin = LR_fin
75 | self.M_start = M_start
76 | self.M_sat = M_sat
77 | self.M_fin = M_fin
78 | self.batch_size = batch_size
79 | self.gpu_batches = gpu_batches
80 | self.n_epoch = n_epoch
81 | self.format = format
82 | self.range_update_frequency = range_update_frequency
83 | self.range_init_epoch = range_init_epoch
84 |
85 | # put a part of the dataset on gpu
86 | self.shared_x = theano.shared(
87 | np.asarray(self.train_set.X[0:self.batch_size*self.gpu_batches], dtype=theano.config.floatX))
88 | self.shared_y = theano.shared(
89 | np.asarray(self.train_set.y[0:self.batch_size*self.gpu_batches], dtype=theano.config.floatX))
90 |
91 | def shuffle(self, set):
92 |
93 | # on the CPU for the moment.
94 | X = np.copy(set.X)
95 | y = np.copy(set.y)
96 |
97 | shuffled_index = range(set.X.shape[0])
98 | self.rng.shuffle(shuffled_index)
99 |
100 | for i in range(set.X.shape[0]):
101 | set.X[i] = X[shuffled_index[i]]
102 | set.y[i] = y[shuffled_index[i]]
103 |
104 | def init_range(self):
105 |
106 | # save the precisions and the random parameters of the model
107 | comp_precision = self.model.get_comp_precision()
108 | update_precision = self.model.get_update_precision()
109 | self.model.save_params()
110 |
111 | # set a good precision
112 | self.model.set_comp_precision(31)
113 | self.model.set_update_precision(31)
114 |
115 | # train n epochs to adjust the initial range
116 | for k in range(self.range_init_epoch):
117 | self.train_epoch(self.train_set)
118 |
119 | # set back the precision and the random parameters
120 | self.model.set_comp_precision(comp_precision)
121 | self.model.set_update_precision(update_precision)
122 | self.model.load_params()
123 |
124 | def init(self):
125 |
126 | if self.load_path != None:
127 | self.model.load_params_file(self.load_path)
128 |
129 | self.LR = self.LR_start
130 | self.LR_step = (self.LR_fin-self.LR_start)/self.LR_sat
131 | self.M = self.M_start
132 | self.M_step = (self.M_fin-self.M_start)/self.M_sat
133 |
134 | self.epoch = 0
135 | self.best_epoch = self.epoch
136 |
137 | # test it on the validation set
138 | self.validation_ER = self.test_epoch(self.valid_set)
139 | # test it on the test set
140 | self.test_ER = self.test_epoch(self.test_set)
141 |
142 | self.best_validation_ER = self.validation_ER
143 | self.best_test_ER = self.test_ER
144 |
145 | if self.format == "DFXP" :
146 | self.init_range()
147 |
148 | def update(self):
149 |
150 | # start by shuffling train set
151 | if self.shuffle_examples == True:
152 | self.shuffle(self.train_set)
153 |
154 | self.epoch += 1
155 |
156 | # train the model on all training examples
157 | self.train_epoch(self.train_set)
158 |
159 | # test it on the validation set
160 | self.validation_ER = self.test_epoch(self.valid_set)
161 |
162 | # test it on the test set
163 | self.test_ER = self.test_epoch(self.test_set)
164 |
165 | # update LR and M as well during the first phase
166 | self.update_LR()
167 | self.update_M()
168 |
169 | # save the best parameters
170 | if self.validation_ER < self.best_validation_ER:
171 | self.best_validation_ER = self.validation_ER
172 | self.best_test_ER = self.test_ER
173 | self.best_epoch = self.epoch
174 | if self.save_path != None:
175 | self.model.save_params_file(self.save_path)
176 |
177 | def load_shared_dataset(self, set, start,size):
178 |
179 | self.shared_x.set_value(
180 | set.X[self.batch_size*start:self.batch_size*(size+start)])
181 | self.shared_y.set_value(
182 | set.y[self.batch_size*start:self.batch_size*(size+start)])
183 |
184 | def train_epoch(self, set):
185 |
186 | # number of batches in the dataset
187 | n_batches = np.int(np.floor(set.X.shape[0]/self.batch_size))
188 | # number of groups of batches (that fit in the memory of the GPU)
189 | n_gpu_batches = np.int(np.floor(n_batches/self.gpu_batches))
190 |
191 | # number of batches in the last group
192 | if self.gpu_batches<=n_batches:
193 | n_remaining_batches = n_batches%self.gpu_batches
194 | else:
195 | n_remaining_batches = n_batches
196 |
197 | # batch counter for the range update frequency
198 | k = 0
199 |
200 | shuffled_range_i = range(n_gpu_batches)
201 |
202 | if self.shuffle_batches==True:
203 | self.rng.shuffle(shuffled_range_i)
204 |
205 | for i in shuffled_range_i:
206 |
207 | self.load_shared_dataset(set,
208 | start=i*self.gpu_batches,
209 | size=self.gpu_batches)
210 |
211 | shuffled_range_j = range(self.gpu_batches)
212 |
213 | if self.shuffle_batches==True:
214 | self.rng.shuffle(shuffled_range_j)
215 |
216 | for j in shuffled_range_j:
217 |
218 | self.train_batch(j, self.LR, self.M)
219 |
220 | # update the dynamic ranges every range_update_frequency batches
221 | if self.format == "DFXP" :
222 | k+=1
223 | if k==self.range_update_frequency:
224 | self.update_range(k)
225 | k=0
226 |
227 | # load the last incomplete gpu batch of batches
228 | if n_remaining_batches > 0:
229 |
230 | self.load_shared_dataset(set,
231 | start=n_gpu_batches*self.gpu_batches,
232 | size=n_remaining_batches)
233 |
234 | shuffled_range_j = range(n_remaining_batches)
235 | if self.shuffle_batches==True:
236 | self.rng.shuffle(shuffled_range_j)
237 |
238 | for j in shuffled_range_j:
239 |
240 | self.train_batch(j, self.LR, self.M)
241 |
242 | # update the dynamic ranges every range_update_frequency batches
243 | if self.format == "DFXP" :
244 | k+=1
245 | if k==self.range_update_frequency:
246 | self.update_range(k)
247 | k=0
248 |
249 | def test_epoch(self, set):
250 |
251 | n_batches = np.int(np.floor(set.X.shape[0]/self.batch_size))
252 | n_gpu_batches = np.int(np.floor(n_batches/self.gpu_batches))
253 |
254 | if self.gpu_batches<=n_batches:
255 | n_remaining_batches = n_batches%self.gpu_batches
256 | else:
257 | n_remaining_batches = n_batches
258 |
259 | error_rate = 0.
260 |
261 | for i in range(n_gpu_batches):
262 |
263 | self.load_shared_dataset(set,
264 | start=i*self.gpu_batches,
265 | size=self.gpu_batches)
266 |
267 | for j in range(self.gpu_batches):
268 |
269 | error_rate += self.test_batch(j)
270 |
271 | # load the last incomplete gpu batch of batches
272 | if n_remaining_batches > 0:
273 |
274 | self.load_shared_dataset(set,
275 | start=n_gpu_batches*self.gpu_batches,
276 | size=n_remaining_batches)
277 |
278 | for j in range(n_remaining_batches):
279 |
280 | error_rate += self.test_batch(j)
281 |
282 | error_rate /= (n_batches*self.batch_size)
283 | error_rate *= 100.
284 |
285 | return error_rate
286 |
287 | def update_LR(self):
288 |
289 | if self.LR > self.LR_fin:
290 | self.LR += self.LR_step
291 | else:
292 | self.LR = self.LR_fin
293 |
294 | def update_M(self):
295 |
296 | if self.M < self.M_fin:
297 | self.M += self.M_step
298 | else:
299 | self.M = self.M_fin
300 |
301 | def monitor(self):
302 |
303 | print ' epoch %i:' %(self.epoch)
304 | print ' learning rate %f' %(self.LR)
305 | print ' momentum %f' %(self.M)
306 | print ' validation error rate %f%%' %(self.validation_ER)
307 | print ' test error rate %f%%' %(self.test_ER)
308 | print ' epoch associated to best validation error %i' %(self.best_epoch)
309 | print ' best validation error rate %f%%' %(self.best_validation_ER)
310 | print ' test error rate associated to best validation error %f%%' %(self.best_test_ER)
311 |
312 | if self.format == "DFXP":
313 | self.model.print_range()
314 |
315 | def train(self):
316 |
317 | self.init()
318 | self.monitor()
319 |
320 | for epoch in range(self.n_epoch):
321 |
322 | self.update()
323 | self.monitor()
324 |
325 | def build(self):
326 |
327 | # input and output variables
328 | x = T.matrix('x')
329 | y = T.matrix('y')
330 | index = T.lscalar()
331 | batch_count = T.lscalar()
332 | LR = T.scalar('LR', dtype=theano.config.floatX)
333 | M = T.scalar('M', dtype=theano.config.floatX)
334 |
335 | # before the build, you work with symbolic variables
336 | # after the build, you work with numeric variables
337 |
338 | self.train_batch = theano.function(inputs=[index,LR,M], updates=self.model.updates(x,y,LR,M),givens={
339 | x: self.shared_x[index * self.batch_size:(index + 1) * self.batch_size],
340 | y: self.shared_y[index * self.batch_size:(index + 1) * self.batch_size]},
341 | name = "train_batch", on_unused_input='warn')
342 |
343 | self.test_batch = theano.function(inputs=[index],outputs=self.model.errors(x,y),givens={
344 | x: self.shared_x[index * self.batch_size:(index + 1) * self.batch_size],
345 | y: self.shared_y[index * self.batch_size:(index + 1) * self.batch_size]},
346 | name = "test_batch")
347 |
348 | if self.format == "DFXP" :
349 | self.update_range = theano.function(inputs=[batch_count],updates=self.model.range_updates(batch_count), name = "update_range")
350 |
--------------------------------------------------------------------------------
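The learning rate and momentum follow linear ramps in Trainer (see init(), update_LR() and update_M()); a small sketch of the same rule, using the PI_MNIST settings from main.py (the schedule() helper is illustrative, not part of the repository):

    def schedule(start, fin, sat, n_epoch):
        # same per-epoch rule as Trainer.update_LR() / update_M():
        # step linearly until the final value is reached, then stay there
        step = (fin - start) / float(sat)
        value, values = start, []
        for epoch in range(n_epoch):
            values.append(value)
            if (step < 0 and value > fin) or (step > 0 and value < fin):
                value += step
            else:
                value = fin
        return values

    LR = schedule(0.05, 0.005, sat=400, n_epoch=800)   # LR_sat = n_epoch/2
    M = schedule(0.5, 0.7, sat=200, n_epoch=800)       # M_sat = n_epoch/4
    print("%.4f %.4f" % (LR[0], LR[-1]))               # 0.0500 0.0050
    print("%.4f %.4f" % (M[0], M[-1]))                 # 0.5000 0.7000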
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 | <one line to give the program's name and a brief idea of what it does.>
294 | Copyright (C) <year> <name of author>
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 | <signature of Ty Coon>, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Matthieu Courbariaux
2 |
3 | # This file is part of deep-learning-multipliers.
4 |
5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # deep-learning-multipliers is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with deep-learning-multipliers. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import gzip
19 | import cPickle
20 | import numpy as np
21 | import os
22 | import os.path
23 | import sys
24 | import theano
25 | import theano.tensor as T
26 | import time
27 |
28 | from layer import Maxout_conv_layer, SoftmaxLayer, MaxoutLayer
29 |
30 |
31 | class deep_dropout_network(object):
32 |
33 | layer = []
34 |
35 | def __init__(self, rng, batch_size, n_hidden_layers, comp_precision, update_precision,
36 | initial_range, max_overflow, format):
37 |
38 | print ' Overall description:'
39 | print ' Batch size = %i' %(batch_size)
40 | print ' Number of layers = %i' %(n_hidden_layers)
41 | print ' Computation precision = %i bits' %(comp_precision)
42 | print ' Update precision = %i bits' %(update_precision)
43 | print ' Initial range = %i bits' %(initial_range)
44 | print ' Maximum overflow rate = %f %%' %(max_overflow*100)
45 | print " Format = " + format
46 |
47 | self.rng = rng
48 | self.batch_size = batch_size
49 | self.n_hidden_layers = n_hidden_layers
50 | self.comp_precision = comp_precision
51 | self.update_precision = update_precision
52 | self.initial_range = initial_range
53 | self.max_overflow = max_overflow
54 | self.format = format
55 |
56 | def fprop(self, x):
57 |
58 | y = self.layer[0].fprop(x)
59 |
60 | for k in range(1,self.n_hidden_layers+1):
61 |
62 | y = self.layer[k].fprop(y)
63 |
64 | return y
65 |
66 | def dropout_fprop(self, x):
67 |
68 | y = self.layer[0].dropout_fprop(x)
69 |
70 | for k in range(1,self.n_hidden_layers+1):
71 |
72 | y = self.layer[k].dropout_fprop(y)
73 |
74 | return y
75 |
76 | # when using fixed point, T.grad cannot be applied directly to the cost -> the backward pass is done manually (bprop), reusing T.grad per layer with known_grads.
77 | def bprop(self, y, t):
78 |
79 | # there is a simplification between softmax derivative and nll derivative
80 | dEdy = (y-t)/T.cast(T.shape(y)[1],dtype=theano.config.floatX) # /2. # actually, it is dEdz and not dEdy
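# (Added note) standard result: with softmax outputs y_i = exp(z_i)/sum_j exp(z_j) and an
# NLL cost E = -sum_i t_i*log(y_i) for a one-hot target t, dE/dz_i = y_i - t_i, so the
# softmax and cost derivatives collapse into the single (y - t) term above; the division
# by T.shape(y)[1] simply rescales the error signal by the number of output units.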
81 |
82 | # bprop
83 | for k in range(self.n_hidden_layers,-1,-1):
84 | dEdy = self.layer[k].bprop(dEdy)
85 |
86 | # gathers the parameter updates of every layer, given the learning rate LR and the momentum M
87 | def parameter_updates(self, LR, M):
88 |
89 | # updates
90 | parameter_updates = self.layer[0].parameter_updates(LR, M)
91 | for k in range(1,self.n_hidden_layers+1):
92 | parameter_updates = parameter_updates + self.layer[k].parameter_updates(LR, M)
93 |
94 | return parameter_updates
95 |
96 | # function that updates the ranges of all fixed point vectors
97 | def range_updates(self,batch_count):
98 |
99 | range_updates = self.layer[0].range_updates(batch_count)
100 | for k in range(1,self.n_hidden_layers+1):
101 | range_updates = range_updates + self.layer[k].range_updates(batch_count)
102 |
103 | return range_updates
104 |
105 | # function that accumulates the overflow counters of all fixed point vectors
106 | def overflow_updates(self):
107 |
108 | overflow_updates = self.layer[0].overflow_updates()
109 | for k in range(1,self.n_hidden_layers+1):
110 | overflow_updates = overflow_updates + self.layer[k].overflow_updates()
111 |
112 | return overflow_updates
113 |
114 | # train function
115 | def updates(self, x, t, LR, M):
116 |
117 | y = self.dropout_fprop(x)
118 | self.bprop(y,t)
119 | updates = self.parameter_updates(LR,M)
120 |
121 | if self.format == "DFXP":
122 | updates += self.overflow_updates()
123 |
124 | return updates
125 |
126 | def errors(self, x, t):
127 |
128 | y = self.fprop(x)
129 |
130 | # error function
131 | errors = T.sum(T.neq(T.argmax(y, axis=1), T.argmax(t, axis=1)))
132 |
133 | return errors
134 |
135 | def save_params(self):
136 |
137 | self.W_save = []
138 | self.b_save = []
139 |
140 | for k in xrange(self.n_hidden_layers+1):
141 | self.W_save.append(self.layer[k].W.get_value(borrow=False))
142 | self.b_save.append(self.layer[k].b.get_value(borrow=False))
143 |
144 | def load_params(self):
145 |
146 | # read and load all the parameters
147 | for k in xrange(self.n_hidden_layers+1):
148 | self.layer[k].W.set_value(self.W_save[k])
149 | self.layer[k].b.set_value(self.b_save[k])
150 |
151 | def save_params_file(self, path):
152 |
153 | # Open the file and overwrite current contents
154 | save_file = open(path, 'wb')
155 |
156 | # write all the parameters in the file
157 | for k in xrange(self.n_hidden_layers+1):
158 | cPickle.dump(self.layer[k].W.get_value(), save_file, -1)
159 | cPickle.dump(self.layer[k].b.get_value(), save_file, -1)
160 |
161 | # close the file
162 | save_file.close()
163 |
164 | def load_params_file(self, path):
165 |
166 | # Open the file
167 | save_file = open(path)
168 |
170 | # read and load all the parameters
170 | for k in xrange(self.n_hidden_layers+1):
171 | self.layer[k].W.set_value(cPickle.load(save_file))
172 | self.layer[k].b.set_value(cPickle.load(save_file))
173 |
174 | # close the file
175 | save_file.close()
176 |
177 |
178 | def print_range(self):
179 |
180 | for k in xrange(self.n_hidden_layers+1):
181 | print ' Layer %i range:'%(k)
182 | self.layer[k].print_range()
183 |
184 | def set_comp_precision(self, comp_precision):
185 |
186 | for k in xrange(self.n_hidden_layers+1):
187 | self.layer[k].comp_precision.set_value(comp_precision)
188 |
189 | def get_comp_precision(self):
190 |
191 | return self.layer[0].comp_precision.get_value()
192 |
193 | def set_update_precision(self, update_precision):
194 |
195 | for k in xrange(self.n_hidden_layers+1):
196 | self.layer[k].update_precision.set_value(update_precision)
197 |
198 | def get_update_precision(self):
199 |
200 | return self.layer[0].update_precision.get_value()
201 |
202 | def set_max_overflow(self, max_overflow):
203 |
204 | for k in xrange(self.n_hidden_layers+1):
205 | self.layer[k].max_overflow.set_value(max_overflow)
206 |
207 | def get_max_overflow(self):
208 |
209 | return self.layer[0].max_overflow.get_value()
210 |
211 | class PI_MNIST_model(deep_dropout_network):
212 |
213 | def __init__(self, rng, batch_size, n_input, n_output, n_hidden, n_pieces, n_hidden_layers,
214 | p_input, scale_input, p_hidden, scale_hidden, max_col_norm, format,
215 | comp_precision, update_precision, initial_range, max_overflow):
216 |
217 | deep_dropout_network.__init__(self, rng, batch_size, n_hidden_layers, comp_precision, update_precision,
218 | initial_range, max_overflow, format)
219 |
220 | print ' n_input = %i' %(n_input)
221 | print ' n_output = %i' %(n_output)
222 | print ' n_hidden = %i' %(n_hidden)
223 | print ' n_pieces = %i' %(n_pieces)
224 | print ' p_input = %f' %(p_input)
225 | print ' scale_input = %f' %(scale_input)
226 | print ' p_hidden = %f' %(p_hidden)
227 | print ' scale_hidden = %f' %(scale_hidden)
228 | print ' max_col_norm = %f' %(max_col_norm)
229 |
230 | # save the parameters
231 | self.n_input = n_input
232 | self.n_output = n_output
233 | self.n_hidden = n_hidden
234 | self.n_pieces = n_pieces
235 | self.p_input = p_input
236 | self.scale_input = scale_input
237 | self.p_hidden = p_hidden
238 | self.scale_hidden = scale_hidden
239 | self.max_col_norm = max_col_norm
240 |
241 | # Create MLP layers
242 | if self.n_hidden_layers == 0 :
243 |
244 | print " Softmax layer:"
245 |
246 | self.layer.append(SoftmaxLayer(rng = self.rng, n_inputs=self.n_input, n_units=self.n_output,
247 | p = self.p_input, scale = self.scale_input, max_col_norm = self.max_col_norm, format = self.format,
248 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow))
249 |
250 | else :
251 |
252 | print " Maxout layer 1:"
253 |
254 | self.layer.append(MaxoutLayer(rng = self.rng, n_inputs = self.n_input, n_units = self.n_hidden, n_pieces = self.n_pieces,
255 | p = self.p_input, scale = self.scale_input, max_col_norm = self.max_col_norm, format = self.format,
256 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow))
257 |
258 | for k in range(1,self.n_hidden_layers):
259 |
260 | print " Maxout layer "+str(k+1)+":"
261 | self.layer.append(MaxoutLayer(rng = self.rng, n_inputs = self.n_hidden, n_units = self.n_hidden, n_pieces = self.n_pieces,
262 | p = self.p_hidden, scale = self.scale_hidden, max_col_norm = self.max_col_norm, format = self.format,
263 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow))
264 |
265 | print " Softmax layer:"
266 |
267 | self.layer.append(SoftmaxLayer(rng = self.rng, n_inputs= self.n_hidden, n_units= self.n_output,
268 | p = self.p_hidden, scale = self.scale_hidden, max_col_norm = self.max_col_norm, format = self.format,
269 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow))
270 |
271 | class MNIST_model(deep_dropout_network):
272 |
273 | def __init__(self, rng, batch_size, comp_precision, update_precision, initial_range, max_overflow, format):
274 |
275 | deep_dropout_network.__init__(self, rng, batch_size, 3, comp_precision, update_precision,
276 | initial_range, max_overflow, format)
277 |
278 | print " Convolution layer 1:"
279 |
280 | self.layer.append(Maxout_conv_layer(
281 | rng,
282 | image_shape=(batch_size, 1, 28, 28),
283 | zero_pad = 0,
284 | output_shape=(batch_size, 48, 10, 10),
285 | filter_shape=(48, 1, 8, 8),
286 | filter_stride = 1,
287 | n_pieces = 2,
288 | pool_shape=(4, 4),
289 | pool_stride = 2,
290 | p = 0.8,
291 | scale = 1.,
292 | max_col_norm = 0.9,
293 | format = format,
294 | comp_precision = comp_precision,
295 | update_precision = update_precision,
296 | initial_range = initial_range,
297 | max_overflow = max_overflow
298 | ))
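# (Added shape check, assuming cuda-convnet's output-size conventions) 28x28 inputs,
# no padding and 8x8 filters at stride 1 give 21x21 feature maps; 4x4 max pooling with
# stride 2 then yields ceil((21-4)/2)+1 = 10, matching the declared (batch_size, 48, 10, 10).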
299 |
300 |
301 | print " Convolution layer 2:"
302 |
303 | self.layer.append(Maxout_conv_layer(
304 | rng,
305 | image_shape=(batch_size, 48, 10, 10),
306 | zero_pad = 3, # add n zeros on both sides of the input
307 | output_shape=(batch_size, 48, 4, 4),
308 | filter_shape=(48, 48, 8, 8),
309 | filter_stride = 1,
310 | n_pieces = 2,
311 | pool_shape=(4, 4),
312 | pool_stride =2,
313 | p = 0.5,
314 | scale = 0.5,
315 | max_col_norm = 1.9365,
316 | format = format,
317 | comp_precision = comp_precision,
318 | update_precision = update_precision,
319 | initial_range = initial_range,
320 | max_overflow = max_overflow
321 | ))
322 |
323 |
324 | print " Convolution layer 3:"
325 |
326 | self.layer.append(Maxout_conv_layer(
327 | rng,
328 | image_shape=(batch_size, 48, 4, 4),
329 | zero_pad = 3, # add n zeros on both sides of the input
330 | output_shape=(batch_size, 24, 3, 3),
331 | filter_shape=(24, 48, 5, 5),
332 | filter_stride = 1,
333 | n_pieces = 4,
334 | pool_shape=(2, 2),
335 | pool_stride =2,
336 | p = 0.5,
337 | scale = 0.5,
338 | max_col_norm = 1.9365,
339 | format = format,
340 | comp_precision = comp_precision,
341 | update_precision = update_precision,
342 | initial_range = initial_range,
343 | max_overflow = max_overflow
344 | ))
345 |
346 | print " Softmax layer:"
347 |
348 | self.layer.append(SoftmaxLayer(
349 | rng = rng,
350 | n_inputs= 24*3*3,
351 | n_units = 10,
352 | p = 0.5,
353 | scale = 0.5,
354 | max_col_norm =1.9365,
355 | format = format,
356 | comp_precision = comp_precision,
357 | update_precision = update_precision,
358 | initial_range = initial_range,
359 | max_overflow = max_overflow
360 | ))
361 |
362 | class CIFAR10_SVHN_model(deep_dropout_network):
363 |
364 | def __init__(self, rng, batch_size, comp_precision, update_precision, initial_range, max_overflow, format):
365 |
366 | deep_dropout_network.__init__(self, rng, batch_size, 4, comp_precision, update_precision,
367 | initial_range, max_overflow, format)
368 |
369 | print " Convolution layer 1:"
370 |
371 | self.layer.append(Maxout_conv_layer(
372 | rng,
373 | image_shape=(batch_size, 3, 32, 32),
374 | zero_pad = 2,
375 | output_shape=(batch_size, 64, 16, 16), # 64 feature maps fit in memory
376 | filter_shape=(64, 3, 5, 5),
377 | filter_stride = 1,
378 | n_pieces = 2,
379 | pool_shape=(3, 3),
380 | pool_stride = 2,
381 | p = 0.8,
382 | scale = 1.,
383 | max_col_norm = 0.9,
384 | format = format,
385 | comp_precision = comp_precision,
386 | update_precision = update_precision,
387 | initial_range = initial_range,
388 | max_overflow = max_overflow,
389 | w_LR_scale = 0.2,
390 | b_LR_scale = 0.2,
391 | # partial_sum = 32 # total number = 33*33
392 | ))
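# (Added note) w_LR_scale and b_LR_scale multiply the global learning rate for this layer's
# parameters (see parameter_updates in layer.py), so the convolutional layers of this model
# are trained with 0.2x the learning rate of the fully connected layers.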
393 |
394 |
395 | print " Convolution layer 2:"
396 |
397 | self.layer.append(Maxout_conv_layer(
398 | rng,
399 | image_shape=(batch_size, 64, 16, 16),
400 | zero_pad = 2, # add n zeros on both sides of the input
401 | output_shape=(batch_size, 128, 8, 8),
402 | filter_shape=(128, 64, 5, 5),
403 | filter_stride = 1,
404 | n_pieces = 2,
405 | pool_shape=(3, 3),
406 | pool_stride =2,
407 | p = 0.5,
408 | scale = 0.5,
409 | max_col_norm = 1.9365,
410 | format = format,
411 | comp_precision = comp_precision,
412 | update_precision = update_precision,
413 | initial_range = initial_range,
414 | max_overflow = max_overflow,
415 | w_LR_scale = 0.2,
416 | b_LR_scale = 0.2,
417 | # partial_sum = 16 # total number = 15*15
418 | ))
419 |
420 |
421 | print " Convolution layer 3:"
422 |
423 | self.layer.append(Maxout_conv_layer(
424 | rng,
425 | image_shape=(batch_size, 128, 8, 8),
426 | zero_pad = 2, # add n zeros on both sides of the input
427 | output_shape=(batch_size, 128, 4, 4),
428 | filter_shape=(128, 128, 5, 5),
429 | filter_stride = 1,
430 | n_pieces = 2,
431 | pool_shape=(3, 3),
432 | pool_stride =2,
433 | p = 0.5,
434 | scale = 0.5,
435 | max_col_norm = 1.9365,
436 | format = format,
437 | comp_precision = comp_precision,
438 | update_precision = update_precision,
439 | initial_range = initial_range,
440 | max_overflow = max_overflow,
441 | w_LR_scale = 0.2,
442 | b_LR_scale = 0.2,
443 | # partial_sum = 8 # total number = 9*9
444 | ))
445 |
446 | print " Maxout layer:"
447 |
448 | self.layer.append(MaxoutLayer(
449 | rng = rng,
450 | n_inputs= 128*4*4,
451 | n_units = 400,
452 | n_pieces = 5,
453 | p = 0.5,
454 | scale = 0.5,
455 | max_col_norm = 1.9365,
456 | format = format,
457 | comp_precision = comp_precision,
458 | update_precision = update_precision,
459 | initial_range = initial_range,
460 | max_overflow = max_overflow
461 | ))
462 |
463 | print " Softmax layer:"
464 |
465 | self.layer.append(SoftmaxLayer(
466 | rng = rng,
467 | n_inputs= 400,
468 | n_units = 10,
469 | p = 0.5,
470 | scale = 0.5,
471 | max_col_norm = 1.9365,
472 | format = format,
473 | comp_precision = comp_precision,
474 | update_precision = update_precision,
475 | initial_range = initial_range,
476 | max_overflow = max_overflow
477 | ))
478 |
--------------------------------------------------------------------------------
/layer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Matthieu Courbariaux
2 |
3 | # This file is part of deep-learning-multipliers.
4 |
5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # deep-learning-multipliers is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with deep-learning-multipliers. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import gzip
19 | import cPickle
20 | import numpy as np
21 | import os
22 | import os.path
23 | import sys
24 | import theano
25 | import theano.tensor as T
26 | from theano import pp
27 | import time
28 | import scipy.stats
29 | from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs
30 | from theano.sandbox.cuda.basic_ops import gpu_contiguous
31 | from pylearn2.sandbox.cuda_convnet.pool import MaxPool
32 |
33 | from format import apply_format, overflow, new_range
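# (Added illustrative sketch, not part of the original file.) apply_format, overflow and
# new_range are defined in format.py; for the DFXP ("dynamic fixed point") format, a
# plausible rounding step for a tensor with comp_precision total bits and a "range" of
# NOIB integer bits would look roughly like:
#
# def dfxp_round_sketch(x, n_bits, n_int_bits):
#     step = 2. ** (n_int_bits - n_bits)   # smallest representable increment
#     bound = 2. ** n_int_bits             # saturation threshold
#     return np.clip(np.round(x / step) * step, -bound, bound - step)
#
# The authoritative implementation is the one in format.py.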
34 |
35 | class dropout_layer(object):
36 |
37 | def __init__(self, rng, p, scale, max_col_norm, format,
38 | comp_precision, update_precision, initial_range, max_overflow, w_LR_scale = 1., b_LR_scale = 1.):
39 |
40 | print " p = " + str(p)
41 | print " scale = " + str(scale)
42 | print " w_LR_scale = " + str(w_LR_scale)
43 | print " b_LR_scale = " + str(b_LR_scale)
44 | print " max_col_norm = " + str(max_col_norm)
45 | print " format = " + str(format)
46 |
47 | # save the parameters
48 | self.p = p
49 | self.scale = scale
50 | self.w_LR_scale = w_LR_scale
51 | self.b_LR_scale = b_LR_scale
52 | self.rng = rng
53 | self.max_col_norm = max_col_norm
54 | self.format = format
55 |
56 | # create shared variables
57 | self.comp_precision = theano.shared(value=comp_precision, name='comp_precision')
58 | self.update_precision = theano.shared(value=update_precision, name='update_precision')
59 | self.max_overflow = theano.shared(value=max_overflow, name='max_overflow')
60 |
61 | # create shared variables for the fixed point range
62 | self.z_range = theano.shared(value=initial_range, name='z_range')
63 | self.dEdz_range = theano.shared(value=initial_range, name='dEdz_range')
64 | self.y_range = theano.shared(value=initial_range, name='y_range')
65 | self.dEdy_range = theano.shared(value=initial_range, name='dEdy_range')
66 | self.w_range = theano.shared(value=initial_range, name='w_range')
67 | self.b_range = theano.shared(value=initial_range, name='b_range')
68 | self.dEdw_range = theano.shared(value=initial_range, name='dEdw_range')
69 | self.dEdb_range = theano.shared(value=initial_range, name='dEdb_range')
70 | self.update_w_range = theano.shared(value=initial_range, name='update_w_range')
71 | self.update_b_range = theano.shared(value=initial_range, name='update_b_range')
72 |
73 | # overflow counters for current range (needed to know when to augment the range)
74 | self.z_overflow = theano.shared(value=0., name='z_overflow')
75 | self.dEdz_overflow = theano.shared(value=0., name='dEdz_overflow')
76 | self.y_overflow = theano.shared(value=0., name='y_overflow')
77 | self.dEdy_overflow = theano.shared(value=0., name='dEdy_overflow')
78 | self.w_overflow = theano.shared(value=0., name='w_overflow')
79 | self.b_overflow = theano.shared(value=0., name='b_overflow')
80 | self.dEdw_overflow = theano.shared(value=0., name='dEdw_overflow')
81 | self.dEdb_overflow = theano.shared(value=0., name='dEdb_overflow')
82 | self.update_w_overflow = theano.shared(value=0., name='update_w_overflow')
83 | self.update_b_overflow = theano.shared(value=0., name='update_b_overflow')
84 |
85 | # overflow counters for current range - 1 (needed to know when to reduce the range)
86 | self.z_overflow_1 = theano.shared(value=0., name='z_overflow_1')
87 | self.dEdz_overflow_1 = theano.shared(value=0., name='dEdz_overflow_1')
88 | self.y_overflow_1 = theano.shared(value=0., name='y_overflow_1')
89 | self.dEdy_overflow_1 = theano.shared(value=0., name='dEdy_overflow_1')
90 | self.w_overflow_1 = theano.shared(value=0., name='w_overflow_1')
91 | self.b_overflow_1 = theano.shared(value=0., name='b_overflow_1')
92 | self.dEdw_overflow_1 = theano.shared(value=0., name='dEdw_overflow_1')
93 | self.dEdb_overflow_1 = theano.shared(value=0., name='dEdb_overflow_1')
94 | self.update_w_overflow_1 = theano.shared(value=0., name='update_w_overflow_1')
95 | self.update_b_overflow_1 = theano.shared(value=0., name='update_b_overflow_1')
96 |
97 | def fprop(self, input):
98 |
99 | # we reduce the precision of parameters for the computations
100 | self.w_comp = apply_format(self.format, self.W, self.comp_precision, self.w_range)
101 | self.b_comp = apply_format(self.format, self.b, self.comp_precision, self.b_range)
102 |
103 | # scaled weighted sum
104 | self.z = apply_format(self.format, T.dot(input, self.w_comp * self.scale) + self.b_comp*self.scale, self.comp_precision, self.z_range)
105 |
106 | # activation
107 | self.y = apply_format(self.format, self.activation(self.z), self.comp_precision, self.y_range)
108 |
109 | # return the output
110 | return self.y
111 |
112 | def dropout_fprop(self, input):
113 |
114 | # we reduce the precision of parameters for the computations
115 | self.fixed_W = apply_format(self.format, self.W, self.comp_precision, self.w_range)
116 | self.fixed_b = apply_format(self.format, self.b, self.comp_precision, self.b_range)
117 |
118 | # create the dropout mask
119 | # The cast is important because
120 | # int * float32 = float64 which pulls things off the gpu
121 | srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))
122 | self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)), theano.config.floatX)
123 |
124 | # apply the mask
125 | self.fixed_x = input * self.mask
126 |
127 | # weighted sum
128 | self.z = T.dot(self.fixed_x, self.fixed_W) + self.fixed_b
129 | self.fixed_z = apply_format(self.format, self.z, self.comp_precision, self.z_range)
130 |
131 | # activation
132 | self.y = self.activation(self.fixed_z)
133 | self.fixed_y = apply_format(self.format, self.y, self.comp_precision, self.y_range)
134 |
135 | # return the output
136 | return self.fixed_y
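# (Added note) fprop is the inference path: no dropout mask, and W and b are multiplied by
# `scale`. dropout_fprop is the training path: a Bernoulli(p) mask is applied to the input
# and no `scale` factor is used.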
137 |
138 | def activation(self):
139 |
140 | raise NotImplementedError("Subclass must implement abstract method")
141 |
142 | def activation_bprop(self):
143 |
144 | raise NotImplementedError("Subclass must implement abstract method")
145 |
146 | def bprop(self, dEdy):
147 |
148 | self.fixed_dEdy = apply_format(self.format, dEdy, self.comp_precision, self.dEdy_range)
149 |
150 | # activation
151 | self.activation_bprop()
152 |
153 | # compute gradients of parameters
154 | self.fixed_dEdW = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_W], known_grads={self.z:self.fixed_dEdz})[0], self.comp_precision, self.dEdw_range)
155 | self.fixed_dEdb = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_b], known_grads={self.z:self.fixed_dEdz})[0], self.comp_precision, self.dEdb_range)
156 |
157 | # weighted sum
158 | dEdx = T.grad(cost = None, wrt=[self.fixed_x], known_grads={self.z:self.fixed_dEdz})[0]
159 |
160 | # apply mask
161 | dEdx = self.mask * dEdx
162 |
163 | return dEdx
164 |
165 | def parameter_updates(self, LR, M):
166 |
167 | # compute updates
168 | new_update_W = apply_format(self.format, M * self.update_W - LR * self.w_LR_scale * self.fixed_dEdW, self.comp_precision, self.update_w_range)
169 | new_update_b = apply_format(self.format, M * self.update_b - LR * self.b_LR_scale * self.fixed_dEdb, self.comp_precision, self.update_b_range)
170 |
171 | # compute the new parameters. Note that we use a higher precision here than for the other operations
172 | new_W = apply_format(self.format, self.W + new_update_W, self.update_precision, self.w_range)
173 | new_b = apply_format(self.format, self.b + new_update_b, self.update_precision, self.b_range)
174 |
175 | # L2 column constraint on W
176 | col_norms = T.sqrt(T.sum(T.sqr(new_W), axis=0))
177 | # col_norms = T.max(new_W, axis=0)
178 | desired_norms = T.clip(col_norms, 0, self.max_col_norm) # clip = saturate below min and beyond max
179 | new_W = apply_format(self.format, new_W * (desired_norms / (1e-7 + col_norms)), self.update_precision, self.w_range)
180 | # for some reason, works better than
181 | # new_W = new_W * (desired_norms / col_norms)
182 | # It may be a kind of regularization
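# (Added note) this is the max-norm constraint commonly used with dropout: after each update,
# every column of W is rescaled so that its L2 norm stays at or below max_col_norm; the 1e-7
# term only guards against a division by zero for all-zero columns.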
183 |
184 | # return the updates of shared variables
185 | updates = []
186 | updates.append((self.W, new_W))
187 | updates.append((self.b, new_b))
188 | updates.append((self.update_W, new_update_W))
189 | updates.append((self.update_b, new_update_b))
190 |
191 | return updates
192 |
193 | def overflow_updates(self):
194 |
195 | updates = []
196 |
197 | # update overflow counters for the dynamic fixed point
198 | updates.append((self.z_overflow, self.z_overflow + overflow(self.fixed_z, self.comp_precision, self.z_range)))
199 | updates.append((self.dEdz_overflow, self.dEdz_overflow + overflow(self.fixed_dEdz, self.comp_precision, self.dEdz_range)))
200 | updates.append((self.y_overflow, self.y_overflow + overflow(self.fixed_y, self.comp_precision, self.y_range)))
201 | updates.append((self.dEdy_overflow, self.dEdy_overflow + overflow(self.fixed_dEdy, self.comp_precision, self.dEdy_range)))
202 | updates.append((self.w_overflow, self.w_overflow + overflow(self.W, self.update_precision, self.w_range)))
203 | updates.append((self.b_overflow, self.b_overflow + overflow(self.b, self.update_precision, self.b_range)))
204 | updates.append((self.dEdw_overflow, self.dEdw_overflow + overflow(self.fixed_dEdW, self.comp_precision, self.dEdw_range)))
205 | updates.append((self.dEdb_overflow, self.dEdb_overflow + overflow(self.fixed_dEdb, self.comp_precision, self.dEdb_range)))
206 | updates.append((self.update_w_overflow, self.update_w_overflow + overflow(self.update_W, self.comp_precision, self.update_w_range)))
207 | updates.append((self.update_b_overflow, self.update_b_overflow + overflow(self.update_b, self.comp_precision, self.update_b_range)))
208 |
209 | updates.append((self.z_overflow_1, self.z_overflow_1 + overflow(self.fixed_z, self.comp_precision, self.z_range-1)))
210 | updates.append((self.dEdz_overflow_1, self.dEdz_overflow_1 + overflow(self.fixed_dEdz, self.comp_precision, self.dEdz_range-1)))
211 | updates.append((self.y_overflow_1, self.y_overflow_1 + overflow(self.fixed_y, self.comp_precision, self.y_range-1)))
212 | updates.append((self.dEdy_overflow_1, self.dEdy_overflow_1 + overflow(self.fixed_dEdy, self.comp_precision, self.dEdy_range-1)))
213 | updates.append((self.w_overflow_1, self.w_overflow_1 + overflow(self.W, self.update_precision, self.w_range-1)))
214 | updates.append((self.b_overflow_1, self.b_overflow_1 + overflow(self.b, self.update_precision, self.b_range-1)))
215 | updates.append((self.dEdw_overflow_1, self.dEdw_overflow_1 + overflow(self.fixed_dEdW, self.comp_precision, self.dEdw_range-1)))
216 | updates.append((self.dEdb_overflow_1, self.dEdb_overflow_1 + overflow(self.fixed_dEdb, self.comp_precision, self.dEdb_range-1)))
217 | updates.append((self.update_w_overflow_1, self.update_w_overflow_1 + overflow(self.update_W, self.comp_precision, self.update_w_range-1)))
218 | updates.append((self.update_b_overflow_1, self.update_b_overflow_1 + overflow(self.update_b, self.comp_precision, self.update_b_range-1)))
219 |
220 | return updates
221 |
222 | def range_updates(self,batch_count):
223 |
224 | updates = []
225 |
226 | # update the ranges according to the overflow counters
227 | updates.append((self.z_range, self.z_range+new_range(self.z_overflow/batch_count,self.z_overflow_1/batch_count, self.max_overflow)))
228 | updates.append((self.dEdz_range, self.dEdz_range+new_range(self.dEdz_overflow/batch_count, self.dEdz_overflow_1/batch_count, self.max_overflow)))
229 | updates.append((self.y_range, self.y_range+new_range(self.y_overflow/batch_count, self.y_overflow_1/batch_count, self.max_overflow)))
230 | updates.append((self.dEdy_range, self.dEdy_range+new_range(self.dEdy_overflow/batch_count, self.dEdy_overflow_1/batch_count, self.max_overflow)))
231 | updates.append((self.w_range, self.w_range+new_range(self.w_overflow/batch_count, self.w_overflow_1/batch_count, self.max_overflow)))
232 | updates.append((self.b_range, self.b_range+new_range(self.b_overflow/batch_count, self.b_overflow_1/batch_count, self.max_overflow)))
233 | updates.append((self.dEdw_range, self.dEdw_range+new_range(self.dEdw_overflow/batch_count, self.dEdw_overflow_1/batch_count, self.max_overflow)))
234 | updates.append((self.dEdb_range, self.dEdb_range+new_range(self.dEdb_overflow/batch_count, self.dEdb_overflow_1/batch_count, self.max_overflow)))
235 | updates.append((self.update_w_range, self.update_w_range+new_range(self.update_w_overflow/batch_count, self.update_w_overflow_1/batch_count, self.max_overflow)))
236 | updates.append((self.update_b_range, self.update_b_range+new_range(self.update_b_overflow/batch_count, self.update_b_overflow_1/batch_count, self.max_overflow)))
237 |
238 | # reset the overflow counters
239 | updates.append((self.z_overflow, 0.))
240 | updates.append((self.dEdz_overflow, 0.))
241 | updates.append((self.y_overflow, 0.))
242 | updates.append((self.dEdy_overflow, 0.))
243 | updates.append((self.w_overflow, 0.))
244 | updates.append((self.b_overflow, 0.))
245 | updates.append((self.dEdw_overflow, 0.))
246 | updates.append((self.dEdb_overflow, 0.))
247 | updates.append((self.update_w_overflow, 0.))
248 | updates.append((self.update_b_overflow, 0.))
249 |
250 | updates.append((self.z_overflow_1, 0.))
251 | updates.append((self.dEdz_overflow_1, 0.))
252 | updates.append((self.y_overflow_1, 0.))
253 | updates.append((self.dEdy_overflow_1, 0.))
254 | updates.append((self.w_overflow_1, 0.))
255 | updates.append((self.b_overflow_1, 0.))
256 | updates.append((self.dEdw_overflow_1, 0.))
257 | updates.append((self.dEdb_overflow_1, 0.))
258 | updates.append((self.update_w_overflow_1, 0.))
259 | updates.append((self.update_b_overflow_1, 0.))
260 |
261 | return updates
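# (Added note) new_range() is defined in format.py (not shown here); judging from how it is
# called, it presumably returns +1 when the overflow rate at the current range exceeds
# max_overflow (grow the integer part), -1 when even the range-1 counters stay below
# max_overflow (shrink the range and gain a fractional bit), and 0 otherwise.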
262 |
263 | def print_range(self):
264 |
265 | print ' z NOIB = %i' %(self.z_range.get_value())
266 | print ' y NOIB = %i' %(self.y_range.get_value())
267 | print ' w NOIB = %i' %(self.w_range.get_value())
268 | print ' b NOIB = %i' %(self.b_range.get_value())
269 | print ' dEdz NOIB = %i' %(self.dEdz_range.get_value())
270 | print ' dEdy NOIB = %i' %(self.dEdy_range.get_value())
271 | print ' dEdw NOIB = %i' %(self.dEdw_range.get_value())
272 | print ' dEdb NOIB = %i' %(self.dEdb_range.get_value())
273 | print ' update w NOIB = %i' %(self.update_w_range.get_value())
274 | print ' update b NOIB = %i' %(self.update_b_range.get_value())
275 |
276 | class MaxoutLayer(dropout_layer):
277 |
278 | def __init__(self, rng, n_inputs, n_units, n_pieces, p, scale, max_col_norm, format,
279 | comp_precision, update_precision, initial_range, max_overflow):
280 |
281 | self.n_pieces=n_pieces
282 | self.n_inputs = n_inputs
283 | self.n_units = n_units
284 |
285 | print " n_pieces = " + str(n_pieces)
286 | print " n_inputs = " + str(n_inputs)
287 | print " n_units = " + str(n_units)
288 |
289 | # call the parent class constructor
290 | dropout_layer.__init__(self, rng, p, scale, max_col_norm, format,
291 | comp_precision, update_precision, initial_range, max_overflow)
292 |
293 | # initial values of parameters
294 | low=-np.sqrt(6. / (n_inputs + n_units*n_pieces))
295 | high=np.sqrt(6. / (n_inputs + n_units*n_pieces))
296 | W_values = np.asarray(self.rng.uniform(low=low,high=high,size=(n_inputs, n_units*n_pieces)),dtype=theano.config.floatX)
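# (Added note) these bounds are the Glorot & Bengio (2010) "Xavier" uniform initialization,
# sqrt(6 / (fan_in + fan_out)), with fan_out counting all n_units*n_pieces columns.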
297 | b_values = np.zeros((n_units*n_pieces), dtype=theano.config.floatX)
298 |
299 | # creation of shared symbolic variables
300 | # shared variables are the state of the built function
301 | # in practice, we put them in the GPU memory
302 | self.W = theano.shared(value=W_values, name='W')
303 | self.b = theano.shared(value=b_values, name='b')
304 |
305 | # momentum
306 | self.update_W = theano.shared(value=np.zeros((n_inputs, n_units*n_pieces), dtype=theano.config.floatX), name='update_W')
307 | self.update_b = theano.shared(value=b_values, name='update_b')
308 |
309 | # activation function
310 | def activation(self,z):
311 |
312 | y = T.reshape(z,(T.shape(z)[0], self.n_units, self.n_pieces))
313 |
314 | # maxout
315 | y = T.max(y,axis=2)
316 |
317 | y = T.reshape(y,(T.shape(z)[0],self.n_units))
318 |
319 | return y
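# (Added example) with n_units = 2 and n_pieces = 3, a row of z such as [a0, a1, a2, b0, b1, b2]
# is reshaped to [[a0, a1, a2], [b0, b1, b2]] and the layer outputs [max(a), max(b)]:
# each maxout unit is the maximum of its n_pieces linear pieces.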
320 |
321 | def activation_bprop(self):
322 |
323 | self.fixed_dEdz = apply_format(self.format,
324 | T.grad(cost = None, wrt=[self.fixed_z], known_grads={self.y:self.fixed_dEdy})[0],
325 | self.comp_precision, self.dEdz_range)
326 |
327 | class SoftmaxLayer(dropout_layer):
328 |
329 | def __init__(self, rng, n_inputs, n_units, p, scale, max_col_norm, format,
330 | comp_precision, update_precision, initial_range, max_overflow):
331 |
332 | self.n_inputs = n_inputs
333 | self.n_units = n_units
334 |
335 | print " n_inputs = " + str(n_inputs)
336 | print " n_units = " + str(n_units)
337 |
338 | # call the parent class constructor
339 | dropout_layer.__init__(self, rng, p, scale, max_col_norm, format,
340 | comp_precision, update_precision, initial_range, max_overflow)
341 |
342 | # initial values of parameters
343 | W_values = np.zeros((n_inputs, n_units), dtype=theano.config.floatX)
344 | b_values = np.zeros(n_units, dtype=theano.config.floatX)
345 |
346 | # creation of shared symbolic variables
347 | self.W = theano.shared(value=W_values, name='W')
348 | self.b = theano.shared(value=b_values, name='b')
349 |
350 | # momentum
351 | self.update_W = theano.shared(value=W_values, name='update_W')
352 | self.update_b = theano.shared(value=b_values, name='update_b')
353 |
354 | # activation function
355 | def activation(self,z):
356 |
357 | return T.nnet.softmax(z)
358 |
359 | def activation_bprop(self):
360 |
361 | self.fixed_dEdz = apply_format(self.format, self.fixed_dEdy,
362 | self.comp_precision, self.dEdz_range)
363 |
364 | class Maxout_conv_layer(dropout_layer):
365 |
366 | def __init__(self, rng, image_shape, zero_pad, output_shape, filter_shape, filter_stride, n_pieces, pool_shape, pool_stride, p, scale, max_col_norm, format,
367 | comp_precision, update_precision, initial_range, max_overflow, w_LR_scale=1., b_LR_scale=1., partial_sum = 1):
368 |
369 | # call the parent class constructor
370 | dropout_layer.__init__(self, rng, p, scale, max_col_norm, format, comp_precision, update_precision, initial_range, max_overflow, w_LR_scale, b_LR_scale)
371 |
372 | print ' output_shape = ' +str(output_shape)
373 | print ' image_shape = ' +str(image_shape)
374 |
375 | # add n zeros on both sides of the input
376 | # zero_pad = 0 <-> valid convolution, the output is smaller
377 | # zero_pad = filter_size - 1 <-> full convolution, the output is bigger
378 | # valid convolution makes more sense to me; I use it to reduce the size of the feature maps without max pooling.
379 | print ' zero_pad = ' +str(zero_pad)
380 |
381 | # number of output feature maps, number of inputs feature maps, x, y
382 | # number of inputs feature maps is important for the weights
383 | print ' filter_shape = ' +str(filter_shape)
384 | print ' filter_stride = ' +str(filter_stride)
385 | print ' n_pieces = ' +str(n_pieces)
386 | print ' pool_shape = ' +str(pool_shape)
387 | print ' pool_stride = ' +str(pool_stride)
388 | print ' partial_sum = ' +str(partial_sum)
389 |
390 | # save the parameters
391 | self.output_shape = output_shape
392 | self.image_shape = image_shape
393 | self.zero_pad = zero_pad
394 | self.filter_shape = (filter_shape[0]*n_pieces,filter_shape[1],filter_shape[2],filter_shape[3])
395 | self.filter_stride = filter_stride
396 | self.n_pieces = n_pieces
397 | self.pool_shape = pool_shape
398 | self.pool_stride = pool_stride
399 | self.partial_sum = partial_sum
400 |
401 | # range of init
402 | fan_in = np.prod(self.filter_shape[1:])
403 | fan_out = (self.filter_shape[0] * np.prod(self.filter_shape[2:]) / self.n_pieces / np.prod(self.pool_shape))
404 |
405 | # initialize the weights with random values
406 | W_bound = np.sqrt(6. / (fan_in + fan_out))
407 | self.W = theano.shared(
408 | np.asarray(rng.uniform(low=-W_bound, high=W_bound, size=self.filter_shape),
409 | dtype=theano.config.floatX))
410 |
411 | # the bias is a 1D tensor -- one bias per output feature map
412 | b_values = np.zeros((self.filter_shape[0],), dtype=theano.config.floatX)
413 | self.b = theano.shared(value=b_values)
414 |
415 | self.update_W = theano.shared(value=np.zeros(self.filter_shape, dtype=theano.config.floatX), name='update_W')
416 | self.update_b = theano.shared(value=np.zeros((self.filter_shape[0],), dtype=theano.config.floatX), name='update_b')
417 |
418 | # activation function
419 | def activation(self,conv_out):
420 |
421 | conv_out = T.reshape(conv_out,(T.shape(conv_out)[0], T.shape(conv_out)[1]//self.n_pieces, self.n_pieces,T.shape(conv_out)[2],T.shape(conv_out)[3] ))
422 | return T.max( conv_out,axis=2)
423 |
424 | def fprop(self, input):
425 |
426 | # we reduce the precision of parameters for the computations
427 | self.w_comp = apply_format(self.format, self.W, self.comp_precision, self.w_range)
428 | self.b_comp = apply_format(self.format, self.b, self.comp_precision, self.b_range)
429 |
430 | input = input.reshape(self.image_shape)
431 |
432 | # convolution
433 | input_shuffled = input.dimshuffle(1, 2, 3, 0) # bc01 to c01b
434 | filters_shuffled = self.w_comp.dimshuffle(1, 2, 3, 0) *self.scale # bc01 to c01b
435 | conv_op = FilterActs(stride=self.filter_stride, partial_sum=self.partial_sum,pad = self.zero_pad)
436 | contiguous_input = gpu_contiguous(input_shuffled)
437 | contiguous_filters = gpu_contiguous(filters_shuffled)
438 | conv_out_shuffled = conv_op(contiguous_input, contiguous_filters)
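# (Added note) FilterActs from pylearn2's cuda-convnet wrappers expects the "c01b" layout
# (channels, rows, columns, batch) and contiguous GPU memory, hence the dimshuffles from
# Theano's default "bc01" layout and the gpu_contiguous calls around the convolution.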
439 |
440 | # downsample each feature map individually, using maxpooling
441 | # pooled_out = downsample.max_pool_2d(input=conv_out,
442 | # ds=poolsize, ignore_border=True)
443 | pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride)
444 | pooled_out_shuffled = pool_op(conv_out_shuffled)
445 | pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2) # c01b to bc01
446 |
447 | # bias
448 | pooled_out = apply_format(self.format, pooled_out + self.b_comp.dimshuffle('x', 0, 'x', 'x')*self.scale, self.comp_precision, self.z_range)
449 |
450 | # activation
451 | pooled_out = self.activation(pooled_out)
452 | pooled_out = apply_format(self.format, pooled_out.flatten(2), self.comp_precision, self.y_range)
453 |
454 | return pooled_out
455 |
456 | def dropout_fprop(self, input):
457 |
458 | # we reduce the precision of parameters for the computations
459 | self.fixed_W = apply_format(self.format, self.W, self.comp_precision, self.w_range)
460 | self.fixed_b = apply_format(self.format, self.b, self.comp_precision, self.b_range)
461 |
462 | # create the dropout mask
463 | # The cast is important because
464 | # int * float32 = float64 which pulls things off the gpu
465 |
466 | srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))
467 | self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)), theano.config.floatX)
468 | input = input * self.mask
469 |
470 | self.fixed_x = input.reshape(self.image_shape)
471 |
472 | # convolution
473 | input_shuffled = self.fixed_x.dimshuffle(1, 2, 3, 0) # bc01 to c01b
474 | filters_shuffled = self.fixed_W.dimshuffle(1, 2, 3, 0) # bc01 to c01b
475 | conv_op = FilterActs(stride=self.filter_stride, partial_sum=self.partial_sum,pad = self.zero_pad) # a larger partial_sum uses less memory but is slower
476 | contiguous_input = gpu_contiguous(input_shuffled)
477 | contiguous_filters = gpu_contiguous(filters_shuffled)
478 | conv_out_shuffled = conv_op(contiguous_input, contiguous_filters)
479 |
480 | self.z = conv_out_shuffled.dimshuffle(3, 0, 1, 2) # c01b to bc01
481 | self.fixed_z = apply_format(self.format, self.z, self.comp_precision, self.z_range)
482 |
483 | conv_out_shuffled = self.fixed_z.dimshuffle(1, 2, 3, 0) # bc01 to c01b
484 | conv_out_shuffled = gpu_contiguous(conv_out_shuffled)
485 |
486 | # downsample each feature map individually, using maxpooling
487 | # pooled_out = downsample.max_pool_2d(input=conv_out,
488 | # ds=poolsize, ignore_border=True)
489 | pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride)
490 | pooled_out_shuffled = pool_op(conv_out_shuffled)
491 | pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2) # c01b to bc01
492 |
493 | # bias
494 | self.u = pooled_out + self.fixed_b.dimshuffle('x', 0, 'x', 'x')
495 | self.fixed_u = apply_format(self.format, self.u, self.comp_precision, self.z_range)
496 |
497 | # activation
498 | self.y = self.activation(self.fixed_u).flatten(2)
499 | self.fixed_y = apply_format(self.format, self.y, self.comp_precision, self.y_range)
500 |
501 | return self.fixed_y
502 |
503 | def bprop(self, dEdy):
504 |
505 | self.fixed_dEdy = apply_format(self.format, dEdy.reshape(self.output_shape), self.comp_precision, self.dEdy_range)
506 |
507 | fixed_dEdu = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_u], known_grads={self.y:self.fixed_dEdy})[0], self.comp_precision,self.dEdz_range)
508 |
509 | self.fixed_dEdb = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_b], known_grads={self.u:fixed_dEdu})[0], self.comp_precision,self.dEdb_range)
510 |
511 | self.fixed_dEdz = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_z], known_grads={self.u:fixed_dEdu})[0], self.comp_precision, self.dEdz_range)
512 |
513 | self.fixed_dEdW = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_W], known_grads={self.z:self.fixed_dEdz})[0], self.comp_precision,self.dEdw_range)
514 |
515 | dEdx = T.grad(cost = None, wrt=[self.fixed_x], known_grads={self.z:self.fixed_dEdz})[0]
516 |
517 | dEdx = T.reshape(self.mask,T.shape(dEdx)) * dEdx
518 |
519 | return dEdx
--------------------------------------------------------------------------------