├── .gitignore
├── LICENSE
├── README.md
├── analytical
    ├── main.py
    └── optimizers.py
├── cifar
    ├── main.py
    └── models
    │   ├── __init__.py
    │   ├── densenet.py
    │   ├── dla.py
    │   ├── dpn.py
    │   ├── googlenet.py
    │   ├── lenet.py
    │   ├── mobilenet.py
    │   ├── mobilenetv2.py
    │   ├── pnasnet.py
    │   ├── preact_resnet.py
    │   ├── resnet.py
    │   ├── resnext.py
    │   ├── senet.py
    │   ├── shufflenet.py
    │   └── vgg.py
├── figs
    └── Rosenbrock.png
├── fine-grained
    └── main.py
├── mini-imagenet
    ├── main.py
    └── models
    │   └── resnet_ws.py
└── myoptims
    ├── AdaBelief.py
    ├── Diffgrad.py
    ├── cosangulargrad.py
    └── tanangulargrad.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a Python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other info into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # AngularGrad Optimizer
 2 | 
 3 | This repository contains the official implementation for [AngularGrad: A New Optimization Technique for Angular Convergence of Convolutional Neural Networks](http://arxiv.org/abs/2105.10190) in PyTorch.
 4 | 
 5 | AngularGrad reduces the zig-zag effect in the optimization trajectory. Fluctuations are significantly smoothed, tracing a more direct path towards the minimum of the cost function.
 6 | 
 7 | You can import the optimizer as follows:
 8 | ```python
 9 | from myoptims.tanangulargrad import tanangulargrad
10 | from myoptims.cosangulargrad import cosangulargrad
11 | ...
12 | model = YourModel()
13 | optimizer = tanangulargrad(model.parameters())
14 | ...
15 | for input, output in data:
16 |   optimizer.zero_grad()
17 |   loss = loss_function(model(input), output)
18 |   loss.backward()
19 |   optimizer.step()
20 | ...
21 | ```
22 | 
23 | 
24 | If you have questions or suggestions, please feel free to open an issue. Please cite as:
25 | ```
26 | @article{roy2021angulargrad,
27 |   title={AngularGrad: A New Optimization Technique for Angular Convergence of Convolutional Neural Networks},
28 |   author={S.K. Roy and M.E. Paoletti and J.M. Haut and S.R. Dubey and P. Kar and A. Plaza and B.B. Chaudhuri},
29 |   journal={arXiv preprint arXiv:2105.10190},
30 |   year={2021}
31 | }
32 | ```
33 | <p align="center">
34 | <img src="figs/Rosenbrock.png" width="1000" align="center"> 
35 | </p>
36 | 
37 | 
38 | 
39 | ## Experiments
40 | 
41 | Experiments in the paper:
42 | 
43 | Analytical
44 | ```
45 | cd analytical/
46 | python main.py
47 | ```
48 | 
49 | CIFAR-10/100
50 | ```
51 | cd cifar/
52 | python main.py --dataset <cifar10/cifar100> --model <r18/r34/.../v16/d121> --alg <adam/sgd/.../cosangulargrad/tanangulargrad> --lr <float>
53 | Example:
54 | python main.py --dataset cifar10 --model r50 --alg cosangulargrad --lr 1e-3
55 | ```
56 | 
57 | Mini-ImageNet:
58 | ```
59 | cd mini-imagenet/
60 | wget URL dataset
61 | python main.py DATADIR --alg <adam/sgd/.../cosangulargrad/tanangulargrad> --lr <float>
62 | Example:
63 | python main.py ./split_mini/ --alg cosangulargrad --model r50 --lr 1e-3
64 | ```
65 | 
66 | Fine-Grained:
67 | ``` 
68 | cd fine-grained/
69 | wget URL datasets
70 | python main.py DATADIR --dataset <dsetname> --alg <adam/sgd/.../cosangulargrad/tanangulargrad> --lr <float>
71 | Example:
72 | python main.py ./data/Car196/ --dataset cars --alg adam --lr 1e-3
73 | ```
74 | 
75 | 


--------------------------------------------------------------------------------
/analytical/main.py:
--------------------------------------------------------------------------------
  1 | import matplotlib.pyplot as plt
  2 | import math as mt
  3 | import numpy as np
  4 | from optimizers import *
  5 | import os
  6 | 
  7 | def calc_func1(x):
  8 |     if x <= 0: val = (x + 0.3) ** 2
  9 |     else:      val = (x - 0.2) ** 2 + 0.05
 10 |     return val
 11 | 
 12 | 
 13 | def calc_grad1(x):
 14 |     if x <= 0: val = 2*x + 0.6
 15 |     else:      val = 2*x - 0.4
 16 |     return val
 17 | 
 18 | 
 19 | def calc_func2(x):
 20 |     if x <= -0.9: val = -40 * x - 35.15
 21 |     else:         val = (x * x * x) + x * mt.sin(8 * x) + 0.85
 22 |     return val
 23 | 
 24 | 
 25 | def calc_grad2(x):
 26 |     if x <= -0.9:
 27 |         return -40
 28 |     else:
 29 |         return 3 * x * x  + 8 * x * mt.cos(8 * x) + mt.sin(8 * x)
 30 | 
 31 | 
 32 | def calc_func3(x):
 33 |     if x <= -0.5:   val = x**2
 34 |     elif x <= -0.4: val = 0.75 + x
 35 |     elif x <= 0.0:  val = -7 * x / 8
 36 |     elif x <= 0.4:  val = 7 * x / 8
 37 |     elif x <= 0.5:  val = 0.75 - x
 38 |     else:           val = x**2
 39 |     return val
 40 | 
 41 | 
 42 | def calc_grad3(x):
 43 |     if x <= -0.5:   val = 2 * x
 44 |     elif x <= -0.4: val = 1.0
 45 |     elif x <= 0.0:  val = -7/8
 46 |     elif x <= 0.4:  val = 7/8
 47 |     elif x <= 0.5:  val = -1.0
 48 |     else:           val = 2 * x
 49 |     return val
 50 | 
 51 | 
 52 | 
 53 | def solve_func(xvals):
 54 |     return [calc_func(xval) for xval in xvals]
 55 | 
 56 | 
 57 | # optimize with the specified solver
 58 | def solve(x0, solver):
 59 |     x = np.zeros(nb_iters)
 60 |     x[0] = x0
 61 |     for idx_iter in range(1, nb_iters):
 62 |         g = calc_grad(x[idx_iter - 1])
 63 |         x[idx_iter] = solver.update(x[idx_iter - 1], g)
 64 |     return x
 65 | 
 66 | # optimize with the specified solver
 67 | def solve_reg(x0, solver):
 68 |     x = np.zeros(nb_iters)
 69 |     y = np.zeros(nb_iters)
 70 |     x[0] = x0
 71 |     for idx_iter in range(1, nb_iters):
 72 |         g = calc_grad(x[idx_iter - 1])
 73 |         x[idx_iter] = solver.update(x[idx_iter - 1], g)
 74 |         y[idx_iter] = calc_func(x[idx_iter])
 75 |     return x, y
 76 | 
 77 | 
 78 | 
 79 | 
 80 | 
 81 | 
 82 | 
 83 | 
 84 | nb_iters = 300
 85 | lrn_rate = 0.1
 86 | beta1 = 0.95
 87 | beta2 = 0.999
 88 | eps = 0.00000001
 89 | if not os.path.isdir('figures'):
 90 |     os.mkdir('figures')
 91 | 
 92 | for idfunc, calc_func in enumerate([calc_func1, calc_func2, calc_func3]):
 93 |     # Adam & diffGrad
 94 |     x = {}
 95 |     xvals = np.arange(-1,1,0.05)
 96 |     x['adam'] = solve_func(xvals)
 97 | 
 98 |     # visualization
 99 |     #plt.rcParams['figure.dpi']= 300
100 |     plt.rcParams['figure.figsize'] = [6.0, 4.0]
101 |     plt.plot(xvals, x['adam'], label='func')
102 |     #plt.legend()
103 |     plt.xlabel("x")
104 |     plt.ylabel("F"+str(idfunc+1)+"(x)")
105 |     plt.grid()
106 |     #plt.show()
107 |     plt.savefig('figures/function_'+str(idfunc)+'.png', dpi=600, format='png', bbox_inches='tight')
108 |     plt.clf()
109 | 
110 | 
# For every gradient function, run each optimizer from the same start point
# and plot the parameter value against the iteration number.
# NOTE: the loop variable has to be called `calc_grad`, because solve()
# looks that name up as a module-level global.
for idfunc, calc_grad in enumerate([calc_grad1, calc_grad2, calc_grad3]):
    x0 = -1.0  # common starting point for all optimizers
    # (result key, legend label, optimizer class, constructor arguments),
    # listed in the order the curves are computed and drawn.
    contenders = [
        ('sgdm', 'SGDM', SGDM, (lrn_rate, beta1, eps)),
        ('adam', 'Adam', Adam, (lrn_rate, beta1, beta2, eps)),
        ('diffGrad', 'diffGrad', diffGrad, (lrn_rate, beta1, beta2, eps)),
        ('AdaBelief', 'AdaBelief', AdaBelief, (lrn_rate, beta1, beta2, eps)),
        ('AngularGradCos', '$AngularGrad^{Cos}$', AngularGradCos, (lrn_rate, beta1, beta2, eps)),
        ('AngularGradTan', '$AngularGrad^{Tan}$', AngularGradTan, (lrn_rate, beta1, beta2, eps)),
    ]

    x = {}
    for key, _label, optimizer_cls, ctor_args in contenders:
        solver = optimizer_cls(*ctor_args)  # fresh optimizer state for every run
        x[key] = solve(x0, solver)

    # visualization
    plt.rcParams['figure.figsize'] = [6.0, 4.0]
    iterations = np.arange(nb_iters) + 1  # 1-based iteration axis
    for key, label, _optimizer_cls, _ctor_args in contenders:
        plt.plot(iterations, x[key], label=label)
    plt.xlabel("Iteration")
    plt.ylabel("Parameters Values")
    plt.legend(ncol=2)
    plt.grid()
    target_png = 'figures/deriv_'+str(idfunc)+'.png'
    plt.savefig(target_png, dpi=600, format='png', bbox_inches='tight')
    plt.clf()  # start the next function on an empty figure
156 | 
157 | 
# For every (gradient, function) pair, run each optimizer from the same start
# point and plot the function value against the iteration number.
# NOTE: the loop variables have to be called `calc_grad` and `calc_func`,
# because solve_reg() looks both names up as module-level globals.
for idfunc, (calc_grad,calc_func) in enumerate(zip([calc_grad1, calc_grad2, calc_grad3], [calc_func1, calc_func2, calc_func3])):
    x0 = -1.0  # common starting point for all optimizers
    # (result key, legend label, optimizer class, constructor arguments),
    # listed in the order the curves are computed and drawn.
    contenders = [
        ('sgdm', 'SGDM', SGDM, (lrn_rate, beta1, eps)),
        ('adam', 'Adam', Adam, (lrn_rate, beta1, beta2, eps)),
        ('diffGrad', 'diffGrad', diffGrad, (lrn_rate, beta1, beta2, eps)),
        ('AdaBelief', 'AdaBelief', AdaBelief, (lrn_rate, beta1, beta2, eps)),
        ('AngularGradCos', '$AngularGrad^{Cos}$', AngularGradCos, (lrn_rate, beta1, beta2, eps)),
        ('AngularGradTan', '$AngularGrad^{Tan}$', AngularGradTan, (lrn_rate, beta1, beta2, eps)),
    ]

    x = {}
    y = {}
    for key, _label, optimizer_cls, ctor_args in contenders:
        solver = optimizer_cls(*ctor_args)  # fresh optimizer state for every run
        x[key], y[key] = solve_reg(x0, solver)

    # visualization
    plt.rcParams['figure.figsize'] = [6.0, 4.0]
    iterations = np.arange(nb_iters) + 1  # 1-based iteration axis
    for key, label, _optimizer_cls, _ctor_args in contenders:
        plt.plot(iterations, y[key], label=label)
    plt.xlabel("Iteration")
    plt.ylabel("Regression Loss")
    plt.legend(ncol=2)
    plt.grid()
    target_png = 'figures/regression_'+str(idfunc)+'.png'
    plt.savefig(target_png, dpi=600, format='png', bbox_inches='tight')
    plt.clf()  # start the next function on an empty figure
202 | 
203 | 


--------------------------------------------------------------------------------
/analytical/optimizers.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import math as mt
  3 | 
  4 | class Adam(object):
  5 |     def __init__(self, lrn_rate, beta1, beta2, eps):
  6 |         self.lrn_rate = lrn_rate
  7 |         self.beta1 = beta1
  8 |         self.beta2 = beta2
  9 |         self.eps = eps
 10 |         self.idx = 0
 11 |         self.m = 0.0 # 1st order
 12 |         self.v = 0.0 # 2nd order
 13 | 
 14 |     def update(self, x, g):
 15 |         self.idx += 1
 16 |         self.m = self.beta1 * self.m + (1.0 - self.beta1) * g
 17 |         self.v = self.beta2 * self.v + (1.0 - self.beta2) * g ** 2
 18 |         m_adj = self.m / (1.0 - np.power(self.beta1, self.idx))
 19 |         v_adj = self.v / (1.0 - np.power(self.beta2, self.idx))
 20 |         x_new = x - self.lrn_rate * m_adj / np.sqrt(v_adj + self.eps)
 21 |         return x_new
 22 | 
 23 | 
 24 | class AdaBelief(object):
 25 |     def __init__(self, lrn_rate, beta1, beta2, eps):
 26 |         self.lrn_rate = lrn_rate
 27 |         self.beta1 = beta1
 28 |         self.beta2 = beta2
 29 |         self.eps = eps
 30 |         self.idx = 0
 31 |         self.m = 0.0 # 1st order
 32 |         self.v = 0.0 # 2nd order
 33 | 
 34 |     def update(self, x, g):
 35 |         self.idx += 1
 36 |         self.m = self.beta1 * self.m + (1.0 - self.beta1) * g
 37 |         self.v = self.beta2 * self.v + (1.0 - self.beta2) * (g - self.m) ** 2 + self.eps
 38 |         m_adj = self.m / (1.0 - np.power(self.beta1, self.idx))
 39 |         v_adj = self.v / (1.0 - np.power(self.beta2, self.idx))
 40 |         x_new = x - self.lrn_rate * m_adj / np.sqrt(v_adj + self.eps)
 41 |         return x_new
 42 | 
 43 | class diffGrad(object):
 44 |     def __init__(self, lrn_rate, beta1, beta2, eps):
 45 |         self.lrn_rate = lrn_rate
 46 |         self.beta1 = beta1
 47 |         self.beta2 = beta2
 48 |         self.eps = eps
 49 |         self.idx = 0
 50 |         self.m = 0.0 # 1st order
 51 |         self.v = 0.0 # 2nd order
 52 |         self.g_prev = 0.0
 53 | 
 54 |     def update(self, x, g):
 55 |         self.idx += 1
 56 |         self.m = self.beta1 * self.m + (1.0 - self.beta1) * g
 57 |         self.v = self.beta2 * self.v + (1.0 - self.beta2) * g ** 2
 58 |         m_adj = self.m / (1.0 - np.power(self.beta1, self.idx))
 59 |         v_adj = self.v / (1.0 - np.power(self.beta2, self.idx))
 60 |         dfc = 1.0 / (1.0 + np.exp(-np.abs(self.g_prev - g)))
 61 |         x_new = x - self.lrn_rate * m_adj * dfc / (np.sqrt(v_adj) + self.eps)
 62 |         self.g_prev = g
 63 |         return x_new
 64 | 
 65 | 
 66 | 
 67 | 
 68 | class AngularGradCos(object):
 69 |     def __init__(self, lrn_rate, beta1, beta2, eps):
 70 |         self.lrn_rate = lrn_rate
 71 |         self.beta1 = beta1
 72 |         self.beta2 = beta2
 73 |         self.eps = eps
 74 |         self.idx = 0
 75 |         self.m = 0.0 # 1st order
 76 |         self.v = 0.0 # 2nd order
 77 |         self.g_prev = 0.0
 78 |         self.min = 0.0
 79 | 
 80 |     def update(self, x, g):
 81 |         self.idx += 1
 82 |         self.m = self.beta1 * self.m + (1.0 - self.beta1) * g
 83 |         self.v = self.beta2 * self.v + (1.0 - self.beta2) * g ** 2
 84 |         m_adj = self.m / (1.0 - np.power(self.beta1, self.idx))
 85 |         v_adj = self.v / (1.0 - np.power(self.beta2, self.idx))
 86 | 
 87 |         tan_theta = abs((self.g_prev - g) / (1 + self.g_prev * g))
 88 |         cos_theta = 1 / np.sqrt(1 + tan_theta**2)
 89 |         angle = np.arctan(tan_theta) * (180 / 3.141592653589793238)
 90 |      
 91 |         if angle > self.min:
 92 |             self.min = angle
 93 |             diff = abs(self.g_prev - g)
 94 |             final_cos_theta = cos_theta
 95 |         else:
 96 |             self.min = angle
 97 |             diff = abs(self.g_prev - g)
 98 |             final_cos_theta = cos_theta
 99 | 
100 |         dfc = 1.0 / (1.0 + np.exp(final_cos_theta))
101 |         x_new = x - self.lrn_rate * m_adj * dfc / (np.sqrt(v_adj) + self.eps)
102 |         self.g_prev = g
103 |         return x_new
104 | 
105 | 
106 | 
107 | 
class AngularGradTan(object):
    """Scalar AngularGrad optimizer (tangent variant) for the analytical experiments.

    An Adam-style step scaled by ``1 / (1 + exp(tan(theta)))``, where
    ``tan(theta) = |(g_prev - g) / (1 + g_prev * g)|`` is computed from the
    previous and the current gradient.
    """

    def __init__(self, lrn_rate, beta1, beta2, eps):
        """Store the hyper-parameters and reset the optimizer state.

        Args:
            lrn_rate: step size.
            beta1: decay rate of the first-moment average.
            beta2: decay rate of the second-moment average.
            eps: small constant added to ``sqrt(v_hat)`` in the denominator.
        """
        self.lrn_rate = lrn_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.idx = 0  # number of updates performed so far
        self.m = 0.0 # 1st order
        self.v = 0.0 # 2nd order
        self.g_prev = 0.0  # gradient seen by the previous update
        self.min = 0.0  # angle (degrees) computed by the most recent update

    def update(self, x, g):
        """Return the parameter obtained from ``x`` after one step with gradient ``g``."""
        self.idx += 1
        self.m = self.beta1 * self.m + (1.0 - self.beta1) * g
        self.v = self.beta2 * self.v + (1.0 - self.beta2) * g ** 2
        # Bias correction for the zero-initialised moving averages.
        m_adj = self.m / (1.0 - np.power(self.beta1, self.idx))
        v_adj = self.v / (1.0 - np.power(self.beta2, self.idx))

        denom = 1 + self.g_prev * g
        if denom == 0:
            # g_prev * g == -1: tan(theta) is unbounded (theta = 90 degrees).
            # Plain Python floats would raise ZeroDivisionError on the
            # division, so use the limit value directly; the coefficient
            # below then evaluates to 0 and the step is skipped.
            tan_theta = float('inf')
        else:
            tan_theta = abs((self.g_prev - g) / denom)

        # Both branches of the former `angle > self.min` test were identical,
        # so the stored angle is simply the latest one. It is bookkeeping
        # only and does not influence the step.
        self.min = np.arctan(tan_theta) * (180 / np.pi)

        dfc = 1.0 / (1.0 + np.exp(tan_theta))
        x_new = x - self.lrn_rate * m_adj * dfc / (np.sqrt(v_adj) + self.eps)
        self.g_prev = g
        return x_new
144 | 
145 | 
class SGDM(object):
    """Scalar SGD with momentum used by the analytical experiments.

    The step follows a bias-corrected exponential moving average of the
    gradient. ``eps`` is stored but not used by ``update``.
    """

    def __init__(self, lrn_rate, beta1, eps):
        """Store the hyper-parameters and reset the optimizer state."""
        self.lrn_rate, self.beta1, self.eps = lrn_rate, beta1, eps
        self.idx = 0  # number of updates performed so far
        self.m = 0.0  # moving average of the gradient

    def update(self, x, g):
        """Return the parameter obtained from ``x`` after one step with gradient ``g``."""
        self.idx += 1
        momentum = self.beta1
        self.m = momentum * self.m + (1.0 - momentum) * g
        # Bias correction for the zero-initialised moving average.
        correction = 1.0 - np.power(momentum, self.idx)
        m_hat = self.m / correction
        return x - self.lrn_rate * m_hat
160 | 
161 | 
162 | 
163 | 
164 | #class AngularGradCos(object):
165 |     #def __init__(self, lrn_rate, beta1, beta2, eps):
166 |         #self.lrn_rate = lrn_rate
167 |         #self.beta1 = beta1
168 |         #self.beta2 = beta2
169 |         #self.eps = eps
170 |         #self.idx = 0
171 |         #self.m = 0.0 # 1st order
172 |         #self.v = 0.0 # 2nd order
173 |         #self.g_prev = 0.0
174 |         #self.min = 360.0
175 | 
176 |     #def update(self, x, g):
177 |         #self.idx += 1
178 |         #self.m = self.beta1 * self.m + (1.0 - self.beta1) * g
179 |         #self.v = self.beta2 * self.v + (1.0 - self.beta2) * g ** 2
180 |         #m_adj = self.m / (1.0 - np.power(self.beta1, self.idx))
181 |         #v_adj = self.v / (1.0 - np.power(self.beta2, self.idx))
182 | 
183 |         #tan_theta = abs((self.g_prev - g) / (1 + self.g_prev * g))
184 |         #cos_theta = 1 / np.sqrt(1 + tan_theta ** 2)
185 | 
186 |         #angle = np.arctan(tan_theta) * (180 / 3.141592653589793238)
187 |     
188 |         #if angle < self.min:
189 |             #self.min = angle
190 |             #final_cos_theta = cos_theta
191 |         #else:
192 |             #final_cos_theta = mt.cos(self.min)
193 | 
194 |         #dfc = np.tanh(abs(final_cos_theta)) * 0.5 +0.5
195 |         #x_new = x - self.lrn_rate * m_adj * dfc / (np.sqrt(v_adj) + self.eps)
196 |         #self.g_prev = g
197 |         #return x_new
198 | 
199 | 
200 | 
201 | 
202 | #class AngularGradTan(object):
203 |     #def __init__(self, lrn_rate, beta1, beta2, eps):
204 |         #self.lrn_rate = lrn_rate
205 |         #self.beta1 = beta1
206 |         #self.beta2 = beta2
207 |         #self.eps = eps
208 |         #self.idx = 0
209 |         #self.m = 0.0 # 1st order
210 |         #self.v = 0.0 # 2nd order
211 |         #self.g_prev = 0.0
212 |         #self.min = 361.0
213 | 
214 |     #def update(self, x, g):
215 |         #self.idx += 1
216 |         #self.m = self.beta1 * self.m + (1.0 - self.beta1) * g
217 |         #self.v = self.beta2 * self.v + (1.0 - self.beta2) * g ** 2
218 |         #m_adj = self.m / (1.0 - np.power(self.beta1, self.idx))
219 |         #v_adj = self.v / (1.0 - np.power(self.beta2, self.idx))
220 | 
221 |         #tan_theta = abs((self.g_prev - g) / (1 + self.g_prev * g))
222 |         #angle = np.arctan(tan_theta) * (180 / 3.141592653589793238)
223 |      
224 |         #if angle > self.min:
225 |             #self.min = angle
226 |             #final_tan_theta = tan_theta
227 |         #else:
228 |             #final_tan_theta = mt.tan(self.min)
229 | 
230 |         #dfc = np.tanh(abs(final_tan_theta)) * 0.5 + 0.5
231 |         #x_new = x - self.lrn_rate * m_adj * dfc / (np.sqrt(v_adj) + self.eps)
232 |         #self.g_prev = g
233 |         #return x_new
234 | 


--------------------------------------------------------------------------------
/cifar/main.py:
--------------------------------------------------------------------------------
  1 | '''Train CIFAR with PyTorch.'''
  2 | from __future__ import print_function
  3 | 
  4 | import torch
  5 | import torch.nn as nn
  6 | import torch.backends.cudnn as cudnn
  7 | 
  8 | 
  9 | import torch.optim as optim
 10 | import torch.nn.functional as F
 11 | 
 12 | import torchvision
 13 | import torchvision.transforms as transforms
 14 | 
 15 | 
 16 | from torch.optim import lr_scheduler
 17 | import os
 18 | import argparse
 19 | from torchvision import datasets, models
 20 | from models import *
 21 | 
 22 | 
 23 | import sys 
 24 | sys.path.append('../')
 25 |  
 26 | from myoptims.Diffgrad import diffgrad
 27 | from myoptims.tanangulargrad import tanangulargrad
 28 | from myoptims.cosangulargrad import cosangulargrad
 29 | from myoptims.AdaBelief import AdaBelief
 30 | 
 31 | import random
 32 | 
 33 | 
 34 | 
 35 | def get_loaders(dsetname, bsize):
 36 |     print('==> Preparing ' + dsetname + ' data...')
 37 |     if dsetname == 'cifar10':
 38 |         mean, std = (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
 39 |         torchdset = torchvision.datasets.CIFAR10
 40 |     elif dsetname == 'cifar100':
 41 |         mean, std = (0.507, 0.487, 0.441), (0.267, 0.256, 0.276)
 42 |         torchdset = torchvision.datasets.CIFAR100
 43 |     else:
 44 |         print('==> Dataset not avaiable...')
 45 |         exit()
 46 | 
 47 |     transform_train = transforms.Compose([
 48 |         transforms.RandomCrop(32, padding=4),
 49 |         transforms.RandomHorizontalFlip(),
 50 |         transforms.ToTensor(),
 51 |         transforms.Normalize(mean, std),
 52 |     ])
 53 |     transform_test = transforms.Compose([
 54 |         transforms.ToTensor(),
 55 |         transforms.Normalize(mean, std),
 56 |     ])
 57 | 
 58 |     trainset = torchdset(root='./data/'+dsetname+'/', train=True, download=True, transform=transform_train)
 59 |     trainloader = torch.utils.data.DataLoader(trainset, batch_size=bsize, shuffle=True, num_workers=4,drop_last=True)
 60 |     testset = torchdset(root='./data/'+dsetname+'/', train=False, download=True, transform=transform_test)
 61 |     testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=4)
 62 | 
 63 |     return trainloader, testloader
 64 | 
 65 | 
 66 | def get_model(modelname, Num_classes):
 67 |     if   modelname == 'v16':  net = VGG('VGG16',    Num_classes=Num_classes)
 68 |     elif modelname == 'r18':  net = ResNet18(       Num_classes=Num_classes)
 69 |     elif modelname == 'r34':  net = ResNet34(       Num_classes=Num_classes)
 70 |     elif modelname == 'r50':  net = ResNet50(       Num_classes=Num_classes)
 71 |     elif modelname == 'r101': net = ResNet101(      Num_classes=Num_classes)
 72 |     elif modelname == 'rx29': net = ResNeXt29_4x64d(Num_classes=Num_classes)
 73 |     elif modelname == 'dla':  net = DLA(            Num_classes=Num_classes)
 74 |     elif modelname == 'd121': net = DenseNet121(    Num_classes=Num_classes)
 75 |     else:
 76 |         print('==> Network not found...')
 77 |         exit()
 78 |     return net
 79 | 
 80 | 
 81 | def get_optim(optim_name, learning_rate, net):
 82 |     if   optim_name == 'sgd':            optimizer = optim.SGD(     net.parameters(), lr=learning_rate, momentum=0.9)
 83 |     elif optim_name == 'rmsprop':        optimizer = optim.RMSprop( net.parameters(), lr=learning_rate)
 84 |     elif optim_name == 'adam':           optimizer = optim.Adam(    net.parameters(), lr=learning_rate)
 85 |     elif optim_name == 'adamw':          optimizer = optim.AdamW(   net.parameters(), lr=learning_rate)
 86 |     elif optim_name == 'diffgrad':       optimizer = diffgrad(      net.parameters(), lr=learning_rate)
 87 |     elif optim_name == 'adabelief':      optimizer = AdaBelief(     net.parameters(), lr=learning_rate)
 88 |     elif optim_name == 'cosangulargrad': optimizer = cosangulargrad(net.parameters(), lr=learning_rate)
 89 |     elif optim_name == 'tanangulargrad': optimizer = tanangulargrad(net.parameters(), lr=learning_rate)
 90 |     else:
 91 |         print('==> Optimizer not found...')
 92 |         exit()
 93 |     return optimizer
 94 | 
 95 | 
 96 | def train(trainloader, epoch, net, optimizer, criterion, device='cuda'):
 97 |     print('\nEpoch: %d' % epoch)
 98 |     net.train()
 99 |     train_loss = 0
100 |     correct = 0
101 |     total = 0
102 |     for batch_idx, (inputs, targets) in enumerate(trainloader):
103 |         inputs, targets = inputs.to(device), targets.to(device)
104 |         optimizer.zero_grad()
105 |         outputs = net(inputs)
106 |         loss = criterion(outputs, targets)
107 |         loss.backward()
108 |         optimizer.step()
109 | 
110 |         train_loss += loss.item()
111 |         _, predicted = outputs.max(1)
112 |         total += targets.size(0)
113 |         correct += predicted.eq(targets).sum().item()
114 |     print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(train_loss/(batch_idx+1),correct/total))
115 |     acc=100.*correct/total
116 |     return acc, train_loss/(batch_idx+1)
117 | 
118 | 
119 | def test(testloader, epoch, net, criterion, device='cuda'):
120 |     net.eval()
121 |     test_loss = 0
122 |     correct = 0
123 |     total = 0
124 |     with torch.no_grad():
125 |       for batch_idx, (inputs, targets) in enumerate(testloader):
126 |             inputs, targets = inputs.to(device), targets.to(device)
127 |             outputs = net(inputs)
128 |             loss = criterion(outputs, targets)
129 | 
130 |             test_loss += loss.item()
131 |             _, predicted = outputs.max(1)
132 |             total += targets.size(0)
133 |             correct += predicted.eq(targets).sum().item()
134 |     print('Testing:  Loss: {:.4f} | Acc: {:.4f}'.format(test_loss/(batch_idx+1),correct/total) )
135 |     acc=100.*correct/total
136 |     return acc, test_loss/(batch_idx+1)
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
def main(args):
    """Train and evaluate a model according to the parsed CLI arguments.

    Seeds the RNGs, builds loaders / model / optimizer, optionally resumes
    from ``./checkpoint``, then alternates one training epoch and one test
    pass for ``args.epochs`` epochs. The model is checkpointed whenever the
    test accuracy improves on the best value seen so far.

    Args:
        args: namespace with the attributes ``manualSeed``, ``dataset``,
            ``bs``, ``model``, ``resume``, ``alg``, ``lr`` and ``epochs``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Random seed
    if args.manualSeed is None:
        args.manualSeed = random.randint(1, 10000)
    random.seed(args.manualSeed)
    torch.manual_seed(args.manualSeed)
    if device == 'cuda':
        torch.cuda.manual_seed_all(args.manualSeed)

    trainloader, testloader = get_loaders(args.dataset, args.bs)
    net = get_model(args.model, 10 if args.dataset == 'cifar10' else 100)

    if device == 'cuda':
        net = net.cuda()
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    # One checkpoint file per (dataset, model) pair.
    ckpt_dir = 'checkpoint'
    ckpt_path = './checkpoint/ckpt' + '_' + args.dataset + '_' + args.model + '.t7'

    if args.resume:
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(ckpt_dir), 'Error: no checkpoint directory found!'
        # map_location lets a checkpoint written on GPU be read on a
        # CPU-only host instead of failing while deserializing CUDA tensors.
        checkpoint = torch.load(ckpt_path, map_location=device)
        net.load_state_dict(checkpoint['net'])
        best_acc = checkpoint['acc']
        # NOTE(review): this restarts at the epoch stored in the checkpoint,
        # i.e. that epoch is trained again - confirm this is intended.
        start_epoch = checkpoint['epoch']
    else:
        best_acc = -1
        start_epoch = 0

    optimizer = get_optim(args.alg, args.lr, net)
    criterion = nn.CrossEntropyLoss()
    # Multiply the learning rate by 0.1 every 80 scheduler steps.
    scheduler_lr = lr_scheduler.StepLR(optimizer, step_size=80, gamma=0.1)

    for epoch in range(start_epoch, start_epoch+args.epochs):
        train_acc, train_loss = train(trainloader, epoch, net, optimizer, criterion, device=device)
        scheduler_lr.step()
        val_acc, val_loss = test(testloader, epoch, net, criterion, device=device)

        # Save checkpoint.
        if val_acc > best_acc:
            print('Saving..')
            state = {
                'net': net.state_dict(),
                'acc': val_acc,
                'epoch': epoch,
            }
            # exist_ok avoids the check-then-create race of isdir + mkdir.
            os.makedirs(ckpt_dir, exist_ok=True)
            torch.save(state, ckpt_path)
            best_acc = val_acc

    print('Best Acc: {:.2f}'.format(best_acc))
199 | 
200 | 
201 | 
202 | 
if __name__ == '__main__':
    # Command-line entry point: parse the training options and run main().
    parser = argparse.ArgumentParser(description='PyTorch CIFAR Training')
    parser.add_argument('--dataset', type=str, default='cifar10',
                        choices=['cifar10', 'cifar100'],
                        help='dataset (options: cifar10, cifar100)')
    parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint')
    parser.add_argument('--epochs', default=100, type=int, help='epochs')
    # The help texts for --model and --alg previously said "dataset".
    parser.add_argument('--model', type=str, default='r50',
                        choices=['v16', 'r18', 'r34', 'r50', 'r101', 'rx29', 'dla', 'd121'],
                        help='model (options: v16, r18, r34, r50, r101, rx29, dla, d121)')
    parser.add_argument('--bs', default=128, type=int, help='batchsize')
    parser.add_argument('--alg', type=str, default='adam',
                        choices=['sgd', 'rmsprop', 'adam', 'adamw', 'diffgrad', 'adabelief', 'cosangulargrad', 'tanangulargrad'],
                        help='optimizer (options: sgd, rmsprop, adam, adamw, diffgrad, adabelief, cosangulargrad, tanangulargrad)')
    parser.add_argument('--lr', default=1e-3, type=float, help='learning rate')
    parser.add_argument('--manualSeed', default=1111, type=int, help='random seed')

    args = parser.parse_args()

    main(args)
223 | 
224 | 


--------------------------------------------------------------------------------
/cifar/models/__init__.py:
--------------------------------------------------------------------------------
 1 | from .vgg import *
 2 | from .dpn import *
 3 | from .lenet import *
 4 | from .senet import *
 5 | from .pnasnet import *
 6 | from .densenet import *
 7 | from .googlenet import *
 8 | from .shufflenet import *
 9 | from .resnet import *
10 | from .resnext import *
11 | from .preact_resnet import *
12 | from .mobilenet import *
13 | from .mobilenetv2 import *
14 | from .dla import *
15 | 


--------------------------------------------------------------------------------
/cifar/models/densenet.py:
--------------------------------------------------------------------------------
  1 | '''DenseNet in PyTorch.'''
  2 | import math
  3 | 
  4 | import torch
  5 | import torch.nn as nn
  6 | import torch.nn.functional as F
  7 | 
  8 | 
  9 | class Bottleneck(nn.Module):
 10 |     def __init__(self, in_planes, growth_rate):
 11 |         super(Bottleneck, self).__init__()
 12 |         self.bn1 = nn.BatchNorm2d(in_planes)
 13 |         self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False)
 14 |         self.bn2 = nn.BatchNorm2d(4*growth_rate)
 15 |         self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
 16 | 
 17 |     def forward(self, x):
 18 |         out = self.conv1(F.relu(self.bn1(x)))
 19 |         out = self.conv2(F.relu(self.bn2(out)))
 20 |         out = torch.cat([out,x], 1)
 21 |         return out
 22 | 
 23 | 
 24 | class Transition(nn.Module):
 25 |     def __init__(self, in_planes, out_planes):
 26 |         super(Transition, self).__init__()
 27 |         self.bn = nn.BatchNorm2d(in_planes)
 28 |         self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)
 29 | 
 30 |     def forward(self, x):
 31 |         out = self.conv(F.relu(self.bn(x)))
 32 |         out = F.avg_pool2d(out, 2)
 33 |         return out
 34 | 
 35 | 
 36 | class DenseNet(nn.Module):
 37 |     def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
 38 |         super(DenseNet, self).__init__()
 39 |         self.growth_rate = growth_rate
 40 | 
 41 |         num_planes = 2*growth_rate
 42 |         self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)
 43 | 
 44 |         self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
 45 |         num_planes += nblocks[0]*growth_rate
 46 |         out_planes = int(math.floor(num_planes*reduction))
 47 |         self.trans1 = Transition(num_planes, out_planes)
 48 |         num_planes = out_planes
 49 | 
 50 |         self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
 51 |         num_planes += nblocks[1]*growth_rate
 52 |         out_planes = int(math.floor(num_planes*reduction))
 53 |         self.trans2 = Transition(num_planes, out_planes)
 54 |         num_planes = out_planes
 55 | 
 56 |         self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
 57 |         num_planes += nblocks[2]*growth_rate
 58 |         out_planes = int(math.floor(num_planes*reduction))
 59 |         self.trans3 = Transition(num_planes, out_planes)
 60 |         num_planes = out_planes
 61 | 
 62 |         self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])
 63 |         num_planes += nblocks[3]*growth_rate
 64 | 
 65 |         self.bn = nn.BatchNorm2d(num_planes)
 66 |         self.linear = nn.Linear(num_planes, num_classes)
 67 | 
 68 |     def _make_dense_layers(self, block, in_planes, nblock):
 69 |         layers = []
 70 |         for i in range(nblock):
 71 |             layers.append(block(in_planes, self.growth_rate))
 72 |             in_planes += self.growth_rate
 73 |         return nn.Sequential(*layers)
 74 | 
 75 |     def forward(self, x):
 76 |         out = self.conv1(x)
 77 |         out = self.trans1(self.dense1(out))
 78 |         out = self.trans2(self.dense2(out))
 79 |         out = self.trans3(self.dense3(out))
 80 |         out = self.dense4(out)
 81 |         out = F.avg_pool2d(F.relu(self.bn(out)), 4)
 82 |         out = out.view(out.size(0), -1)
 83 |         out = self.linear(out)
 84 |         return out
 85 | 
 86 | def DenseNet121(Num_classes=10):
 87 |     return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32, num_classes=Num_classes)
 88 | 
 89 | def DenseNet169():
 90 |     return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32)
 91 | 
 92 | def DenseNet201():
 93 |     return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32)
 94 | 
 95 | def DenseNet161():
 96 |     return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48)
 97 | 
 98 | def densenet_cifar():
 99 |     return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12)
100 | 
def test():
    '''Smoke test: run one random 3x32x32 input through densenet_cifar and print the output.'''
    model = densenet_cifar()
    sample = torch.randn(1,3,32,32)
    print(model(sample))
106 | 
107 | # test()
108 | 


--------------------------------------------------------------------------------
/cifar/models/dla.py:
--------------------------------------------------------------------------------
  1 | '''DLA in PyTorch.
  2 | 
  3 | Reference:
  4 |     Deep Layer Aggregation. https://arxiv.org/abs/1707.06484
  5 | '''
  6 | import torch
  7 | import torch.nn as nn
  8 | import torch.nn.functional as F
  9 | 
 10 | 
 11 | class BasicBlock(nn.Module):
 12 |     expansion = 1
 13 | 
 14 |     def __init__(self, in_planes, planes, stride=1):
 15 |         super(BasicBlock, self).__init__()
 16 |         self.conv1 = nn.Conv2d(
 17 |             in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 18 |         self.bn1 = nn.BatchNorm2d(planes)
 19 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
 20 |                                stride=1, padding=1, bias=False)
 21 |         self.bn2 = nn.BatchNorm2d(planes)
 22 | 
 23 |         self.shortcut = nn.Sequential()
 24 |         if stride != 1 or in_planes != self.expansion*planes:
 25 |             self.shortcut = nn.Sequential(
 26 |                 nn.Conv2d(in_planes, self.expansion*planes,
 27 |                           kernel_size=1, stride=stride, bias=False),
 28 |                 nn.BatchNorm2d(self.expansion*planes)
 29 |             )
 30 | 
 31 |     def forward(self, x):
 32 |         out = F.relu(self.bn1(self.conv1(x)))
 33 |         out = self.bn2(self.conv2(out))
 34 |         out += self.shortcut(x)
 35 |         out = F.relu(out)
 36 |         return out
 37 | 
 38 | 
 39 | class Root(nn.Module):
 40 |     def __init__(self, in_channels, out_channels, kernel_size=1):
 41 |         super(Root, self).__init__()
 42 |         self.conv = nn.Conv2d(
 43 |             in_channels, out_channels, kernel_size,
 44 |             stride=1, padding=(kernel_size - 1) // 2, bias=False)
 45 |         self.bn = nn.BatchNorm2d(out_channels)
 46 | 
 47 |     def forward(self, xs):
 48 |         x = torch.cat(xs, 1)
 49 |         out = F.relu(self.bn(self.conv(x)))
 50 |         return out
 51 | 
 52 | 
 53 | class Tree(nn.Module):
 54 |     def __init__(self, block, in_channels, out_channels, level=1, stride=1):
 55 |         super(Tree, self).__init__()
 56 |         self.level = level
 57 |         if level == 1:
 58 |             self.root = Root(2*out_channels, out_channels)
 59 |             self.left_node = block(in_channels, out_channels, stride=stride)
 60 |             self.right_node = block(out_channels, out_channels, stride=1)
 61 |         else:
 62 |             self.root = Root((level+2)*out_channels, out_channels)
 63 |             for i in reversed(range(1, level)):
 64 |                 subtree = Tree(block, in_channels, out_channels,
 65 |                                level=i, stride=stride)
 66 |                 self.__setattr__('level_%d' % i, subtree)
 67 |             self.prev_root = block(in_channels, out_channels, stride=stride)
 68 |             self.left_node = block(out_channels, out_channels, stride=1)
 69 |             self.right_node = block(out_channels, out_channels, stride=1)
 70 | 
 71 |     def forward(self, x):
 72 |         xs = [self.prev_root(x)] if self.level > 1 else []
 73 |         for i in reversed(range(1, self.level)):
 74 |             level_i = self.__getattr__('level_%d' % i)
 75 |             x = level_i(x)
 76 |             xs.append(x)
 77 |         x = self.left_node(x)
 78 |         xs.append(x)
 79 |         x = self.right_node(x)
 80 |         xs.append(x)
 81 |         out = self.root(xs)
 82 |         return out
 83 | 
 84 | 
 85 | class DLA(nn.Module):
 86 |     def __init__(self, block=BasicBlock, Num_classes=10):
 87 |         super(DLA, self).__init__()
 88 |         self.base = nn.Sequential(
 89 |             nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False),
 90 |             nn.BatchNorm2d(16),
 91 |             nn.ReLU(True)
 92 |         )
 93 | 
 94 |         self.layer1 = nn.Sequential(
 95 |             nn.Conv2d(16, 16, kernel_size=3, stride=1, padding=1, bias=False),
 96 |             nn.BatchNorm2d(16),
 97 |             nn.ReLU(True)
 98 |         )
 99 | 
100 |         self.layer2 = nn.Sequential(
101 |             nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1, bias=False),
102 |             nn.BatchNorm2d(32),
103 |             nn.ReLU(True)
104 |         )
105 | 
106 |         self.layer3 = Tree(block,  32,  64, level=1, stride=1)
107 |         self.layer4 = Tree(block,  64, 128, level=2, stride=2)
108 |         self.layer5 = Tree(block, 128, 256, level=2, stride=2)
109 |         self.layer6 = Tree(block, 256, 512, level=1, stride=2)
110 |         self.linear = nn.Linear(512, Num_classes)
111 | 
112 |     def forward(self, x):
113 |         out = self.base(x)
114 |         out = self.layer1(out)
115 |         out = self.layer2(out)
116 |         out = self.layer3(out)
117 |         out = self.layer4(out)
118 |         out = self.layer5(out)
119 |         out = self.layer6(out)
120 |         out = F.avg_pool2d(out, 4)
121 |         out = out.view(out.size(0), -1)
122 |         out = self.linear(out)
123 |         return out
124 | 
125 | 
def test():
    '''Smoke test: print the DLA model and its output size for one random 3x32x32 input.'''
    model = DLA()
    print(model)
    sample = torch.randn(1, 3, 32, 32)
    logits = model(sample)
    print(logits.size())


# Run the smoke test when this file is executed directly.
if __name__ == '__main__':
    test()
136 | 


--------------------------------------------------------------------------------
/cifar/models/dpn.py:
--------------------------------------------------------------------------------
 1 | '''Dual Path Networks in PyTorch.'''
 2 | import torch
 3 | import torch.nn as nn
 4 | import torch.nn.functional as F
 5 | 
 6 | 
 7 | class Bottleneck(nn.Module):
 8 |     def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):
 9 |         super(Bottleneck, self).__init__()
10 |         self.out_planes = out_planes
11 |         self.dense_depth = dense_depth
12 | 
13 |         self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)
14 |         self.bn1 = nn.BatchNorm2d(in_planes)
15 |         self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)
16 |         self.bn2 = nn.BatchNorm2d(in_planes)
17 |         self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False)
18 |         self.bn3 = nn.BatchNorm2d(out_planes+dense_depth)
19 | 
20 |         self.shortcut = nn.Sequential()
21 |         if first_layer:
22 |             self.shortcut = nn.Sequential(
23 |                 nn.Conv2d(last_planes, out_planes+dense_depth, kernel_size=1, stride=stride, bias=False),
24 |                 nn.BatchNorm2d(out_planes+dense_depth)
25 |             )
26 | 
27 |     def forward(self, x):
28 |         out = F.relu(self.bn1(self.conv1(x)))
29 |         out = F.relu(self.bn2(self.conv2(out)))
30 |         out = self.bn3(self.conv3(out))
31 |         x = self.shortcut(x)
32 |         d = self.out_planes
33 |         out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1)
34 |         out = F.relu(out)
35 |         return out
36 | 
37 | 
class DPN(nn.Module):
    '''Dual Path Network built from a configuration dictionary.

    Args:
        cfg: dict with the 4-element sequences ``in_planes``, ``out_planes``,
            ``num_blocks`` and ``dense_depth`` (one entry per stage).
        num_classes: size of the classifier output. Defaults to 10, the
            value that used to be hard-coded, so existing callers are
            unaffected.
    '''

    def __init__(self, cfg, num_classes=10):
        super(DPN, self).__init__()
        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        # Channel count produced by the most recently built block.
        self.last_planes = 64
        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)
        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
        self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], num_classes)

    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
        '''Build one stage; only its first block uses ``stride`` and a projection shortcut.'''
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for i, block_stride in enumerate(strides):
            layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, block_stride, i==0))
            # Each block appends dense_depth channels to the dense path.
            self.last_planes = out_planes + (i+2) * dense_depth
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out
71 | 
72 | 
def DPN26():
    '''Build a DPN with two blocks in each of its four stages.'''
    cfg = dict(
        in_planes=(96,192,384,768),
        out_planes=(256,512,1024,2048),
        num_blocks=(2,2,2,2),
        dense_depth=(16,32,24,128),
    )
    return DPN(cfg)
81 | 
def DPN92():
    '''Build a DPN with (3, 4, 20, 3) blocks in its four stages.'''
    cfg = dict(
        in_planes=(96,192,384,768),
        out_planes=(256,512,1024,2048),
        num_blocks=(3,4,20,3),
        dense_depth=(16,32,24,128),
    )
    return DPN(cfg)
90 | 
91 | 
def test():
    '''Smoke test: run one random 3x32x32 input through DPN92 and print the output.'''
    model = DPN92()
    sample = torch.randn(1,3,32,32)
    print(model(sample))
97 | 
98 | # test()
99 | 


--------------------------------------------------------------------------------
/cifar/models/googlenet.py:
--------------------------------------------------------------------------------
  1 | '''GoogLeNet with PyTorch.'''
  2 | import torch
  3 | import torch.nn as nn
  4 | import torch.nn.functional as F
  5 | 
  6 | 
  7 | class Inception(nn.Module):
  8 |     def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
  9 |         super(Inception, self).__init__()
 10 |         # 1x1 conv branch
 11 |         self.b1 = nn.Sequential(
 12 |             nn.Conv2d(in_planes, n1x1, kernel_size=1),
 13 |             nn.BatchNorm2d(n1x1),
 14 |             nn.ReLU(True),
 15 |         )
 16 | 
 17 |         # 1x1 conv -> 3x3 conv branch
 18 |         self.b2 = nn.Sequential(
 19 |             nn.Conv2d(in_planes, n3x3red, kernel_size=1),
 20 |             nn.BatchNorm2d(n3x3red),
 21 |             nn.ReLU(True),
 22 |             nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1),
 23 |             nn.BatchNorm2d(n3x3),
 24 |             nn.ReLU(True),
 25 |         )
 26 | 
 27 |         # 1x1 conv -> 5x5 conv branch
 28 |         self.b3 = nn.Sequential(
 29 |             nn.Conv2d(in_planes, n5x5red, kernel_size=1),
 30 |             nn.BatchNorm2d(n5x5red),
 31 |             nn.ReLU(True),
 32 |             nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1),
 33 |             nn.BatchNorm2d(n5x5),
 34 |             nn.ReLU(True),
 35 |             nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1),
 36 |             nn.BatchNorm2d(n5x5),
 37 |             nn.ReLU(True),
 38 |         )
 39 | 
 40 |         # 3x3 pool -> 1x1 conv branch
 41 |         self.b4 = nn.Sequential(
 42 |             nn.MaxPool2d(3, stride=1, padding=1),
 43 |             nn.Conv2d(in_planes, pool_planes, kernel_size=1),
 44 |             nn.BatchNorm2d(pool_planes),
 45 |             nn.ReLU(True),
 46 |         )
 47 | 
 48 |     def forward(self, x):
 49 |         y1 = self.b1(x)
 50 |         y2 = self.b2(x)
 51 |         y3 = self.b3(x)
 52 |         y4 = self.b4(x)
 53 |         return torch.cat([y1,y2,y3,y4], 1)
 54 | 
 55 | 
 56 | class GoogLeNet(nn.Module):
 57 |     def __init__(self):
 58 |         super(GoogLeNet, self).__init__()
 59 |         self.pre_layers = nn.Sequential(
 60 |             nn.Conv2d(3, 192, kernel_size=3, padding=1),
 61 |             nn.BatchNorm2d(192),
 62 |             nn.ReLU(True),
 63 |         )
 64 | 
 65 |         self.a3 = Inception(192,  64,  96, 128, 16, 32, 32)
 66 |         self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)
 67 | 
 68 |         self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
 69 | 
 70 |         self.a4 = Inception(480, 192,  96, 208, 16,  48,  64)
 71 |         self.b4 = Inception(512, 160, 112, 224, 24,  64,  64)
 72 |         self.c4 = Inception(512, 128, 128, 256, 24,  64,  64)
 73 |         self.d4 = Inception(512, 112, 144, 288, 32,  64,  64)
 74 |         self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)
 75 | 
 76 |         self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
 77 |         self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)
 78 | 
 79 |         self.avgpool = nn.AvgPool2d(8, stride=1)
 80 |         self.linear = nn.Linear(1024, 10)
 81 | 
 82 |     def forward(self, x):
 83 |         out = self.pre_layers(x)
 84 |         out = self.a3(out)
 85 |         out = self.b3(out)
 86 |         out = self.maxpool(out)
 87 |         out = self.a4(out)
 88 |         out = self.b4(out)
 89 |         out = self.c4(out)
 90 |         out = self.d4(out)
 91 |         out = self.e4(out)
 92 |         out = self.maxpool(out)
 93 |         out = self.a5(out)
 94 |         out = self.b5(out)
 95 |         out = self.avgpool(out)
 96 |         out = out.view(out.size(0), -1)
 97 |         out = self.linear(out)
 98 |         return out
 99 | 
100 | 
def test():
    '''Smoke test: print the GoogLeNet output size for one random 3x32x32 input.'''
    model = GoogLeNet()
    sample = torch.randn(1,3,32,32)
    logits = model(sample)
    print(logits.size())
106 | 
107 | # test()
108 | 


--------------------------------------------------------------------------------
/cifar/models/lenet.py:
--------------------------------------------------------------------------------
 1 | '''LeNet in PyTorch.'''
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | class LeNet(nn.Module):
 6 |     def __init__(self):
 7 |         super(LeNet, self).__init__()
 8 |         self.conv1 = nn.Conv2d(3, 6, 5)
 9 |         self.conv2 = nn.Conv2d(6, 16, 5)
10 |         self.fc1   = nn.Linear(16*5*5, 120)
11 |         self.fc2   = nn.Linear(120, 84)
12 |         self.fc3   = nn.Linear(84, 10)
13 | 
14 |     def forward(self, x):
15 |         out = F.relu(self.conv1(x))
16 |         out = F.max_pool2d(out, 2)
17 |         out = F.relu(self.conv2(out))
18 |         out = F.max_pool2d(out, 2)
19 |         out = out.view(out.size(0), -1)
20 |         out = F.relu(self.fc1(out))
21 |         out = F.relu(self.fc2(out))
22 |         out = self.fc3(out)
23 |         return out
24 | 


--------------------------------------------------------------------------------
/cifar/models/mobilenet.py:
--------------------------------------------------------------------------------
 1 | '''MobileNet in PyTorch.
 2 | 
 3 | See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications"
 4 | for more details.
 5 | '''
 6 | import torch
 7 | import torch.nn as nn
 8 | import torch.nn.functional as F
 9 | 
10 | 
class Block(nn.Module):
    '''Depthwise 3x3 conv followed by pointwise 1x1 conv, each with BN and ReLU.'''

    def __init__(self, in_planes, out_planes, stride=1):
        super().__init__()
        # groups=in_planes makes the 3x3 convolution depthwise.
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3,
                               stride=stride, padding=1, groups=in_planes,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1,
                               stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        depthwise = F.relu(self.bn1(self.conv1(x)))
        pointwise = F.relu(self.bn2(self.conv2(depthwise)))
        return pointwise
24 | 
25 | 
class MobileNet(nn.Module):
    '''MobileNet classifier: a conv stem, a stack of depthwise-separable Blocks, and a linear head.'''

    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.linear = nn.Linear(1024, num_classes)

    def _make_layers(self, in_planes):
        '''Turn ``cfg`` into a Sequential of Blocks, starting from ``in_planes`` channels.'''
        blocks = []
        width_in = in_planes
        for entry in self.cfg:
            if isinstance(entry, int):
                width_out, stride = entry, 1
            else:
                width_out, stride = entry[0], entry[1]
            blocks.append(Block(width_in, width_out, stride))
            width_in = width_out
        return nn.Sequential(*blocks)

    def forward(self, x):
        feats = F.relu(self.bn1(self.conv1(x)))
        feats = self.layers(feats)
        pooled = F.avg_pool2d(feats, 2)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)
53 | 
54 | 
def test():
    '''Smoke test: print the MobileNet output size for one random 3x32x32 input.'''
    model = MobileNet()
    sample = torch.randn(1,3,32,32)
    logits = model(sample)
    print(logits.size())
60 | 
61 | # test()
62 | 


--------------------------------------------------------------------------------
/cifar/models/mobilenetv2.py:
--------------------------------------------------------------------------------
 1 | '''MobileNetV2 in PyTorch.
 2 | 
 3 | See the paper "Inverted Residuals and Linear Bottlenecks:
 4 | Mobile Networks for Classification, Detection and Segmentation" for more details.
 5 | '''
 6 | import torch
 7 | import torch.nn as nn
 8 | import torch.nn.functional as F
 9 | 
10 | 
class Block(nn.Module):
    '''Inverted residual block: 1x1 expand, 3x3 depthwise, 1x1 linear projection.

    A shortcut is added only when ``stride`` is 1; it is the identity when
    the channel count is unchanged and a 1x1 conv + BatchNorm otherwise.
    '''

    def __init__(self, in_planes, out_planes, expansion, stride):
        super().__init__()
        self.stride = stride

        hidden = expansion * in_planes
        self.conv1 = nn.Conv2d(in_planes, hidden, kernel_size=1, stride=1,
                               padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(hidden)
        # groups=hidden makes the 3x3 convolution depthwise.
        self.conv2 = nn.Conv2d(hidden, hidden, kernel_size=3, stride=stride,
                               padding=1, groups=hidden, bias=False)
        self.bn2 = nn.BatchNorm2d(hidden)
        self.conv3 = nn.Conv2d(hidden, out_planes, kernel_size=1, stride=1,
                               padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        if stride == 1 and in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1,
                          padding=0, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = F.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))
        if self.stride == 1:
            y = y + self.shortcut(x)
        return y
38 | 
39 | 
class MobileNetV2(nn.Module):
    '''MobileNetV2 classifier: conv stem, inverted residual Blocks, 1x1 conv and linear head.'''

    # (expansion, out_planes, num_blocks, stride)
    cfg = [(1,  16, 1, 1),
           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10
           (6,  32, 3, 2),
           (6,  64, 4, 2),
           (6,  96, 3, 1),
           (6, 160, 3, 2),
           (6, 320, 1, 1)]

    def __init__(self, num_classes=10):
        super().__init__()
        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(1280)
        self.linear = nn.Linear(1280, num_classes)

    def _make_layers(self, in_planes):
        '''Expand ``cfg`` into a Sequential; only the first Block of a group uses its stride.'''
        blocks = []
        width_in = in_planes
        for expansion, width_out, repeats, first_stride in self.cfg:
            for index in range(repeats):
                block_stride = first_stride if index == 0 else 1
                blocks.append(Block(width_in, width_out, expansion, block_stride))
                width_in = width_out
        return nn.Sequential(*blocks)

    def forward(self, x):
        feats = F.relu(self.bn1(self.conv1(x)))
        feats = self.layers(feats)
        feats = F.relu(self.bn2(self.conv2(feats)))
        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
        pooled = F.avg_pool2d(feats, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)
78 | 
79 | 
def test():
    '''Smoke test: run MobileNetV2 on a random 2x3x32x32 batch and print the output size.'''
    model = MobileNetV2()
    batch = torch.randn(2, 3, 32, 32)
    logits = model(batch)
    print(logits.size())
85 | 
86 | # test()
87 | 


--------------------------------------------------------------------------------
/cifar/models/pnasnet.py:
--------------------------------------------------------------------------------
  1 | '''PNASNet in PyTorch.
  2 | 
  3 | Paper: Progressive Neural Architecture Search
  4 | '''
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | 
  9 | 
 10 | class SepConv(nn.Module):
 11 |     '''Separable Convolution.'''
 12 |     def __init__(self, in_planes, out_planes, kernel_size, stride):
 13 |         super(SepConv, self).__init__()
 14 |         self.conv1 = nn.Conv2d(in_planes, out_planes,
 15 |                                kernel_size, stride,
 16 |                                padding=(kernel_size-1)//2,
 17 |                                bias=False, groups=in_planes)
 18 |         self.bn1 = nn.BatchNorm2d(out_planes)
 19 | 
 20 |     def forward(self, x):
 21 |         return self.bn1(self.conv1(x))
 22 | 
 23 | 
 24 | class CellA(nn.Module):
 25 |     def __init__(self, in_planes, out_planes, stride=1):
 26 |         super(CellA, self).__init__()
 27 |         self.stride = stride
 28 |         self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
 29 |         if stride==2:
 30 |             self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
 31 |             self.bn1 = nn.BatchNorm2d(out_planes)
 32 | 
 33 |     def forward(self, x):
 34 |         y1 = self.sep_conv1(x)
 35 |         y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
 36 |         if self.stride==2:
 37 |             y2 = self.bn1(self.conv1(y2))
 38 |         return F.relu(y1+y2)
 39 | 
 40 | class CellB(nn.Module):
 41 |     def __init__(self, in_planes, out_planes, stride=1):
 42 |         super(CellB, self).__init__()
 43 |         self.stride = stride
 44 |         # Left branch
 45 |         self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
 46 |         self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)
 47 |         # Right branch
 48 |         self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)
 49 |         if stride==2:
 50 |             self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
 51 |             self.bn1 = nn.BatchNorm2d(out_planes)
 52 |         # Reduce channels
 53 |         self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
 54 |         self.bn2 = nn.BatchNorm2d(out_planes)
 55 | 
 56 |     def forward(self, x):
 57 |         # Left branch
 58 |         y1 = self.sep_conv1(x)
 59 |         y2 = self.sep_conv2(x)
 60 |         # Right branch
 61 |         y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
 62 |         if self.stride==2:
 63 |             y3 = self.bn1(self.conv1(y3))
 64 |         y4 = self.sep_conv3(x)
 65 |         # Concat & reduce channels
 66 |         b1 = F.relu(y1+y2)
 67 |         b2 = F.relu(y3+y4)
 68 |         y = torch.cat([b1,b2], 1)
 69 |         return F.relu(self.bn2(self.conv2(y)))
 70 | 
 71 | class PNASNet(nn.Module):
 72 |     def __init__(self, cell_type, num_cells, num_planes):
 73 |         super(PNASNet, self).__init__()
 74 |         self.in_planes = num_planes
 75 |         self.cell_type = cell_type
 76 | 
 77 |         self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)
 78 |         self.bn1 = nn.BatchNorm2d(num_planes)
 79 | 
 80 |         self.layer1 = self._make_layer(num_planes, num_cells=6)
 81 |         self.layer2 = self._downsample(num_planes*2)
 82 |         self.layer3 = self._make_layer(num_planes*2, num_cells=6)
 83 |         self.layer4 = self._downsample(num_planes*4)
 84 |         self.layer5 = self._make_layer(num_planes*4, num_cells=6)
 85 | 
 86 |         self.linear = nn.Linear(num_planes*4, 10)
 87 | 
 88 |     def _make_layer(self, planes, num_cells):
 89 |         layers = []
 90 |         for _ in range(num_cells):
 91 |             layers.append(self.cell_type(self.in_planes, planes, stride=1))
 92 |             self.in_planes = planes
 93 |         return nn.Sequential(*layers)
 94 | 
 95 |     def _downsample(self, planes):
 96 |         layer = self.cell_type(self.in_planes, planes, stride=2)
 97 |         self.in_planes = planes
 98 |         return layer
 99 | 
100 |     def forward(self, x):
101 |         out = F.relu(self.bn1(self.conv1(x)))
102 |         out = self.layer1(out)
103 |         out = self.layer2(out)
104 |         out = self.layer3(out)
105 |         out = self.layer4(out)
106 |         out = self.layer5(out)
107 |         out = F.avg_pool2d(out, 8)
108 |         out = self.linear(out.view(out.size(0), -1))
109 |         return out
110 | 
111 | 
def PNASNetA():
    '''Build a PNASNet from CellA with 6 cells per stage and 44 base planes.'''
    model = PNASNet(CellA, num_cells=6, num_planes=44)
    return model
114 | 
def PNASNetB():
    '''Build a PNASNet from CellB with 6 cells per stage and 32 base planes.'''
    model = PNASNet(CellB, num_cells=6, num_planes=32)
    return model
117 | 
118 | 
def test():
    '''Smoke test: run PNASNetB on one random 3x32x32 image and print the output tensor.'''
    model = PNASNetB()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample))
124 | 
125 | # test()
126 | 


--------------------------------------------------------------------------------
/cifar/models/preact_resnet.py:
--------------------------------------------------------------------------------
  1 | '''Pre-activation ResNet in PyTorch.
  2 | 
  3 | Reference:
  4 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
  5 |     Identity Mappings in Deep Residual Networks. arXiv:1603.05027
  6 | '''
  7 | import torch
  8 | import torch.nn as nn
  9 | import torch.nn.functional as F
 10 | 
 11 | 
 12 | class PreActBlock(nn.Module):
 13 |     '''Pre-activation version of the BasicBlock.'''
 14 |     expansion = 1
 15 | 
 16 |     def __init__(self, in_planes, planes, stride=1):
 17 |         super(PreActBlock, self).__init__()
 18 |         self.bn1 = nn.BatchNorm2d(in_planes)
 19 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 20 |         self.bn2 = nn.BatchNorm2d(planes)
 21 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
 22 | 
 23 |         if stride != 1 or in_planes != self.expansion*planes:
 24 |             self.shortcut = nn.Sequential(
 25 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
 26 |             )
 27 | 
 28 |     def forward(self, x):
 29 |         out = F.relu(self.bn1(x))
 30 |         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
 31 |         out = self.conv1(out)
 32 |         out = self.conv2(F.relu(self.bn2(out)))
 33 |         out += shortcut
 34 |         return out
 35 | 
 36 | 
 37 | class PreActBottleneck(nn.Module):
 38 |     '''Pre-activation version of the original Bottleneck module.'''
 39 |     expansion = 4
 40 | 
 41 |     def __init__(self, in_planes, planes, stride=1):
 42 |         super(PreActBottleneck, self).__init__()
 43 |         self.bn1 = nn.BatchNorm2d(in_planes)
 44 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
 45 |         self.bn2 = nn.BatchNorm2d(planes)
 46 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 47 |         self.bn3 = nn.BatchNorm2d(planes)
 48 |         self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
 49 | 
 50 |         if stride != 1 or in_planes != self.expansion*planes:
 51 |             self.shortcut = nn.Sequential(
 52 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
 53 |             )
 54 | 
 55 |     def forward(self, x):
 56 |         out = F.relu(self.bn1(x))
 57 |         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
 58 |         out = self.conv1(out)
 59 |         out = self.conv2(F.relu(self.bn2(out)))
 60 |         out = self.conv3(F.relu(self.bn3(out)))
 61 |         out += shortcut
 62 |         return out
 63 | 
 64 | 
 65 | class PreActResNet(nn.Module):
 66 |     def __init__(self, block, num_blocks, num_classes=10):
 67 |         super(PreActResNet, self).__init__()
 68 |         self.in_planes = 64
 69 | 
 70 |         self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
 71 |         self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
 72 |         self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
 73 |         self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
 74 |         self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
 75 |         self.linear = nn.Linear(512*block.expansion, num_classes)
 76 | 
 77 |     def _make_layer(self, block, planes, num_blocks, stride):
 78 |         strides = [stride] + [1]*(num_blocks-1)
 79 |         layers = []
 80 |         for stride in strides:
 81 |             layers.append(block(self.in_planes, planes, stride))
 82 |             self.in_planes = planes * block.expansion
 83 |         return nn.Sequential(*layers)
 84 | 
 85 |     def forward(self, x):
 86 |         out = self.conv1(x)
 87 |         out = self.layer1(out)
 88 |         out = self.layer2(out)
 89 |         out = self.layer3(out)
 90 |         out = self.layer4(out)
 91 |         out = F.avg_pool2d(out, 4)
 92 |         out = out.view(out.size(0), -1)
 93 |         out = self.linear(out)
 94 |         return out
 95 | 
 96 | 
 97 | def PreActResNet18():
 98 |     return PreActResNet(PreActBlock, [2,2,2,2])
 99 | 
def PreActResNet34():
    '''Build a PreActResNet from PreActBlock with [3, 4, 6, 3] blocks per stage.'''
    stage_depths = [3, 4, 6, 3]
    return PreActResNet(PreActBlock, stage_depths)
102 | 
def PreActResNet50():
    '''Build a PreActResNet from PreActBottleneck with [3, 4, 6, 3] blocks per stage.'''
    stage_depths = [3, 4, 6, 3]
    return PreActResNet(PreActBottleneck, stage_depths)
105 | 
def PreActResNet101():
    '''Build a PreActResNet from PreActBottleneck with [3, 4, 23, 3] blocks per stage.'''
    stage_depths = [3, 4, 23, 3]
    return PreActResNet(PreActBottleneck, stage_depths)
108 | 
def PreActResNet152():
    '''Build a PreActResNet from PreActBottleneck with [3, 8, 36, 3] blocks per stage.'''
    stage_depths = [3, 8, 36, 3]
    return PreActResNet(PreActBottleneck, stage_depths)
111 | 
112 | 
def test():
    '''Smoke test: run PreActResNet18 on one random 3x32x32 image and print the output size.'''
    model = PreActResNet18()
    sample = torch.randn(1, 3, 32, 32)
    logits = model(sample)
    print(logits.size())
117 | 
118 | # test()
119 | 


--------------------------------------------------------------------------------
/cifar/models/resnet.py:
--------------------------------------------------------------------------------
  1 | '''ResNet in PyTorch.
  2 | 
  3 | For Pre-activation ResNet, see 'preact_resnet.py'.
  4 | 
  5 | Reference:
  6 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
  7 |     Deep Residual Learning for Image Recognition. arXiv:1512.03385
  8 | '''
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.nn.functional as F
 12 | 
 13 | 
 14 | class BasicBlock(nn.Module):
 15 |     expansion = 1
 16 | 
 17 |     def __init__(self, in_planes, planes, stride=1):
 18 |         super(BasicBlock, self).__init__()
 19 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 20 |         self.bn1 = nn.BatchNorm2d(planes)
 21 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
 22 |         self.bn2 = nn.BatchNorm2d(planes)
 23 | 
 24 |         self.shortcut = nn.Sequential()
 25 |         if stride != 1 or in_planes != self.expansion*planes:
 26 |             self.shortcut = nn.Sequential(
 27 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
 28 |                 nn.BatchNorm2d(self.expansion*planes)
 29 |             )
 30 | 
 31 |     def forward(self, x):
 32 |         out = F.relu(self.bn1(self.conv1(x)))
 33 |         out = self.bn2(self.conv2(out))
 34 |         out += self.shortcut(x)
 35 |         out = F.relu(out)
 36 |         return out
 37 | 
 38 | 
 39 | class Bottleneck(nn.Module):
 40 |     expansion = 4
 41 | 
 42 |     def __init__(self, in_planes, planes, stride=1):
 43 |         super(Bottleneck, self).__init__()
 44 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
 45 |         self.bn1 = nn.BatchNorm2d(planes)
 46 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 47 |         self.bn2 = nn.BatchNorm2d(planes)
 48 |         self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
 49 |         self.bn3 = nn.BatchNorm2d(self.expansion*planes)
 50 | 
 51 |         self.shortcut = nn.Sequential()
 52 |         if stride != 1 or in_planes != self.expansion*planes:
 53 |             self.shortcut = nn.Sequential(
 54 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
 55 |                 nn.BatchNorm2d(self.expansion*planes)
 56 |             )
 57 | 
 58 |     def forward(self, x):
 59 |         out = F.relu(self.bn1(self.conv1(x)))
 60 |         out = F.relu(self.bn2(self.conv2(out)))
 61 |         out = self.bn3(self.conv3(out))
 62 |         out += self.shortcut(x)
 63 |         out = F.relu(out)
 64 |         return out
 65 | 
 66 | 
 67 | class ResNet(nn.Module):
 68 |     def __init__(self, block, num_blocks, num_classes=10):
 69 |         super(ResNet, self).__init__()
 70 |         self.in_planes = 64
 71 | 
 72 |         self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
 73 |         self.bn1 = nn.BatchNorm2d(64)
 74 |         self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
 75 |         self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
 76 |         self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
 77 |         self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
 78 |         self.linear = nn.Linear(512*block.expansion, num_classes)
 79 | 
 80 |     def _make_layer(self, block, planes, num_blocks, stride):
 81 |         strides = [stride] + [1]*(num_blocks-1)
 82 |         layers = []
 83 |         for stride in strides:
 84 |             layers.append(block(self.in_planes, planes, stride))
 85 |             self.in_planes = planes * block.expansion
 86 |         return nn.Sequential(*layers)
 87 | 
 88 |     def forward(self, x):
 89 |         out = F.relu(self.bn1(self.conv1(x)))
 90 |         out = self.layer1(out)
 91 |         out = self.layer2(out)
 92 |         out = self.layer3(out)
 93 |         out = self.layer4(out)
 94 |         out = F.avg_pool2d(out, 4)
 95 |         out = out.view(out.size(0), -1)
 96 |         out = self.linear(out)
 97 |         return out
 98 | 
 99 | 
def ResNet18(Num_classes=10):
    '''Build a ResNet from BasicBlock with [2, 2, 2, 2] blocks per stage.'''
    stage_depths = [2, 2, 2, 2]
    return ResNet(BasicBlock, stage_depths, num_classes=Num_classes)
102 | 
def ResNet34(Num_classes=10):
    '''Build a ResNet from BasicBlock with [3, 4, 6, 3] blocks per stage.'''
    stage_depths = [3, 4, 6, 3]
    return ResNet(BasicBlock, stage_depths, num_classes=Num_classes)
105 | 
def ResNet50(Num_classes=10):
    '''Build a ResNet from Bottleneck with [3, 4, 6, 3] blocks per stage.'''
    stage_depths = [3, 4, 6, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=Num_classes)
108 | 
def ResNet101(Num_classes=10):
    '''Build a ResNet from Bottleneck with [3, 4, 23, 3] blocks per stage.'''
    stage_depths = [3, 4, 23, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=Num_classes)
111 | 
def ResNet152(Num_classes=10):
    '''Build a ResNet from Bottleneck with [3, 8, 36, 3] blocks per stage.'''
    stage_depths = [3, 8, 36, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=Num_classes)
114 | 
115 | 
def test():
    '''Smoke test: run ResNet18 on one random 3x32x32 image and print the output size.'''
    model = ResNet18()
    sample = torch.randn(1, 3, 32, 32)
    logits = model(sample)
    print(logits.size())
120 | 
121 | # test()
122 | 


--------------------------------------------------------------------------------
/cifar/models/resnext.py:
--------------------------------------------------------------------------------
 1 | '''ResNeXt in PyTorch.
 2 | 
 3 | See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.
 4 | '''
 5 | import torch
 6 | import torch.nn as nn
 7 | import torch.nn.functional as F
 8 | 
 9 | 
class Block(nn.Module):
    '''Grouped-convolution residual block: 1x1, grouped 3x3, 1x1, plus shortcut.

    The internal width is ``cardinality * bottleneck_width`` and the output
    has ``expansion`` times that many channels.
    '''
    expansion = 2

    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
        super(Block, self).__init__()
        group_width = cardinality * bottleneck_width
        out_planes = self.expansion * group_width
        self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(group_width)
        # Grouped 3x3 conv with ``cardinality`` groups; carries the stride.
        self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)
        self.bn2 = nn.BatchNorm2d(group_width)
        self.conv3 = nn.Conv2d(group_width, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        if stride == 1 and in_planes == out_planes:
            # Empty Sequential acts as the identity.
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes)
            )

    def forward(self, x):
        branch = F.relu(self.bn1(self.conv1(x)))
        branch = F.relu(self.bn2(self.conv2(branch)))
        branch = self.bn3(self.conv3(branch))
        branch += self.shortcut(x)
        return F.relu(branch)
38 | 
39 | 
class ResNeXt(nn.Module):
    '''ResNeXt with three stages of grouped-convolution blocks and a linear head.

    Args:
        num_blocks: number of blocks per stage (the first three entries are used).
        cardinality: number of groups in each block's 3x3 convolution.
        bottleneck_width: per-group width of the first stage; doubled after
            every stage.
        num_classes: width of the final linear classifier.
    '''
    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
        super(ResNeXt, self).__init__()
        self.cardinality = cardinality
        # Mutated by _make_layer: doubles after each stage is built.
        self.bottleneck_width = bottleneck_width
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(num_blocks[0], 1)
        self.layer2 = self._make_layer(num_blocks[1], 2)
        self.layer3 = self._make_layer(num_blocks[2], 2)
        # self.layer4 = self._make_layer(num_blocks[3], 2)
        # Uses the constructor argument (not the doubled attribute): the last
        # stage's width is 4x the initial one and Block.expansion is 2.
        self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes)

    def _make_layer(self, num_blocks, stride):
        '''Build one stage; only its first block uses ``stride``.'''
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, block_stride))
            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width
        # Increase bottleneck_width by 2 after each stage.
        self.bottleneck_width *= 2
        return nn.Sequential(*stage)

    def forward(self, x):
        features = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            features = stage(features)
        # out = self.layer4(out)
        pooled = F.avg_pool2d(features, 8)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)
75 | 
76 | 
def ResNeXt29_2x64d(Num_classes=10):
    '''Build a 3-3-3 ResNeXt with cardinality 2 and bottleneck width 64.'''
    model = ResNeXt(num_blocks=[3, 3, 3], cardinality=2, bottleneck_width=64, num_classes=Num_classes)
    return model
79 | 
def ResNeXt29_4x64d(Num_classes=10):
    '''Build a 3-3-3 ResNeXt with cardinality 4 and bottleneck width 64.'''
    model = ResNeXt(num_blocks=[3, 3, 3], cardinality=4, bottleneck_width=64, num_classes=Num_classes)
    return model
82 | 
def ResNeXt29_8x64d(Num_classes=10):
    '''Build a 3-3-3 ResNeXt with cardinality 8 and bottleneck width 64.'''
    model = ResNeXt(num_blocks=[3, 3, 3], cardinality=8, bottleneck_width=64, num_classes=Num_classes)
    return model
85 | 
def ResNeXt29_32x4d(Num_classes=10):
    '''Build a 3-3-3 ResNeXt with cardinality 32 and bottleneck width 4.'''
    model = ResNeXt(num_blocks=[3, 3, 3], cardinality=32, bottleneck_width=4, num_classes=Num_classes)
    return model
88 | 
def test_resnext():
    '''Smoke test: run ResNeXt29_2x64d on one random 3x32x32 image and print the output size.'''
    model = ResNeXt29_2x64d()
    sample = torch.randn(1, 3, 32, 32)
    logits = model(sample)
    print(logits.size())
94 | 
95 | # test_resnext()
96 | 


--------------------------------------------------------------------------------
/cifar/models/senet.py:
--------------------------------------------------------------------------------
  1 | '''SENet in PyTorch.
  2 | 
  3 | SENet is the winner of ImageNet-2017. The paper is not released yet.
  4 | '''
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | 
  9 | 
 10 | class BasicBlock(nn.Module):
 11 |     def __init__(self, in_planes, planes, stride=1):
 12 |         super(BasicBlock, self).__init__()
 13 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 14 |         self.bn1 = nn.BatchNorm2d(planes)
 15 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
 16 |         self.bn2 = nn.BatchNorm2d(planes)
 17 | 
 18 |         self.shortcut = nn.Sequential()
 19 |         if stride != 1 or in_planes != planes:
 20 |             self.shortcut = nn.Sequential(
 21 |                 nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
 22 |                 nn.BatchNorm2d(planes)
 23 |             )
 24 | 
 25 |         # SE layers
 26 |         self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear
 27 |         self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)
 28 | 
 29 |     def forward(self, x):
 30 |         out = F.relu(self.bn1(self.conv1(x)))
 31 |         out = self.bn2(self.conv2(out))
 32 | 
 33 |         # Squeeze
 34 |         w = F.avg_pool2d(out, out.size(2))
 35 |         w = F.relu(self.fc1(w))
 36 |         w = F.sigmoid(self.fc2(w))
 37 |         # Excitation
 38 |         out = out * w  # New broadcasting feature from v0.2!
 39 | 
 40 |         out += self.shortcut(x)
 41 |         out = F.relu(out)
 42 |         return out
 43 | 
 44 | 
 45 | class PreActBlock(nn.Module):
 46 |     def __init__(self, in_planes, planes, stride=1):
 47 |         super(PreActBlock, self).__init__()
 48 |         self.bn1 = nn.BatchNorm2d(in_planes)
 49 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 50 |         self.bn2 = nn.BatchNorm2d(planes)
 51 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
 52 | 
 53 |         if stride != 1 or in_planes != planes:
 54 |             self.shortcut = nn.Sequential(
 55 |                 nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)
 56 |             )
 57 | 
 58 |         # SE layers
 59 |         self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)
 60 |         self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)
 61 | 
 62 |     def forward(self, x):
 63 |         out = F.relu(self.bn1(x))
 64 |         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
 65 |         out = self.conv1(out)
 66 |         out = self.conv2(F.relu(self.bn2(out)))
 67 | 
 68 |         # Squeeze
 69 |         w = F.avg_pool2d(out, out.size(2))
 70 |         w = F.relu(self.fc1(w))
 71 |         w = F.sigmoid(self.fc2(w))
 72 |         # Excitation
 73 |         out = out * w
 74 | 
 75 |         out += shortcut
 76 |         return out
 77 | 
 78 | 
 79 | class SENet(nn.Module):
 80 |     def __init__(self, block, num_blocks, num_classes=10):
 81 |         super(SENet, self).__init__()
 82 |         self.in_planes = 64
 83 | 
 84 |         self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
 85 |         self.bn1 = nn.BatchNorm2d(64)
 86 |         self.layer1 = self._make_layer(block,  64, num_blocks[0], stride=1)
 87 |         self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
 88 |         self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
 89 |         self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
 90 |         self.linear = nn.Linear(512, num_classes)
 91 | 
 92 |     def _make_layer(self, block, planes, num_blocks, stride):
 93 |         strides = [stride] + [1]*(num_blocks-1)
 94 |         layers = []
 95 |         for stride in strides:
 96 |             layers.append(block(self.in_planes, planes, stride))
 97 |             self.in_planes = planes
 98 |         return nn.Sequential(*layers)
 99 | 
100 |     def forward(self, x):
101 |         out = F.relu(self.bn1(self.conv1(x)))
102 |         out = self.layer1(out)
103 |         out = self.layer2(out)
104 |         out = self.layer3(out)
105 |         out = self.layer4(out)
106 |         out = F.avg_pool2d(out, 4)
107 |         out = out.view(out.size(0), -1)
108 |         out = self.linear(out)
109 |         return out
110 | 
111 | 
def SENet18():
    '''Build an SENet from PreActBlock with [2, 2, 2, 2] blocks per stage.'''
    stage_depths = [2, 2, 2, 2]
    return SENet(PreActBlock, stage_depths)
114 | 
115 | 
def test():
    '''Smoke test: run SENet18 on one random 3x32x32 image and print the output size.'''
    model = SENet18()
    sample = torch.randn(1, 3, 32, 32)
    logits = model(sample)
    print(logits.size())
120 | 
121 | # test()
122 | 


--------------------------------------------------------------------------------
/cifar/models/shufflenet.py:
--------------------------------------------------------------------------------
  1 | '''ShuffleNet in PyTorch.
  2 | 
  3 | See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
  4 | '''
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | 
  9 | 
 10 | class ShuffleBlock(nn.Module):
 11 |     def __init__(self, groups):
 12 |         super(ShuffleBlock, self).__init__()
 13 |         self.groups = groups
 14 | 
 15 |     def forward(self, x):
 16 |         '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]'''
 17 |         N,C,H,W = x.size()
 18 |         g = self.groups
 19 |         return x.view(N,g,C/g,H,W).permute(0,2,1,3,4).contiguous().view(N,C,H,W)
 20 | 
 21 | 
 22 | class Bottleneck(nn.Module):
 23 |     def __init__(self, in_planes, out_planes, stride, groups):
 24 |         super(Bottleneck, self).__init__()
 25 |         self.stride = stride
 26 | 
 27 |         mid_planes = out_planes/4
 28 |         g = 1 if in_planes==24 else groups
 29 |         self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
 30 |         self.bn1 = nn.BatchNorm2d(mid_planes)
 31 |         self.shuffle1 = ShuffleBlock(groups=g)
 32 |         self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)
 33 |         self.bn2 = nn.BatchNorm2d(mid_planes)
 34 |         self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
 35 |         self.bn3 = nn.BatchNorm2d(out_planes)
 36 | 
 37 |         self.shortcut = nn.Sequential()
 38 |         if stride == 2:
 39 |             self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))
 40 | 
 41 |     def forward(self, x):
 42 |         out = F.relu(self.bn1(self.conv1(x)))
 43 |         out = self.shuffle1(out)
 44 |         out = F.relu(self.bn2(self.conv2(out)))
 45 |         out = self.bn3(self.conv3(out))
 46 |         res = self.shortcut(x)
 47 |         out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res)
 48 |         return out
 49 | 
 50 | 
 51 | class ShuffleNet(nn.Module):
 52 |     def __init__(self, cfg):
 53 |         super(ShuffleNet, self).__init__()
 54 |         out_planes = cfg['out_planes']
 55 |         num_blocks = cfg['num_blocks']
 56 |         groups = cfg['groups']
 57 | 
 58 |         self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
 59 |         self.bn1 = nn.BatchNorm2d(24)
 60 |         self.in_planes = 24
 61 |         self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)
 62 |         self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)
 63 |         self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)
 64 |         self.linear = nn.Linear(out_planes[2], 10)
 65 | 
 66 |     def _make_layer(self, out_planes, num_blocks, groups):
 67 |         layers = []
 68 |         for i in range(num_blocks):
 69 |             stride = 2 if i == 0 else 1
 70 |             cat_planes = self.in_planes if i == 0 else 0
 71 |             layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups))
 72 |             self.in_planes = out_planes
 73 |         return nn.Sequential(*layers)
 74 | 
 75 |     def forward(self, x):
 76 |         out = F.relu(self.bn1(self.conv1(x)))
 77 |         out = self.layer1(out)
 78 |         out = self.layer2(out)
 79 |         out = self.layer3(out)
 80 |         out = F.avg_pool2d(out, 4)
 81 |         out = out.view(out.size(0), -1)
 82 |         out = self.linear(out)
 83 |         return out
 84 | 
 85 | 
 86 | def ShuffleNetG2():
 87 |     cfg = {
 88 |         'out_planes': [200,400,800],
 89 |         'num_blocks': [4,8,4],
 90 |         'groups': 2
 91 |     }
 92 |     return ShuffleNet(cfg)
 93 | 
 94 | def ShuffleNetG3():
 95 |     cfg = {
 96 |         'out_planes': [240,480,960],
 97 |         'num_blocks': [4,8,4],
 98 |         'groups': 3
 99 |     }
100 |     return ShuffleNet(cfg)
101 | 
102 | 
def test():
    '''Smoke test: run ShuffleNetG2 on one random 3x32x32 image and print the output tensor.'''
    model = ShuffleNetG2()
    sample = torch.randn(1, 3, 32, 32)
    print(model(sample))
108 | 
109 | # test()
110 | 


--------------------------------------------------------------------------------
/cifar/models/vgg.py:
--------------------------------------------------------------------------------
 1 | '''VGG11/13/16/19 in Pytorch.'''
 2 | import torch
 3 | import torch.nn as nn
 4 | 
 5 | 
 6 | cfg = {
 7 |     'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
 8 |     'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
 9 |     'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
10 |     'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
11 | }
12 | 
13 | 
class VGG(nn.Module):
    '''VGG feature extractor followed by a single linear classifier.

    Args:
        vgg_name: key into the module-level ``cfg`` table.
        Num_classes: width of the linear classifier (default 100).
    '''
    def __init__(self, vgg_name,Num_classes=100):
        super(VGG, self).__init__()
        layer_spec = cfg[vgg_name]
        self.features = self._make_layers(layer_spec)
        self.classifier = nn.Linear(512, Num_classes)

    def forward(self, x):
        features = self.features(x)
        flat = features.view(features.size(0), -1)
        return self.classifier(flat)

    def _make_layers(self, cfg):
        '''Translate a layer spec into a Sequential of conv/BN/ReLU and pooling layers.'''
        modules = []
        prev_channels = 3
        for entry in cfg:
            if entry != 'M':
                # An int entry is the output width of a conv + BN + ReLU group.
                modules.extend([
                    nn.Conv2d(prev_channels, entry, kernel_size=3, padding=1),
                    nn.BatchNorm2d(entry),
                    nn.ReLU(inplace=True),
                ])
                prev_channels = entry
            else:
                modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        # Kernel-1, stride-1 average pool terminates the stack.
        modules.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*modules)
39 | 
40 | 
def test():
    """Smoke test: run a random 2x3x32x32 batch through VGG11 and print the output size."""
    model = VGG('VGG11')
    batch = torch.randn(2, 3, 32, 32)
    logits = model(batch)
    print(logits.size())
46 | 
47 | # test()
48 | 


--------------------------------------------------------------------------------
/figs/Rosenbrock.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhaut/AngularGrad/cd8cbeceb8ba729f120c9a0e5cf521c3a8a23bcf/figs/Rosenbrock.png


--------------------------------------------------------------------------------
/fine-grained/main.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import random
  4 | import shutil
  5 | import time
  6 | import warnings
  7 | import sys
  8 | 
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.nn.parallel
 12 | import torch.backends.cudnn as cudnn
 13 | import torch.distributed as dist
 14 | import torch.optim
 15 | import torch.optim as optim
 16 | import torch.multiprocessing as mp
 17 | import torch.utils.data
 18 | import torch.utils.data.distributed
 19 | import torchvision.transforms as transforms
 20 | import torchvision.datasets as datasets
 21 | import torchvision.models as models
 22 | 
 23 | from torch.optim import lr_scheduler
 24 | 
 25 | import numpy as np
 26 | 
 27 | import sys 
 28 | sys.path.append('../')
 29 |  
 30 | from myoptims.Diffgrad import diffgrad
 31 | from myoptims.tanangulargrad import tanangulargrad
 32 | from myoptims.cosangulargrad import cosangulargrad
 33 | 
 34 | 
 35 | 
 36 | def get_model(modelname, out_size):
 37 |     if modelname == 'r50p':
 38 |       model = models.resnet50(pretrained=True)
 39 |       model.fc = nn.Linear(in_features=2048, out_features=out_size, bias=True)
 40 |     elif modelname == 'r50':
 41 |       model = models.resnet50()
 42 |       model.fc = nn.Linear(in_features=2048, out_features=out_size, bias=True)
 43 |     else:
 44 |         print('==> Network not found...')
 45 |         exit()
 46 |     return model
 47 | 
 48 | 
 49 | def get_loaders(args):
 50 |     traindir = os.path.join(args.data, 'train')
 51 |     valdir = os.path.join(args.data, 'val')
 52 |     normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
 53 |                                      std=[0.229, 0.224, 0.225])
 54 | 
 55 |     train_dataset = datasets.ImageFolder(
 56 |         traindir,
 57 |         transforms.Compose([
 58 |                     transforms.Resize(512),
 59 |                     transforms.RandomHorizontalFlip(),
 60 |                     transforms.ColorJitter(brightness=0.2, contrast=0.2),
 61 |                     transforms.RandomCrop(448),
 62 |                     transforms.ToTensor(),
 63 |                     normalize,
 64 |                 ]))
 65 | 
 66 |     train_loader = torch.utils.data.DataLoader(
 67 |         train_dataset, batch_size=args.batch_size, shuffle=True,
 68 |         num_workers=args.workers, pin_memory=True, drop_last=True)
 69 | 
 70 |     val_loader = torch.utils.data.DataLoader(
 71 |         datasets.ImageFolder(valdir, transforms.Compose([
 72 |                     transforms.Resize(512),
 73 |                     transforms.CenterCrop(448),
 74 |                     transforms.ToTensor(),
 75 |                     normalize,
 76 |                 ])),
 77 |         batch_size=args.batch_size, shuffle=False,
 78 |         num_workers=args.workers, pin_memory=True,drop_last=True)
 79 |     
 80 |     return train_loader, val_loader
 81 | 
 82 | 
 83 | 
 84 | def train(train_loader, model, criterion, optimizer_base, optimizer_new, epoch, args):
 85 |     print('\nEpoch: %d' % epoch)
 86 |     model.train()
 87 |     total = 0
 88 |     train_loss = 0
 89 |     correct = 0
 90 |     for batch_idx, (input, target) in enumerate(train_loader):
 91 |         input, target = input.to('cuda'), target.to('cuda')
 92 | 
 93 |         output = model(input)
 94 |         loss = criterion(output, target)
 95 | 
 96 |         _, predicted = output.max(1)
 97 |         correct += predicted.eq(target).sum().item()
 98 | 
 99 |         train_loss += loss.item()
100 |         total += target.size(0)
101 |         optimizer_new.zero_grad()
102 |         optimizer_base.zero_grad()
103 |         loss.backward()
104 |         optimizer_new.step()
105 |         optimizer_base.step()
106 |         
107 |     print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(train_loss/(batch_idx+1),100.*correct/total))
108 |     acc=100.*correct/total
109 |     return acc, train_loss/(batch_idx+1)
110 | 
111 | 
def validate(val_loader, model, criterion, args):
    """Evaluate the model on a loader without tracking gradients.

    Args:
        val_loader: iterable yielding ``(input, target)`` batches.
        model: network to evaluate; switched to eval mode. Each batch is
            moved to the device of the model's first parameter.
        criterion: loss called as ``criterion(output, target)``.
        args: not used by this function.

    Returns:
        ``(accuracy_percent, mean_loss_per_batch)``.

    Raises:
        ValueError: if ``val_loader`` yields no batches.
    """
    model.eval()

    # Follow the model's device instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    val_loss = 0
    total = 0
    correct = 0
    num_batches = 0
    with torch.no_grad():
        for inputs, target in val_loader:
            inputs = inputs.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

            output = model(inputs)
            loss = criterion(output, target)

            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
            val_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        # Previously surfaced as a ZeroDivisionError / NameError.
        raise ValueError('val_loader produced no batches')

    acc = 100. * correct / total
    mean_loss = val_loss / num_batches
    print('Testing: Loss: {:.4f} | Acc: {:.4f}'.format(mean_loss, acc))

    return acc, mean_loss
135 | 
def main(args):
    """Fine-tune a ResNet-50 and checkpoint the best validation epoch.

    The backbone and the new ``fc`` head are driven by two separate
    optimizers of the same type, each with its own MultiStepLR schedule
    (milestones 30 and 50, gamma 0.1).

    Args:
        args: parsed command-line namespace; reads ``seed``, ``model``,
            ``dataset``, ``alg``, ``lr``, ``start_epoch``, ``epochs`` and,
            for SGD, ``momentum``. ``args.seed`` is filled in when None.

    Raises:
        SystemExit: with status 1 when ``args.alg`` is not recognised.
    """
    # The caller's ``args`` is used as-is; re-parsing the command line here
    # would discard it and tie this function to a global ``parser``.
    os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"
    class_num = {'cub': 200, 'cars': 196, 'fgvc': 100}

    if args.seed is None:
        args.seed = random.randint(1, 10000)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if device == 'cuda':
        torch.cuda.manual_seed_all(args.seed)

    model = get_model(args.model, class_num[args.dataset])

    # Always wrap in DataParallel because the head is addressed below as
    # ``model.module.fc``; place the model on whichever device is available.
    model = torch.nn.DataParallel(model).to(device)
    if device == 'cuda':
        cudnn.benchmark = True

    criterion = nn.CrossEntropyLoss()

    # Split parameters into the new head and everything else.
    new_params = list(model.module.fc.parameters())
    new_param_ids = set(map(id, new_params))
    base_params = [p for p in model.parameters() if id(p) not in new_param_ids]
    # NOTE(review): 'lr_mult' is stored in the groups but nothing in this
    # function reads it, so both optimizers start from args.lr. Confirm
    # whether a smaller backbone learning rate was intended.
    param_groups_base = [{'params': base_params, 'lr_mult': 0.1}]
    param_groups_new = [{'params': new_params, 'lr_mult': 1.0}]

    # Honour --momentum for SGD (default 0.9 matches the former constant).
    momentum = getattr(args, 'momentum', 0.9)
    optimizer_classes = {
        'rmsprop': optim.RMSprop,
        'adam': optim.Adam,
        'adamw': optim.AdamW,
        'diffgrad': diffgrad,
        'cosangulargrad': cosangulargrad,
        'tanangulargrad': tanangulargrad,
    }
    if args.alg == 'sgd':
        optimizer_base = optim.SGD(param_groups_base, args.lr, momentum=momentum)
        optimizer_new = optim.SGD(param_groups_new, args.lr, momentum=momentum)
    elif args.alg in optimizer_classes:
        optimizer_cls = optimizer_classes[args.alg]
        optimizer_base = optimizer_cls(param_groups_base, args.lr)
        optimizer_new = optimizer_cls(param_groups_new, args.lr)
    else:
        print('==> Optimizer not found...')
        # Non-zero status so wrapper scripts can detect the failure.
        sys.exit(1)

    exp_lr_scheduler_new = lr_scheduler.MultiStepLR(optimizer_new, milestones=[30,50], gamma=0.1)
    exp_lr_scheduler_base = lr_scheduler.MultiStepLR(optimizer_base, milestones=[30,50], gamma=0.1)

    train_loader, val_loader = get_loaders(args)

    best_acc = -1
    for epoch in range(args.start_epoch, args.epochs):
        train(train_loader, model, criterion, optimizer_base, optimizer_new, epoch, args)
        exp_lr_scheduler_new.step()
        exp_lr_scheduler_base.step()
        val_acc, val_loss = validate(val_loader, model, criterion, args)

        if val_acc > best_acc:
            print('Saving..')
            # Update first so the checkpoint records the current best rather
            # than the previous one.
            best_acc = val_acc
            state = {
                'model': model.state_dict(),
                'acc': val_acc,
                'epoch': epoch,
                'best_acc': best_acc,
            }
            os.makedirs('checkpoint', exist_ok=True)
            torch.save(state, './checkpoint/ckpt.t7')
216 | 
217 | 
if __name__ == '__main__':
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(description='PyTorch Fine-Grained Training')
    parser.add_argument('-b', '--batch-size', default=128, type=int, metavar='N', help='mini-batch size')
    parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
    parser.add_argument('--epochs', default=60, type=int, metavar='N',
                    help='number of total epochs to run')
    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
    parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
    parser.add_argument('--model', default='r50p', type=str, help='model')
    # NOTE(review): the help text below looks copy-pasted, and args.path is
    # not referenced anywhere else in this script.
    parser.add_argument('--path', default='test', type=str, help='model')
    parser.add_argument('--alg', default='adam', type=str, help='algorithm')
    # Help text corrected: this option selects the dataset, not the model.
    parser.add_argument('--dataset', default='cub', type=str, help='dataset (cub | cars | fgvc)')
    args = parser.parse_args()
    main(args)
241 | 


--------------------------------------------------------------------------------
/mini-imagenet/main.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import random
  4 | import shutil
  5 | import time
  6 | import warnings
  7 | import sys
  8 | import torch
  9 | import torch.nn as nn
 10 | import torch.nn.parallel
 11 | import torch.backends.cudnn as cudnn
 12 | import torch.distributed as dist
 13 | #import torch.optim
 14 | import torch.optim as optim
 15 | import torch.multiprocessing as mp
 16 | import torch.utils.data
 17 | import torch.utils.data.distributed
 18 | import torchvision.transforms as transforms
 19 | import torchvision.datasets as datasets
 20 | import torchvision.models as models
 21 | from models.resnet_ws import l_resnet50, l_resnet18, l_resnet101
 22 | 
 23 | import torchvision.models as models
 24 | import math
 25 | import numpy as np
 26 | from torch.optim import lr_scheduler
 27 | 
 28 | 
 29 | import sys 
 30 | sys.path.append('../')
 31 |  
 32 | from myoptims.Diffgrad import diffgrad
 33 | from myoptims.tanangulargrad import tanangulargrad
 34 | from myoptims.cosangulargrad import cosangulargrad
 35 | from myoptims.AdaBelief import AdaBelief
 36 | 
 37 | 
 38 | 
 39 | 
 40 | def get_optim(optim_name, learning_rate, net):
 41 |     if   optim_name == 'sgd':            optimizer = optim.SGD(     net.parameters(), lr=learning_rate, momentum=0.9)
 42 |     elif optim_name == 'rmsprop':        optimizer = optim.RMSprop( net.parameters(), lr=learning_rate)
 43 |     elif optim_name == 'adam':           optimizer = optim.Adam(    net.parameters(), lr=learning_rate)
 44 |     elif optim_name == 'adamw':          optimizer = optim.AdamW(   net.parameters(), lr=learning_rate)
 45 |     elif optim_name == 'diffgrad':       optimizer = diffgrad(      net.parameters(), lr=learning_rate)
 46 |     elif optim_name == 'adabelief':      optimizer = AdaBelief(     net.parameters(), lr=learning_rate)
 47 |     elif optim_name == 'cosangulargrad': optimizer = cosangulargrad(net.parameters(), lr=learning_rate)
 48 |     elif optim_name == 'tanangulargrad': optimizer = tanangulargrad(net.parameters(), lr=learning_rate)
 49 |     else:
 50 |         print('==> Optimizer not found...')
 51 |         exit()
 52 |     return optimizer
 53 | 
 54 | 
 55 | def get_model(modelname):
 56 |     # create model
 57 |     num_classes=100
 58 |     if modelname=='r18':
 59 |         model = models.resnet18()
 60 |         model.fc = nn.Linear(in_features=512, out_features=num_classes, bias=True)
 61 |     elif modelname=='r50':
 62 |         model = models.resnet50()
 63 |         model.fc = nn.Linear(in_features=2048, out_features=num_classes, bias=True)
 64 |     elif modelname=='r101':
 65 |         model = models.resnet101()
 66 |         model.fc = nn.Linear(in_features=2048, out_features=num_classes, bias=True)
 67 |     elif modelname=='r18ws':
 68 |       model = l_resnet18(num_classes=num_classes)
 69 |     elif modelname=='r50ws':
 70 |       model = l_resnet50(num_classes=num_classes)
 71 |     elif modelname=='r101ws':
 72 |       model = l_resnet101(num_classes=num_classes)
 73 |     else:
 74 |         print('==> Network not found...')
 75 |         exit()
 76 |     for m in model.modules():
 77 |             if isinstance(m, nn.Conv2d):
 78 |                 n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
 79 |                 m.weight.data.normal_(0, math.sqrt(2. / n))
 80 |             elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.GroupNorm):
 81 |                 m.weight.data.uniform_()
 82 |                 m.bias.data.zero_()
 83 |     return model
 84 | 
 85 | 
 86 | def get_loaders(args):
 87 |     print('==> Preparing MINI-Imagenet data...')
 88 |     traindir = os.path.join(args.data, 'train')
 89 |     valdir = os.path.join(args.data, 'val')
 90 |     normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
 91 |                                      std=[0.229, 0.224, 0.225])
 92 |     train_dataset = datasets.ImageFolder(
 93 |         traindir,
 94 |         transforms.Compose([
 95 |             transforms.RandomResizedCrop(224),
 96 |             transforms.RandomHorizontalFlip(),
 97 |             transforms.ToTensor(),
 98 |             normalize,
 99 |          ]))
100 | 
101 |     train_loader = torch.utils.data.DataLoader(
102 |         train_dataset, batch_size=args.batch_size, shuffle=True,
103 |         num_workers=args.workers, pin_memory=True,drop_last=True)
104 | 
105 | 
106 |     val_loader = torch.utils.data.DataLoader(
107 |         datasets.ImageFolder(valdir, transforms.Compose([
108 |             transforms.Resize(256),
109 |             transforms.CenterCrop(224),
110 |             transforms.ToTensor(),
111 |             normalize,
112 |         ])),
113 |         batch_size=args.batch_size, shuffle=False,
114 |         num_workers=args.workers, pin_memory=True)
115 | 
116 |     return train_loader, val_loader
117 | 
118 | 
119 | 
def train(train_loader, model, criterion, optimizer, epoch, args):
    """Run one training epoch.

    Args:
        train_loader: iterable yielding ``(input, target)`` batches.
        model: network to train; each batch is moved to the device of the
            model's first parameter.
        criterion: loss called as ``criterion(output, target)``.
        optimizer: zeroed before and stepped after each backward pass.
        epoch: epoch index; only printed.
        args: not used by this function.

    Returns:
        ``(accuracy_percent, mean_loss_per_batch)``.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    print('\nEpoch: %d' % epoch)
    model.train()

    # Follow the model's device instead of hard-coding 'cuda', so the loop
    # also works when the model was left on the CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    total = 0
    train_loss = 0
    correct = 0
    num_batches = 0
    for inputs, target in train_loader:
        inputs, target = inputs.to(device), target.to(device)

        output = model(inputs)
        loss = criterion(output, target)

        _, predicted = output.max(1)
        correct += predicted.eq(target).sum().item()

        train_loss += loss.item()
        total += target.size(0)
        num_batches += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if num_batches == 0:
        # Previously surfaced as a NameError on the loop variable.
        raise ValueError('train_loader produced no batches')

    acc = 100. * correct / total
    mean_loss = train_loss / num_batches
    print('Training: Loss: {:.4f} | Acc: {:.4f}'.format(mean_loss, acc))
    return acc, mean_loss
144 | 
145 | 
def validate(val_loader, model, criterion, args):
    """Evaluate the model on a loader without tracking gradients.

    Args:
        val_loader: iterable yielding ``(input, target)`` batches.
        model: network to evaluate; switched to eval mode. Each batch is
            moved to the device of the model's first parameter.
        criterion: loss called as ``criterion(output, target)``.
        args: not used by this function.

    Returns:
        ``(accuracy_percent, mean_loss_per_batch)``.

    Raises:
        ValueError: if ``val_loader`` yields no batches.
    """
    model.eval()

    # Follow the model's device instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    val_loss = 0
    total = 0
    correct = 0
    num_batches = 0
    with torch.no_grad():
        for inputs, target in val_loader:
            inputs = inputs.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

            output = model(inputs)
            loss = criterion(output, target)

            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
            val_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        # Previously surfaced as a ZeroDivisionError / NameError.
        raise ValueError('val_loader produced no batches')

    acc = 100. * correct / total
    mean_loss = val_loss / num_batches
    print('Testing: Loss: {:.4f} | Acc: {:.4f}'.format(mean_loss, acc))

    return acc, mean_loss
169 | 
170 | 
def main(args):
    """Train a model on Mini-ImageNet and checkpoint the best validation epoch.

    Args:
        args: parsed command-line namespace; reads ``model``, ``seed``,
            ``alg``, ``lr``, ``start_epoch`` and ``epochs``. ``args.arch``
            is set to ``args.model`` and ``args.seed`` is filled in when None.
    """
    args.arch = args.model
    os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

    # Seed the Python and torch generators (drawing a seed if none was given).
    if args.seed is None:
        args.seed = random.randint(1, 10000)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed_all(args.seed)

    net = get_model(args.model)
    if use_cuda:
        net = torch.nn.DataParallel(net.cuda())
        cudnn.benchmark = True

    loss_fn = nn.CrossEntropyLoss()
    optimizer = get_optim(args.alg, args.lr, net)
    # Multiply the learning rate by 0.1 every 80 epochs.
    scheduler = lr_scheduler.StepLR(optimizer, step_size=80, gamma=0.1)

    train_loader, val_loader = get_loaders(args)

    checkpoint_path = './checkpoint/ckpt' + '_' + args.model + '.t7'
    best_acc = -1
    for epoch in range(args.start_epoch, args.epochs):
        train(train_loader, net, loss_fn, optimizer, epoch, args)
        scheduler.step()
        val_acc, _ = validate(val_loader, net, loss_fn, args)

        if val_acc <= best_acc:
            continue

        # New best validation accuracy: persist the weights.
        print('Saving..')
        snapshot = {
            'model': net.state_dict(),
            'acc': val_acc,
            'epoch': epoch,
        }
        os.makedirs('checkpoint', exist_ok=True)
        torch.save(snapshot, checkpoint_path)
        best_acc = val_acc
    print('Best Acc: {:.2f}'.format(best_acc))
215 | 
216 | 
217 | 
218 | 
219 | 
if __name__ == '__main__':
    # Lower-case, non-dunder callables exposed by torchvision.models.
    model_names = sorted(
        name for name, member in models.__dict__.items()
        if name.islower() and not name.startswith("__") and callable(member)
    )

    # Command-line interface; options are declared in the order they appear
    # in the generated help.
    parser = argparse.ArgumentParser(description='PyTorch Mini-ImageNet Training')
    parser.add_argument(
        '-b', '--batch_size',
        default=128, type=int, metavar='N',
        help='mini-batch size')
    parser.add_argument(
        '--lr', '--learning-rate',
        default=1e-3, type=float, metavar='LR', dest='lr',
        help='initial learning rate')
    parser.add_argument(
        'data',
        metavar='DIR',
        help='path to dataset')
    parser.add_argument(
        '-j', '--workers',
        default=4, type=int, metavar='N',
        help='number of data loading workers (default: 4)')
    parser.add_argument(
        '--epochs',
        default=100, type=int, metavar='N',
        help='number of total epochs to run')
    parser.add_argument(
        '--start-epoch',
        default=0, type=int, metavar='N',
        help='manual epoch number (useful on restarts)')
    parser.add_argument(
        '--resume',
        default='', type=str, metavar='PATH',
        help='path to latest checkpoint (default: none)')
    parser.add_argument(
        '-e', '--evaluate',
        dest='evaluate', action='store_true',
        help='evaluate model on validation set')
    parser.add_argument(
        '--seed',
        default=None, type=int,
        help='seed for initializing training. ')
    parser.add_argument(
        '--model',
        default='r50', type=str,
        help='model')
    parser.add_argument(
        '--alg',
        default='adam', type=str,
        help='optimizer')

    args = parser.parse_args()
    main(args)
253 | 


--------------------------------------------------------------------------------
/mini-imagenet/models/resnet_ws.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | import torch.utils.model_zoo as model_zoo
  3 | 
  4 | import torch
  5 | import torch.nn as nn
  6 | from torch.nn.parameter import Parameter
  7 | from torch.nn import functional as F
  8 | 
  9 | #from .. import layers as L
 10 | import math
 11 | 
 12 | __all__ = ['ResNet', 'l_resnet18', 'l_resnet34', 'l_resnet50', 'l_resnet101',
 13 |            'l_resnet152']
 14 | 
 15 | 
 16 | class Conv2d(nn.Conv2d):
 17 | 
 18 |     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
 19 |                  padding=0, dilation=1, groups=1, bias=True):
 20 |         super(Conv2d, self).__init__(in_channels, out_channels, kernel_size, stride,
 21 |                  padding, dilation, groups, bias)
 22 | 
 23 |     def forward(self, x):
 24 |         # return super(Conv2d, self).forward(x)
 25 |         weight = self.weight
 26 |         weight_mean = weight.mean(dim=1, keepdim=True).mean(dim=2,
 27 |                                   keepdim=True).mean(dim=3, keepdim=True)
 28 |         weight = weight - weight_mean
 29 |         std = weight.view(weight.size(0), -1).std(dim=1).view(-1, 1, 1, 1) + 1e-5
 30 |         weight = weight / std.expand_as(weight)
 31 |         return F.conv2d(x, weight, self.bias, self.stride,
 32 |                         self.padding, self.dilation, self.groups)
 33 | 
 34 | 
 35 | def BatchNorm2d(num_features):
 36 | 
 37 |     #return nn.GroupNorm(num_channels=num_features, num_groups=32)
 38 |     return nn.BatchNorm2d(num_features=num_features)
 39 | 
 40 | 
 41 | def conv3x3(in_planes, out_planes, stride=1):
 42 |     """3x3 convolution with padding"""
 43 |     return Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
 44 |                      padding=1, bias=False)
 45 | 
 46 | 
 47 | def conv1x1(in_planes, out_planes, stride=1):
 48 |     """1x1 convolution"""
 49 |     return Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
 50 | 
 51 | 
 52 | class BasicBlock(nn.Module):
 53 |     expansion = 1
 54 | 
 55 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
 56 |         super(BasicBlock, self).__init__()
 57 |         self.conv1 = conv3x3(inplanes, planes, stride)
 58 |         self.bn1 = BatchNorm2d(planes)
 59 |         self.relu = nn.ReLU(inplace=True)
 60 |         self.conv2 = conv3x3(planes, planes)
 61 |         self.bn2 = BatchNorm2d(planes)
 62 |         self.downsample = downsample
 63 |         self.stride = stride
 64 | 
 65 |     def forward(self, x):
 66 |         identity = x
 67 | 
 68 |         out = self.conv1(x)
 69 |         out = self.bn1(out)
 70 |         out = self.relu(out)
 71 | 
 72 |         out = self.conv2(out)
 73 |         out = self.bn2(out)
 74 | 
 75 |         if self.downsample is not None:
 76 |             identity = self.downsample(x)
 77 | 
 78 |         out += identity
 79 |         out = self.relu(out)
 80 | 
 81 |         return out
 82 | 
 83 | 
 84 | class Bottleneck(nn.Module):
 85 |     expansion = 4
 86 | 
 87 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
 88 |         super(Bottleneck, self).__init__()
 89 |         self.conv1 = conv1x1(inplanes, planes)
 90 |         self.bn1 = BatchNorm2d(planes)
 91 |         self.conv2 = conv3x3(planes, planes, stride)
 92 |         self.bn2 = BatchNorm2d(planes)
 93 |         self.conv3 = conv1x1(planes, planes * self.expansion)
 94 |         self.bn3 = BatchNorm2d(planes * self.expansion)
 95 |         self.relu = nn.ReLU(inplace=True)
 96 |         self.downsample = downsample
 97 |         self.stride = stride
 98 | 
 99 |     def forward(self, x):
100 |         identity = x
101 | 
102 |         out = self.conv1(x)
103 |         out = self.bn1(out)
104 |         out = self.relu(out)
105 | 
106 |         out = self.conv2(out)
107 |         out = self.bn2(out)
108 |         out = self.relu(out)
109 | 
110 |         out = self.conv3(out)
111 |         out = self.bn3(out)
112 | 
113 |         if self.downsample is not None:
114 |             identity = self.downsample(x)
115 | 
116 |         out += identity
117 |         out = self.relu(out)
118 | 
119 |         return out
120 | 
121 | 
class ResNet(nn.Module):
    """ResNet assembled from this module's ``Conv2d`` and ``BatchNorm2d``.

    Args:
        block: residual block class (exposes an ``expansion`` attribute).
        layers: number of blocks in each of the four stages.
        num_classes: outputs of the final linear layer (default 1000).
        zero_init_residual: when True, set the weight of the last norm layer
            of every residual block to zero.
    """

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False):
        super().__init__()
        self.inplanes = 64

        # Stem: 7x7 stride-2 convolution, norm, ReLU, 3x3 stride-2 max pool.
        self.conv1 = Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                            bias=False)
        self.bn1 = BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; all but the first use stride 2.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Explicit initialisation: normal weights with std
        # sqrt(2 / (kh * kw * out_channels)) for the custom convolutions,
        # uniform_() weights and zero biases for batch norm.
        for module in self.modules():
            if isinstance(module, Conv2d):
                kernel_h, kernel_w = module.kernel_size[0], module.kernel_size[1]
                fan = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.uniform_()
                module.bias.data.zero_()

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for module in self.modules():
                if isinstance(module, Bottleneck):
                    nn.init.constant_(module.bn3.weight, 0)
                elif isinstance(module, BasicBlock):
                    nn.init.constant_(module.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks.

        The first block receives ``stride`` and, when the stride is not 1 or
        the channel count changes, a 1x1-conv + norm downsample module.
        ``self.inplanes`` is updated to the stage's output channels.
        """
        out_planes = planes * block.expansion

        shortcut = None
        if stride != 1 or self.inplanes != out_planes:
            shortcut = nn.Sequential(
                conv1x1(self.inplanes, out_planes, stride),
                BatchNorm2d(out_planes),
            )

        stage = [block(self.inplanes, planes, stride, shortcut)]
        self.inplanes = out_planes
        stage.extend(block(self.inplanes, planes) for _ in range(1, blocks))

        return nn.Sequential(*stage)

    def forward(self, x):
        """Run stem, four stages, pooling and the linear classifier."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        pooled = self.avgpool(x)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)
193 | 
194 | 
def l_resnet18(pretrained=False, **kwargs):
    """Construct a ResNet-18 (BasicBlock, stages of 2/2/2/2 blocks).

    Args:
        pretrained (bool): accepted for signature compatibility; this
            function does not use it, so weights are always freshly
            initialised.
        **kwargs: forwarded to ``ResNet``.
    """
    return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
202 | 
203 | 
def l_resnet34(pretrained=False, **kwargs):
    """Construct a ResNet-34 (BasicBlock, stages of 3/4/6/3 blocks).

    Args:
        pretrained (bool): accepted for signature compatibility; this
            function does not use it, so weights are always freshly
            initialised.
        **kwargs: forwarded to ``ResNet``.
    """
    return ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
211 | 
212 | 
def l_resnet50(pretrained=False, **kwargs):
    """Construct a ResNet-50 (Bottleneck, stages of 3/4/6/3 blocks).

    Args:
        pretrained (bool): accepted for signature compatibility; this
            function does not use it, so weights are always freshly
            initialised.
        **kwargs: forwarded to ``ResNet``.
    """
    return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
220 | 
221 | 
def l_resnet101(pretrained=False, **kwargs):
    """Construct a ResNet-101 (Bottleneck, stages of 3/4/23/3 blocks).

    Args:
        pretrained (bool): accepted for signature compatibility; this
            function does not use it, so weights are always freshly
            initialised.
        **kwargs: forwarded to ``ResNet``.
    """
    return ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
229 | 
230 | 
def l_resnet152(pretrained=False, **kwargs):
    """Construct a ResNet-152 (Bottleneck, stages of 3/8/36/3 blocks).

    Args:
        pretrained (bool): accepted for signature compatibility; this
            function does not use it, so weights are always freshly
            initialised.
        **kwargs: forwarded to ``ResNet``.
    """
    return ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
238 | 


--------------------------------------------------------------------------------
/myoptims/AdaBelief.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | from torch.optim.optimizer import Optimizer
  4 | 
# True when the installed torch reports a version of at least "1.5.0".
# NOTE(review): the right-hand side is a string literal. If torch.__version__
# is a plain str this comparison is lexicographic (e.g. "1.10.0" < "1.5.0");
# confirm the installed torch exposes a version type that compares numerically.
version_higher = ( torch.__version__ >= "1.5.0" )
  6 | 
  7 | class AdaBelief(Optimizer):
  8 |     r"""Implements AdaBelief algorithm. Modified from Adam in PyTorch
  9 | 
 10 |     Arguments:
 11 |         params (iterable): iterable of parameters to optimize or dicts defining
 12 |             parameter groups
 13 |         lr (float, optional): learning rate (default: 1e-3)
 14 |         betas (Tuple[float, float], optional): coefficients used for computing
 15 |             running averages of gradient and its square (default: (0.9, 0.999))
 16 |         eps (float, optional): term added to the denominator to improve
 17 |             numerical stability (default: 1e-8)
 18 |         weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
 19 |         amsgrad (boolean, optional): whether to use the AMSGrad variant of this
 20 |             algorithm from the paper `On the Convergence of Adam and Beyond`_
 21 |             (default: False)
 22 |         weight_decouple (boolean, optional): ( default: False) If set as True, then
 23 |             the optimizer uses decoupled weight decay as in AdamW
 24 |         fixed_decay (boolean, optional): (default: False) This is used when weight_decouple
 25 |             is set as True.
 26 |             When fixed_decay == True, the weight decay is performed as
 27 |             $W_{new} = W_{old} - W_{old} \times decay$.
 28 |             When fixed_decay == False, the weight decay is performed as
 29 |             $W_{new} = W_{old} - W_{old} \times decay \times lr$. Note that in this case, the
 30 |             weight decay ratio decreases with learning rate (lr).
 31 |         rectify (boolean, optional): (default: False) If set as True, then perform the rectified
 32 |             update similar to RAdam
 33 | 
 34 |     reference: AdaBelief Optimizer, adapting stepsizes by the belief in observed gradients
 35 |                NeurIPS 2020 Spotlight
 36 |     """
 37 | 
 38 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
 39 |                  weight_decay=0, amsgrad=False, weight_decouple = False, fixed_decay=False, rectify = False ):
 40 |         if not 0.0 <= lr:
 41 |             raise ValueError("Invalid learning rate: {}".format(lr))
 42 |         if not 0.0 <= eps:
 43 |             raise ValueError("Invalid epsilon value: {}".format(eps))
 44 |         if not 0.0 <= betas[0] < 1.0:
 45 |             raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
 46 |         if not 0.0 <= betas[1] < 1.0:
 47 |             raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
 48 |         defaults = dict(lr=lr, betas=betas, eps=eps,
 49 |                         weight_decay=weight_decay, amsgrad=amsgrad)
 50 |         super(AdaBelief, self).__init__(params, defaults)
 51 | 
 52 |         self.weight_decouple = weight_decouple
 53 |         self.rectify = rectify
 54 |         self.fixed_decay = fixed_decay
 55 |         if self.weight_decouple:
 56 |             print('Weight decoupling enabled in AdaBelief')
 57 |             if self.fixed_decay:
 58 |                 print('Weight decay fixed')
 59 |         if self.rectify:
 60 |             print('Rectification enabled in AdaBelief')
 61 |         if amsgrad:
 62 |             print('AMS enabled in AdaBelief')
 63 |     def __setstate__(self, state):
 64 |         super(AdaBelief, self).__setstate__(state)
 65 |         for group in self.param_groups:
 66 |             group.setdefault('amsgrad', False)
 67 | 
 68 |     def reset(self):
 69 |         for group in self.param_groups:
 70 |             for p in group['params']:
 71 |                 state = self.state[p]
 72 |                 amsgrad = group['amsgrad']
 73 | 
 74 |                 # State initialization
 75 |                 state['step'] = 0
 76 |                 # Exponential moving average of gradient values
 77 |                 state['exp_avg'] = torch.zeros_like(p.data,
 78 |                                    memory_format=torch.preserve_format) if version_higher else torch.zeros_like(p.data)
 79 | 
 80 |                 # Exponential moving average of squared gradient values
 81 |                 state['exp_avg_var'] = torch.zeros_like(p.data,
 82 |                                     memory_format=torch.preserve_format) if version_higher else torch.zeros_like(p.data)
 83 |                 if amsgrad:
 84 |                     # Maintains max of all exp. moving avg. of sq. grad. values
 85 |                     state['max_exp_avg_var'] = torch.zeros_like(p.data,
 86 |                                     memory_format=torch.preserve_format) if version_higher else torch.zeros_like(p.data)
 87 | 
 88 |     def step(self, closure=None):
 89 |         """Performs a single optimization step.
 90 | 
 91 |         Arguments:
 92 |             closure (callable, optional): A closure that reevaluates the model
 93 |                 and returns the loss.
 94 |         """
 95 |         loss = None
 96 |         if closure is not None:
 97 |             loss = closure()
 98 | 
 99 |         for group in self.param_groups:
100 |             for p in group['params']:
101 |                 if p.grad is None:
102 |                     continue
103 |                 grad = p.grad.data
104 |                 if grad.is_sparse:
105 |                     raise RuntimeError('AdaBelief does not support sparse gradients, please consider SparseAdam instead')
106 |                 amsgrad = group['amsgrad']
107 | 
108 |                 state = self.state[p]
109 |                
110 |                 beta1, beta2 = group['betas']
111 | 
112 |                 # State initialization
113 |                 if len(state) == 0:
114 |                     state['rho_inf'] = 2.0 / (1.0 - beta2) - 1.0
115 |                     state['step'] = 0
116 |                     # Exponential moving average of gradient values
117 |                     state['exp_avg'] = torch.zeros_like(p.data,
118 |                                     memory_format=torch.preserve_format) if version_higher else torch.zeros_like(p.data)
119 |                     # Exponential moving average of squared gradient values
120 |                     state['exp_avg_var'] = torch.zeros_like(p.data,
121 |                                     memory_format=torch.preserve_format) if version_higher else torch.zeros_like(p.data)
122 |                     if amsgrad:
123 |                         # Maintains max of all exp. moving avg. of sq. grad. values
124 |                         state['max_exp_avg_var'] = torch.zeros_like(p.data,
125 |                                     memory_format=torch.preserve_format) if version_higher else torch.zeros_like(p.data)
126 | 
127 |                 # get current state variable
128 |                 exp_avg, exp_avg_var = state['exp_avg'], state['exp_avg_var']
129 | 
130 |                 state['step'] += 1
131 |                 bias_correction1 = 1 - beta1 ** state['step']
132 |                 bias_correction2 = 1 - beta2 ** state['step']
133 | 
134 |                 # perform weight decay, check if decoupled weight decay
135 |                 if self.weight_decouple:
136 |                     if not self.fixed_decay:
137 |                         p.data.mul_(1.0 - group['lr'] * group['weight_decay'])
138 |                     else:
139 |                         p.data.mul_(1.0 - group['weight_decay'])
140 |                 else:
141 |                     if group['weight_decay'] != 0:
142 |                         grad.add_(group['weight_decay'], p.data)
143 | 
144 |                 # Update first and second moment running average
145 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
146 |                 grad_residual = grad - exp_avg
147 |                 exp_avg_var.mul_(beta2).addcmul_(1 - beta2, grad_residual, grad_residual)
148 | 
149 |                 if amsgrad:
150 |                     max_exp_avg_var = state['max_exp_avg_var']
151 |                     # Maintains the maximum of all 2nd moment running avg. till now
152 |                     torch.max(max_exp_avg_var, exp_avg_var, out=max_exp_avg_var)
153 | 
154 |                     # Use the max. for normalizing running avg. of gradient
155 |                     denom = (max_exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
156 |                 else:
157 |                     denom = (exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
158 | 
159 |                 if not self.rectify:
160 |                     # Default update
161 |                     step_size = group['lr'] / bias_correction1
162 |                     p.data.addcdiv_(-step_size, exp_avg, denom)
163 | 
164 |                 else:# Rectified update
165 |                     # calculate rho_t
166 |                     state['rho_t'] = state['rho_inf'] - 2 * state['step'] * beta2 ** state['step'] / (
167 |                             1.0 - beta2 ** state['step'])
168 | 
169 |                     if state['rho_t'] > 4: # perform Adam style update if variance is small
170 |                         rho_inf, rho_t = state['rho_inf'], state['rho_t']
171 |                         rt = (rho_t - 4.0) * (rho_t - 2.0) * rho_inf / (rho_inf - 4.0) / (rho_inf - 2.0) / rho_t
172 |                         rt = math.sqrt(rt)
173 | 
174 |                         step_size = rt * group['lr'] / bias_correction1
175 | 
176 |                         p.data.addcdiv_(-step_size, exp_avg, denom)
177 | 
178 |                     else: # perform SGD style update
179 |                         p.data.add_( -group['lr'], exp_avg)
180 | 
181 |         return loss
182 | 
183 | 
184 | 


--------------------------------------------------------------------------------
/myoptims/Diffgrad.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | from torch.optim.optimizer import Optimizer
  4 | import numpy as np
  5 | import torch.nn as nn
  6 |      
  7 | 
  8 | class diffgrad(Optimizer):
  9 |     r"""Implements diffGrad algorithm. It is modified from the pytorch implementation of Adam.
 10 |     It has been proposed in `diffGrad: An Optimization Method for Convolutional Neural Networks`_.
 11 |     Arguments:
 12 |         params (iterable): iterable of parameters to optimize or dicts defining
 13 |             parameter groups
 14 |         lr (float, optional): learning rate (default: 1e-3)
 15 |         betas (Tuple[float, float], optional): coefficients used for computing
 16 |             running averages of gradient and its square (default: (0.9, 0.999))
 17 |         eps (float, optional): term added to the denominator to improve
 18 |             numerical stability (default: 1e-8)
 19 |         weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
 20 |         amsgrad (boolean, optional): whether to use the AMSGrad variant of this
 21 |             algorithm from the paper `On the Convergence of Adam and Beyond`_
 22 |             (default: False)
 23 |     .. _diffGrad: An Optimization Method for Convolutional Neural Networks:
 24 |         https://arxiv.org/abs/1909.11015
 25 |     .. _Adam\: A Method for Stochastic Optimization:
 26 |         https://arxiv.org/abs/1412.6980
 27 |     .. _On the Convergence of Adam and Beyond:
 28 |         https://openreview.net/forum?id=ryQu7f-RZ
 29 |     """
 30 | 
 31 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0
 32 |                  ,use_gc=False, gc_conv_only=False,gc_loc=False):
 33 |         if not 0.0 <= lr:
 34 |             raise ValueError("Invalid learning rate: {}".format(lr))
 35 |         if not 0.0 <= eps:
 36 |             raise ValueError("Invalid epsilon value: {}".format(eps))
 37 |         if not 0.0 <= betas[0] < 1.0:
 38 |             raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
 39 |         if not 0.0 <= betas[1] < 1.0:
 40 |             raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
 41 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
 42 |         super(diffgrad, self).__init__(params, defaults)
 43 |         self.gc_loc=gc_loc
 44 |         self.use_gc=use_gc
 45 |         self.gc_conv_only=gc_conv_only
 46 | 
 47 |     def __setstate__(self, state):
 48 |         super(diffgrad, self).__setstate__(state)
 49 | 
 50 |     def step(self, closure=None):
 51 |         """Performs a single optimization step.
 52 |         Arguments:
 53 |             closure (callable, optional): A closure that reevaluates the model
 54 |                 and returns the loss.
 55 |         """
 56 |         loss = None
 57 |         if closure is not None:
 58 |             loss = closure()
 59 | 
 60 |         for group in self.param_groups:
 61 |             for p in group['params']:
 62 |                 if p.grad is None:
 63 |                     continue
 64 |                 grad = p.grad.data
 65 |                 if grad.is_sparse:
 66 |                     raise RuntimeError('diffGrad does not support sparse gradients, please consider SparseAdam instead')
 67 | 
 68 |                 state = self.state[p]
 69 | 
 70 |                 # State initialization
 71 |                 if len(state) == 0:
 72 |                     state['step'] = 0
 73 |                     # Exponential moving average of gradient values
 74 |                     state['exp_avg'] = torch.zeros_like(p.data)
 75 |                     # Exponential moving average of squared gradient values
 76 |                     state['exp_avg_sq'] = torch.zeros_like(p.data)
 77 |                     # Previous gradient
 78 |                     state['previous_grad'] = torch.zeros_like(p.data)
 79 | 
 80 |                 exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad']
 81 |                 beta1, beta2 = group['betas']
 82 | 
 83 |                 state['step'] += 1
 84 | 
 85 |                 if group['weight_decay'] != 0:
 86 |                     grad.add_(group['weight_decay'], p.data)
 87 | 
 88 |                 # Decay the first and second moment running average coefficient
 89 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
 90 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
 91 |                 denom = exp_avg_sq.sqrt().add_(group['eps'])
 92 | 
 93 |                 bias_correction1 = 1 - beta1 ** state['step']
 94 |                 bias_correction2 = 1 - beta2 ** state['step']
 95 | 
 96 |                 # compute diffgrad coefficient (dfc)
 97 |                 diff = abs(previous_grad - grad)
 98 |                 dfc = 1. / (1. + torch.exp(-diff))
 99 |                 #state['previous_grad'] = grad %used in paper but has the bug that previous grad is overwritten with grad and diff becomes always zero. Fixed in the next line.
100 |                 state['previous_grad'] = grad.clone()
101 | 
102 | 				# update momentum with dfc
103 |                 exp_avg1 = exp_avg * dfc
104 | 
105 |                 step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
106 | 
107 |                 #GC operation
108 |                 G_grad=exp_avg/denom
109 | 
110 |                 p.data.add_( G_grad, alpha=-step_size)
111 | 
112 | 
113 |         return loss
114 | 


--------------------------------------------------------------------------------
/myoptims/cosangulargrad.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | from torch.optim.optimizer import Optimizer
  4 | import numpy as np
  5 | import torch.nn as nn
  6 | 
  7 | 
  8 | class cosangulargrad(Optimizer):
  9 | 
 10 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
 11 |         if not 0.0 <= lr:
 12 |             raise ValueError("Invalid learning rate: {}".format(lr))
 13 |         if not 0.0 <= eps:
 14 |             raise ValueError("Invalid epsilon value: {}".format(eps))
 15 |         if not 0.0 <= betas[0] < 1.0:
 16 |             raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
 17 |         if not 0.0 <= betas[1] < 1.0:
 18 |             raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
 19 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
 20 |         super(cosangulargrad, self).__init__(params, defaults)
 21 | 
 22 |     def __setstate__(self, state):
 23 |         super(cosangulargrad, self).__setstate__(state)
 24 | 
 25 |     def step(self, closure=None):
 26 |         """Performs a single optimization step.
 27 |         Arguments:
 28 |             closure (callable, optional): A closure that reevaluates the model
 29 |                 and returns the loss.
 30 |         """
 31 |         loss = None
 32 |         if closure is not None:
 33 |             loss = closure()
 34 | 
 35 |         for group in self.param_groups:
 36 |             for p in group['params']:
 37 |                 if p.grad is None:
 38 |                     continue
 39 |                 grad = p.grad.data
 40 |                 if grad.is_sparse:
 41 |                     raise RuntimeError(
 42 |                         'cosangulargrad does not support sparse gradients, please consider SparseAdam instead')
 43 | 
 44 |                 state = self.state[p]
 45 | 
 46 |                 # State initialization
 47 |                 if len(state) == 0:
 48 |                     state['step'] = 0
 49 |                     # Exponential moving average of gradient values
 50 |                     state['exp_avg'] = torch.zeros_like(p.data)
 51 |                     # Exponential moving average of squared gradient values
 52 |                     state['exp_avg_sq'] = torch.zeros_like(p.data)
 53 |                     # Previous gradient
 54 |                     state['previous_grad'] = torch.zeros_like(p.data)
 55 |                     # temporary minimum value for comparison
 56 |                     state['min'] = torch.zeros_like(p.data)
 57 |                     # temporary difference between gradients for comparison
 58 |                     state['diff'] = torch.zeros_like(p.data)
 59 |                     # final cos value to be used
 60 |                     state['final_cos_theta'] = torch.zeros_like(p.data)
 61 | 
 62 |                 exp_avg, exp_avg_sq, previous_grad, min, diff, final_cos_theta = state['exp_avg'], state['exp_avg_sq'], \
 63 |                                                                                  state['previous_grad'], state['min'], \
 64 |                                                                                  state['diff'], state['final_cos_theta']
 65 |                 beta1, beta2 = group['betas']
 66 | 
 67 |                 state['step'] += 1
 68 | 
 69 |                 if group['weight_decay'] != 0:
 70 |                     grad.add_(group['weight_decay'], p.data)
 71 | 
 72 |                 # Decay the first and second moment running average coefficient
 73 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
 74 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
 75 |                 denom = exp_avg_sq.sqrt().add_(group['eps'])
 76 | 
 77 |                 bias_correction1 = 1 - beta1 ** state['step']
 78 |                 bias_correction2 = 1 - beta2 ** state['step']
 79 | 
 80 |                 tan_theta = abs((previous_grad - grad) / (1 + previous_grad * grad))
 81 |                 cos_theta = 1 / torch.sqrt(1 + torch.square(tan_theta))
 82 | 
 83 |                 angle = torch.atan(tan_theta) * (180 / 3.141592653589793238)
 84 |                 ans = torch.gt(angle, min)
 85 |                 ans1, count = torch.unique(ans, return_counts=True)
 86 | 
 87 |                 try:
 88 |                     if (count[1] < count[0]):
 89 |                         min = angle
 90 |                         diff = abs(previous_grad - grad)
 91 |                         final_cos_theta = cos_theta.clone()
 92 |                 except:
 93 |                     if (ans1[0].item() == False):
 94 |                         min = angle
 95 |                         diff = abs(previous_grad - grad)
 96 |                         final_cos_theta = cos_theta.clone()
 97 | 
 98 |                 angular_coeff = torch.tanh(abs(final_cos_theta)) * 0.5 +0.5     # Calculating Angular coefficient
 99 | 
100 |                 state['previous_grad'] = grad.clone()
101 |                 state['min'] = min.clone()
102 |                 state['diff'] = diff.clone()
103 |                 state['final_cos_theta'] = final_cos_theta.clone()
104 | 
105 |                 # update momentum with angular_coeff
106 |                 exp_avg1 = exp_avg * angular_coeff
107 | 
108 |                 step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
109 | 
110 |                 p.data.addcdiv_(-step_size, exp_avg1, denom)
111 | 
112 |         return loss
113 | 


--------------------------------------------------------------------------------
/myoptims/tanangulargrad.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | from torch.optim.optimizer import Optimizer
  4 | import numpy as np
  5 | import torch.nn as nn
  6 | 
  7 | 
  8 | class tanangulargrad(Optimizer):
  9 | 
 10 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
 11 |         if not 0.0 <= lr:
 12 |             raise ValueError("Invalid learning rate: {}".format(lr))
 13 |         if not 0.0 <= eps:
 14 |             raise ValueError("Invalid epsilon value: {}".format(eps))
 15 |         if not 0.0 <= betas[0] < 1.0:
 16 |             raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
 17 |         if not 0.0 <= betas[1] < 1.0:
 18 |             raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
 19 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
 20 |         super(tanangulargrad, self).__init__(params, defaults)
 21 | 
 22 |     def __setstate__(self, state):
 23 |         super(tanangulargrad, self).__setstate__(state)
 24 | 
 25 |     def step(self, closure=None):
 26 |         """Performs a single optimization step.
 27 |         Arguments:
 28 |             closure (callable, optional): A closure that reevaluates the model
 29 |                 and returns the loss.
 30 |         """
 31 |         loss = None
 32 |         if closure is not None:
 33 |             loss = closure()
 34 | 
 35 |         for group in self.param_groups:
 36 |             for p in group['params']:
 37 |                 if p.grad is None:
 38 |                     continue
 39 |                 grad = p.grad.data
 40 |                 if grad.is_sparse:
 41 |                     raise RuntimeError(
 42 |                         'tanangulargrad does not support sparse gradients, please consider SparseAdam instead')
 43 | 
 44 |                 state = self.state[p]
 45 | 
 46 |                 # State initialization
 47 |                 if len(state) == 0:
 48 |                     state['step'] = 0
 49 |                     # Exponential moving average of gradient values
 50 |                     state['exp_avg'] = torch.zeros_like(p.data)
 51 |                     # Exponential moving average of squared gradient values
 52 |                     state['exp_avg_sq'] = torch.zeros_like(p.data)
 53 |                     # Previous gradient
 54 |                     state['previous_grad'] = torch.zeros_like(p.data)
 55 |                     # temporary minimum value for comparison
 56 |                     state['min'] = torch.zeros_like(p.data)
 57 |                     # temporary difference between gradients for comparison
 58 |                     state['diff'] = torch.zeros_like(p.data)
 59 |                     # final tan value to be used
 60 |                     state['final_tan_theta'] = torch.zeros_like(p.data)
 61 | 
 62 |                 exp_avg, exp_avg_sq, previous_grad, min, diff, final_tan_theta = state['exp_avg'], state['exp_avg_sq'], \
 63 |                                                                                  state['previous_grad'], state['min'], \
 64 |                                                                                  state['diff'], state['final_tan_theta']
 65 |                 beta1, beta2 = group['betas']
 66 | 
 67 |                 state['step'] += 1
 68 | 
 69 |                 if group['weight_decay'] != 0:
 70 |                     grad.add_(group['weight_decay'], p.data)
 71 | 
 72 |                 # Decay the first and second moment running average coefficient
 73 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
 74 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
 75 |                 denom = exp_avg_sq.sqrt().add_(group['eps'])
 76 | 
 77 |                 bias_correction1 = 1 - beta1 ** state['step']
 78 |                 bias_correction2 = 1 - beta2 ** state['step']
 79 | 
 80 |                 tan_theta = abs((previous_grad - grad) / (1 + previous_grad * grad))
 81 | 
 82 |                 angle = torch.atan(tan_theta) * (180 / 3.141592653589793238)
 83 |                 ans = torch.gt(angle, min)
 84 |                 ans1, count = torch.unique(ans, return_counts=True)
 85 | 
 86 |                 try:
 87 |                     if (count[1] < count[0]):
 88 |                         min = angle
 89 |                         diff = abs(previous_grad - grad)
 90 |                         final_tan_theta = tan_theta.clone()
 91 |                 except:
 92 |                     if (ans1[0].item() == False):
 93 |                         min = angle
 94 |                         diff = abs(previous_grad - grad)
 95 |                         final_tan_theta = tan_theta.clone()
 96 | 
 97 |                 angular_coeff = torch.tanh(abs(final_tan_theta)) * 0.5 +0.5    # Calculating Angular coefficient
 98 | 
 99 |                 state['previous_grad'] = grad.clone()
100 |                 state['min'] = min.clone()
101 |                 state['diff'] = diff.clone()
102 |                 state['final_tan_theta'] = final_tan_theta.clone()
103 | 
104 |                 # update momentum with angular_coeff
105 |                 exp_avg1 = exp_avg * angular_coeff
106 | 
107 |                 step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
108 | 
109 |                 p.data.addcdiv_(-step_size, exp_avg1, denom)
110 | 
111 |         return loss
112 | 


--------------------------------------------------------------------------------