├── LICENSE
├── LICENSE-pytorch-cifar
├── README.md
├── models
│   ├── __init__.py
│   ├── alldnet.py
│   ├── densenet.py
│   ├── densenet3.py
│   ├── densenet_efficient_multi_gpu.py
│   ├── googlenet.py
│   ├── lenet.py
│   ├── mobilenet.py
│   ├── resnet.py
│   ├── resnext.py
│   └── vgg.py
├── train.py
└── utils.py
/LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable.
More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | Section 1 -- Definitions. 71 | 72 | a. Adapted Material means material subject to Copyright and Similar 73 | Rights that is derived from or based upon the Licensed Material 74 | and in which the Licensed Material is translated, altered, 75 | arranged, transformed, or otherwise modified in a manner requiring 76 | permission under the Copyright and Similar Rights held by the 77 | Licensor. For purposes of this Public License, where the Licensed 78 | Material is a musical work, performance, or sound recording, 79 | Adapted Material is always produced where the Licensed Material is 80 | synched in timed relation with a moving image. 81 | 82 | b. Adapter's License means the license You apply to Your Copyright 83 | and Similar Rights in Your contributions to Adapted Material in 84 | accordance with the terms and conditions of this Public License. 85 | 86 | c. Copyright and Similar Rights means copyright and/or similar rights 87 | closely related to copyright including, without limitation, 88 | performance, broadcast, sound recording, and Sui Generis Database 89 | Rights, without regard to how the rights are labeled or 90 | categorized. For purposes of this Public License, the rights 91 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 92 | Rights. 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. NonCommercial means not primarily intended for or directed towards 116 | commercial advantage or monetary compensation. 
For purposes of 117 | this Public License, the exchange of the Licensed Material for 118 | other material subject to Copyright and Similar Rights by digital 119 | file-sharing or similar means is NonCommercial provided there is 120 | no payment of monetary compensation in connection with the 121 | exchange. 122 | 123 | j. Share means to provide material to the public by any means or 124 | process that requires permission under the Licensed Rights, such 125 | as reproduction, public display, public performance, distribution, 126 | dissemination, communication, or importation, and to make material 127 | available to the public including in ways that members of the 128 | public may access the material from a place and at a time 129 | individually chosen by them. 130 | 131 | k. Sui Generis Database Rights means rights other than copyright 132 | resulting from Directive 96/9/EC of the European Parliament and of 133 | the Council of 11 March 1996 on the legal protection of databases, 134 | as amended and/or succeeded, as well as other essentially 135 | equivalent rights anywhere in the world. 136 | 137 | l. You means the individual or entity exercising the Licensed Rights 138 | under this Public License. Your has a corresponding meaning. 139 | 140 | Section 2 -- Scope. 141 | 142 | a. License grant. 143 | 144 | 1. Subject to the terms and conditions of this Public License, 145 | the Licensor hereby grants You a worldwide, royalty-free, 146 | non-sublicensable, non-exclusive, irrevocable license to 147 | exercise the Licensed Rights in the Licensed Material to: 148 | 149 | a. reproduce and Share the Licensed Material, in whole or 150 | in part, for NonCommercial purposes only; and 151 | 152 | b. produce, reproduce, and Share Adapted Material for 153 | NonCommercial purposes only. 154 | 155 | 2. Exceptions and Limitations. For the avoidance of doubt, where 156 | Exceptions and Limitations apply to Your use, this Public 157 | License does not apply, and You do not need to comply with 158 | its terms and conditions. 159 | 160 | 3. Term. The term of this Public License is specified in Section 161 | 6(a). 162 | 163 | 4. Media and formats; technical modifications allowed. The 164 | Licensor authorizes You to exercise the Licensed Rights in 165 | all media and formats whether now known or hereafter created, 166 | and to make technical modifications necessary to do so. The 167 | Licensor waives and/or agrees not to assert any right or 168 | authority to forbid You from making technical modifications 169 | necessary to exercise the Licensed Rights, including 170 | technical modifications necessary to circumvent Effective 171 | Technological Measures. For purposes of this Public License, 172 | simply making modifications authorized by this Section 2(a) 173 | (4) never produces Adapted Material. 174 | 175 | 5. Downstream recipients. 176 | 177 | a. Offer from the Licensor -- Licensed Material. Every 178 | recipient of the Licensed Material automatically 179 | receives an offer from the Licensor to exercise the 180 | Licensed Rights under the terms and conditions of this 181 | Public License. 182 | 183 | b. No downstream restrictions. You may not offer or impose 184 | any additional or different terms or conditions on, or 185 | apply any Effective Technological Measures to, the 186 | Licensed Material if doing so restricts exercise of the 187 | Licensed Rights by any recipient of the Licensed 188 | Material. 189 | 190 | 6. No endorsement. 
Nothing in this Public License constitutes or 191 | may be construed as permission to assert or imply that You 192 | are, or that Your use of the Licensed Material is, connected 193 | with, or sponsored, endorsed, or granted official status by, 194 | the Licensor or others designated to receive attribution as 195 | provided in Section 3(a)(1)(A)(i). 196 | 197 | b. Other rights. 198 | 199 | 1. Moral rights, such as the right of integrity, are not 200 | licensed under this Public License, nor are publicity, 201 | privacy, and/or other similar personality rights; however, to 202 | the extent possible, the Licensor waives and/or agrees not to 203 | assert any such rights held by the Licensor to the limited 204 | extent necessary to allow You to exercise the Licensed 205 | Rights, but not otherwise. 206 | 207 | 2. Patent and trademark rights are not licensed under this 208 | Public License. 209 | 210 | 3. To the extent possible, the Licensor waives any right to 211 | collect royalties from You for the exercise of the Licensed 212 | Rights, whether directly or through a collecting society 213 | under any voluntary or waivable statutory or compulsory 214 | licensing scheme. In all other cases the Licensor expressly 215 | reserves any right to collect such royalties, including when 216 | the Licensed Material is used other than for NonCommercial 217 | purposes. 218 | 219 | Section 3 -- License Conditions. 220 | 221 | Your exercise of the Licensed Rights is expressly made subject to the 222 | following conditions. 223 | 224 | a. Attribution. 225 | 226 | 1. If You Share the Licensed Material (including in modified 227 | form), You must: 228 | 229 | a. retain the following if it is supplied by the Licensor 230 | with the Licensed Material: 231 | 232 | i. identification of the creator(s) of the Licensed 233 | Material and any others designated to receive 234 | attribution, in any reasonable manner requested by 235 | the Licensor (including by pseudonym if 236 | designated); 237 | 238 | ii. a copyright notice; 239 | 240 | iii. a notice that refers to this Public License; 241 | 242 | iv. a notice that refers to the disclaimer of 243 | warranties; 244 | 245 | v. a URI or hyperlink to the Licensed Material to the 246 | extent reasonably practicable; 247 | 248 | b. indicate if You modified the Licensed Material and 249 | retain an indication of any previous modifications; and 250 | 251 | c. indicate the Licensed Material is licensed under this 252 | Public License, and include the text of, or the URI or 253 | hyperlink to, this Public License. 254 | 255 | 2. You may satisfy the conditions in Section 3(a)(1) in any 256 | reasonable manner based on the medium, means, and context in 257 | which You Share the Licensed Material. For example, it may be 258 | reasonable to satisfy the conditions by providing a URI or 259 | hyperlink to a resource that includes the required 260 | information. 261 | 262 | 3. If requested by the Licensor, You must remove any of the 263 | information required by Section 3(a)(1)(A) to the extent 264 | reasonably practicable. 265 | 266 | 4. If You Share Adapted Material You produce, the Adapter's 267 | License You apply must not prevent recipients of the Adapted 268 | Material from complying with this Public License. 269 | 270 | Section 4 -- Sui Generis Database Rights. 271 | 272 | Where the Licensed Rights include Sui Generis Database Rights that 273 | apply to Your use of the Licensed Material: 274 | 275 | a. 
for the avoidance of doubt, Section 2(a)(1) grants You the right 276 | to extract, reuse, reproduce, and Share all or a substantial 277 | portion of the contents of the database for NonCommercial purposes 278 | only; 279 | 280 | b. if You include all or a substantial portion of the database 281 | contents in a database in which You have Sui Generis Database 282 | Rights, then the database in which You have Sui Generis Database 283 | Rights (but not its individual contents) is Adapted Material; and 284 | 285 | c. You must comply with the conditions in Section 3(a) if You Share 286 | all or a substantial portion of the contents of the database. 287 | 288 | For the avoidance of doubt, this Section 4 supplements and does not 289 | replace Your obligations under this Public License where the Licensed 290 | Rights include other Copyright and Similar Rights. 291 | 292 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 293 | 294 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 295 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 296 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 297 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 298 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 299 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 300 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 301 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 302 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 303 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 304 | 305 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 306 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 307 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 308 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 309 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 310 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 311 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 312 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 313 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 314 | 315 | c. The disclaimer of warranties and limitation of liability provided 316 | above shall be interpreted in a manner that, to the extent 317 | possible, most closely approximates an absolute disclaimer and 318 | waiver of all liability. 319 | 320 | Section 6 -- Term and Termination. 321 | 322 | a. This Public License applies for the term of the Copyright and 323 | Similar Rights licensed here. However, if You fail to comply with 324 | this Public License, then Your rights under this Public License 325 | terminate automatically. 326 | 327 | b. Where Your right to use the Licensed Material has terminated under 328 | Section 6(a), it reinstates: 329 | 330 | 1. automatically as of the date the violation is cured, provided 331 | it is cured within 30 days of Your discovery of the 332 | violation; or 333 | 334 | 2. upon express reinstatement by the Licensor. 335 | 336 | For the avoidance of doubt, this Section 6(b) does not affect any 337 | right the Licensor may have to seek remedies for Your violations 338 | of this Public License. 339 | 340 | c. 
For the avoidance of doubt, the Licensor may also offer the 341 | Licensed Material under separate terms or conditions or stop 342 | distributing the Licensed Material at any time; however, doing so 343 | will not terminate this Public License. 344 | 345 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 346 | License. 347 | 348 | Section 7 -- Other Terms and Conditions. 349 | 350 | a. The Licensor shall not be bound by any additional or different 351 | terms or conditions communicated by You unless expressly agreed. 352 | 353 | b. Any arrangements, understandings, or agreements regarding the 354 | Licensed Material not stated herein are separate from and 355 | independent of the terms and conditions of this Public License. 356 | 357 | Section 8 -- Interpretation. 358 | 359 | a. For the avoidance of doubt, this Public License does not, and 360 | shall not be interpreted to, reduce, limit, restrict, or impose 361 | conditions on any use of the Licensed Material that could lawfully 362 | be made without permission under this Public License. 363 | 364 | b. To the extent possible, if any provision of this Public License is 365 | deemed unenforceable, it shall be automatically reformed to the 366 | minimum extent necessary to make it enforceable. If the provision 367 | cannot be reformed, it shall be severed from this Public License 368 | without affecting the enforceability of the remaining terms and 369 | conditions. 370 | 371 | c. No term or condition of this Public License will be waived and no 372 | failure to comply consented to unless expressly agreed to by the 373 | Licensor. 374 | 375 | d. Nothing in this Public License constitutes or may be interpreted 376 | as a limitation upon, or waiver of, any privileges and immunities 377 | that apply to the Licensor or You, including from the legal 378 | processes of any jurisdiction or authority. 379 | 380 | ======================================================================= 381 | 382 | Creative Commons is not a party to its public 383 | licenses. Notwithstanding, Creative Commons may elect to apply one of 384 | its public licenses to material it publishes and in those instances 385 | will be considered the “Licensor.” The text of the Creative Commons 386 | public licenses is dedicated to the public domain under the CC0 Public 387 | Domain Dedication. Except for the limited purpose of indicating that 388 | material is shared under a Creative Commons public license or as 389 | otherwise permitted by the Creative Commons policies published at 390 | creativecommons.org/policies, Creative Commons does not authorize the 391 | use of the trademark "Creative Commons" or any other trademark or logo 392 | of Creative Commons without its prior written consent including, 393 | without limitation, in connection with any unauthorized modifications 394 | to any of its public licenses or any other arrangements, 395 | understandings, or agreements concerning use of licensed material. For 396 | the avoidance of doubt, this paragraph does not form part of the 397 | public licenses. 398 | 399 | Creative Commons may be contacted at creativecommons.org. 
400 | -------------------------------------------------------------------------------- /LICENSE-pytorch-cifar: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 liukuang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mixup-CIFAR10 2 | By [Hongyi Zhang](http://web.mit.edu/~hongyiz/www/), [Moustapha Cisse](https://mine.kaust.edu.sa/Pages/cisse.aspx), [Yann Dauphin](http://dauphin.io/), [David Lopez-Paz](https://lopezpaz.org/). 3 | 4 | Facebook AI Research 5 | 6 | ## Introduction 7 | 8 | Mixup is a generic and straightforward data augmentation principle. 9 | In essence, mixup trains a neural network on convex combinations of pairs of 10 | examples and their labels. By doing so, mixup regularizes the neural network to 11 | favor simple linear behavior in-between training examples. 12 | 13 | This repository contains the implementation used for the results in 14 | our paper (https://arxiv.org/abs/1710.09412). 15 | 16 | ## Citation 17 | 18 | If you use this method or this code in your paper, then please cite it: 19 | 20 | ``` 21 | @article{ 22 | zhang2018mixup, 23 | title={mixup: Beyond Empirical Risk Minimization}, 24 | author={Hongyi Zhang and Moustapha Cisse and Yann N. Dauphin and David Lopez-Paz}, 25 | journal={International Conference on Learning Representations}, 26 | year={2018}, 27 | url={https://openreview.net/forum?id=r1Ddp1-Rb}, 28 | } 29 | ``` 30 | 31 | ## Requirements and Installation 32 | * A computer running macOS or Linux 33 | * For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl) 34 | * Python version 3.6 35 | * A [PyTorch installation](http://pytorch.org/) 36 | 37 | ## Training 38 | Use `python train.py` to train a new model. 39 | Here is an example setting: 40 | ``` 41 | $ CUDA_VISIBLE_DEVICES=0 python train.py --lr=0.1 --seed=20170922 --decay=1e-4 42 | ``` 43 | 44 | ## License 45 | 46 | This project is CC-BY-NC-licensed. 47 | 48 | ## Acknowledgement 49 | The CIFAR-10 reimplementation of _mixup_ is adapted from the [pytorch-cifar](https://github.com/kuangliu/pytorch-cifar) repository by [kuangliu](https://github.com/kuangliu).
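The convex-combination recipe described in the Introduction is small enough to sketch. The snippet below is a minimal illustration only, not the repository's exact code (`train.py` holds the real implementation; the helper names here are illustrative):

```
import numpy as np
import torch

def mixup_data(x, y, alpha=1.0):
    """Mix a batch with a shuffled copy of itself; return both label sets and lambda."""
    # lambda ~ Beta(alpha, alpha); with alpha=1.0 this is uniform on [0, 1]
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    index = torch.randperm(x.size(0))          # random pairing within the batch
    mixed_x = lam * x + (1 - lam) * x[index]   # convex combination of inputs
    return mixed_x, y, y[index], lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    # The loss mirrors the same convex combination on the label side.
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)
```

Each batch is mixed before the forward pass and the two losses are blended with the same `lam`; this per-batch blending is what pushes the network toward linear behavior in-between training examples.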
50 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .vgg import * 2 | from .lenet import * 3 | from .resnet import * 4 | from .resnext import * 5 | from .densenet import * 6 | from .googlenet import * 7 | from .mobilenet import * 8 | # from .densenet_efficient_multi_gpu import DenseNet190 9 | from .densenet3 import DenseNet190 10 | -------------------------------------------------------------------------------- /models/alldnet.py: -------------------------------------------------------------------------------- 1 | '''AllDNet in PyTorch.''' 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | class AllDNet(nn.Module): 7 | def __init__(self): 8 | super(AllDNet, self).__init__() 9 | self.conv1 = nn.Conv2d(3, 6, 5) 10 | self.conv2 = nn.Conv2d(6, 16, 5) 11 | # self.conv2 = nn.Linear(6*14*14, 16*10*10) 12 | self.fc1 = nn.Linear(16*5*5, 120) 13 | self.fc2 = nn.Linear(120, 84) 14 | self.fc3 = nn.Linear(84, 10) 15 | 16 | def forward(self, x): 17 | activations = [] 18 | out = F.relu(self.conv1(x)) 19 | out = F.max_pool2d(out, 2) 20 | # out = out.view(out.size(0), -1) 21 | # activations.append(out) 22 | out = F.relu(self.conv2(out)) 23 | # out = out.view(out.size(0), 16, 10, -1) 24 | out = F.max_pool2d(out, 2) 25 | out = out.view(out.size(0), -1) 26 | activations.append(out) 27 | out = F.relu(self.fc1(out)) 28 | activations.append(out) 29 | out = F.relu(self.fc2(out)) 30 | activations.append(out) 31 | out = self.fc3(out) 32 | return out, activations 33 | 34 | -------------------------------------------------------------------------------- /models/densenet.py: -------------------------------------------------------------------------------- 1 | '''DenseNet in PyTorch.''' 2 | import math 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from torch.autograd import Variable 9 | 10 | 11 | class Bottleneck(nn.Module): 12 | def __init__(self, in_planes, growth_rate): 13 | super(Bottleneck, self).__init__() 14 | self.bn1 = nn.BatchNorm2d(in_planes) 15 | self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False) 16 | self.bn2 = nn.BatchNorm2d(4*growth_rate) 17 | self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False) 18 | 19 | def forward(self, x): 20 | out = self.conv1(F.relu(self.bn1(x))) 21 | out = self.conv2(F.relu(self.bn2(out))) 22 | out = torch.cat([out,x], 1) 23 | return out 24 | 25 | 26 | class Transition(nn.Module): 27 | def __init__(self, in_planes, out_planes): 28 | super(Transition, self).__init__() 29 | self.bn = nn.BatchNorm2d(in_planes) 30 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False) 31 | 32 | def forward(self, x): 33 | out = self.conv(F.relu(self.bn(x))) 34 | out = F.avg_pool2d(out, 2) 35 | return out 36 | 37 | 38 | class DenseNet(nn.Module): 39 | def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10): 40 | super(DenseNet, self).__init__() 41 | self.growth_rate = growth_rate 42 | 43 | num_planes = 2*growth_rate 44 | self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False) 45 | 46 | self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0]) 47 | num_planes += nblocks[0]*growth_rate 48 | out_planes = int(math.floor(num_planes*reduction)) 49 | self.trans1 = Transition(num_planes, out_planes) 50 | num_planes = out_planes 51 | 52
| self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1]) 53 | num_planes += nblocks[1]*growth_rate 54 | out_planes = int(math.floor(num_planes*reduction)) 55 | self.trans2 = Transition(num_planes, out_planes) 56 | num_planes = out_planes 57 | 58 | self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2]) 59 | num_planes += nblocks[2]*growth_rate 60 | out_planes = int(math.floor(num_planes*reduction)) 61 | self.trans3 = Transition(num_planes, out_planes) 62 | num_planes = out_planes 63 | 64 | self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3]) 65 | num_planes += nblocks[3]*growth_rate 66 | 67 | self.bn = nn.BatchNorm2d(num_planes) 68 | self.linear = nn.Linear(num_planes, num_classes) 69 | 70 | def _make_dense_layers(self, block, in_planes, nblock): 71 | layers = [] 72 | for i in range(nblock): 73 | layers.append(block(in_planes, self.growth_rate)) 74 | in_planes += self.growth_rate 75 | return nn.Sequential(*layers) 76 | 77 | def forward(self, x): 78 | out = self.conv1(x) 79 | out = self.trans1(self.dense1(out)) 80 | out = self.trans2(self.dense2(out)) 81 | out = self.trans3(self.dense3(out)) 82 | out = self.dense4(out) 83 | out = F.avg_pool2d(F.relu(self.bn(out)), 4) 84 | out = out.view(out.size(0), -1) 85 | out = self.linear(out) 86 | return out 87 | 88 | def DenseNet121(): 89 | return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32) 90 | 91 | def DenseNet169(): 92 | return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32) 93 | 94 | def DenseNet201(): 95 | return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32) 96 | 97 | def DenseNet161(): 98 | return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48) 99 | 100 | def densenet_cifar(): 101 | return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12) 102 | 103 | def test_densenet(): 104 | net = densenet_cifar() 105 | x = torch.randn(1,3,32,32) 106 | y = net(Variable(x)) 107 | print(y) 108 | 109 | # test_densenet() 110 | -------------------------------------------------------------------------------- /models/densenet3.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class BasicBlock(nn.Module): 8 | def __init__(self, in_planes, out_planes, dropRate=0.0): 9 | super(BasicBlock, self).__init__() 10 | self.bn1 = nn.BatchNorm2d(in_planes) 11 | self.relu = nn.ReLU(inplace=True) 12 | self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=1, 13 | padding=1, bias=False) 14 | self.droprate = dropRate 15 | def forward(self, x): 16 | out = self.conv1(self.relu(self.bn1(x))) 17 | if self.droprate > 0: 18 | out = F.dropout(out, p=self.droprate, training=self.training) 19 | return torch.cat([x, out], 1) 20 | 21 | class BottleneckBlock(nn.Module): 22 | def __init__(self, in_planes, out_planes, dropRate=0.0): 23 | super(BottleneckBlock, self).__init__() 24 | inter_planes = out_planes * 4 25 | self.bn1 = nn.BatchNorm2d(in_planes) 26 | self.relu = nn.ReLU(inplace=True) 27 | self.conv1 = nn.Conv2d(in_planes, inter_planes, kernel_size=1, stride=1, 28 | padding=0, bias=False) 29 | self.bn2 = nn.BatchNorm2d(inter_planes) 30 | self.conv2 = nn.Conv2d(inter_planes, out_planes, kernel_size=3, stride=1, 31 | padding=1, bias=False) 32 | self.droprate = dropRate 33 | def forward(self, x): 34 | out = self.conv1(self.relu(self.bn1(x))) 35 | if self.droprate > 0: 36 | out = F.dropout(out, p=self.droprate, inplace=False, training=self.training) 37 | out = 
self.conv2(self.relu(self.bn2(out))) 38 | if self.droprate > 0: 39 | out = F.dropout(out, p=self.droprate, inplace=False, training=self.training) 40 | return torch.cat([x, out], 1) 41 | 42 | class TransitionBlock(nn.Module): 43 | def __init__(self, in_planes, out_planes, dropRate=0.0): 44 | super(TransitionBlock, self).__init__() 45 | self.bn1 = nn.BatchNorm2d(in_planes) 46 | self.relu = nn.ReLU(inplace=True) 47 | self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, 48 | padding=0, bias=False) 49 | self.droprate = dropRate 50 | def forward(self, x): 51 | out = self.conv1(self.relu(self.bn1(x))) 52 | if self.droprate > 0: 53 | out = F.dropout(out, p=self.droprate, inplace=False, training=self.training) 54 | return F.avg_pool2d(out, 2) 55 | 56 | class DenseBlock(nn.Module): 57 | def __init__(self, nb_layers, in_planes, growth_rate, block, dropRate=0.0): 58 | super(DenseBlock, self).__init__() 59 | self.layer = self._make_layer(block, in_planes, growth_rate, nb_layers, dropRate) 60 | def _make_layer(self, block, in_planes, growth_rate, nb_layers, dropRate): 61 | layers = [] 62 | for i in range(nb_layers): 63 | layers.append(block(in_planes+i*growth_rate, growth_rate, dropRate)) 64 | return nn.Sequential(*layers) 65 | def forward(self, x): 66 | return self.layer(x) 67 | 68 | class DenseNet3(nn.Module): 69 | def __init__(self, depth, num_classes, growth_rate=12, 70 | reduction=0.5, bottleneck=True, dropRate=0.0): 71 | super(DenseNet3, self).__init__() 72 | in_planes = 2 * growth_rate 73 | n = (depth - 4) // 3 74 | if bottleneck == True: 75 | n = n//2 76 | block = BottleneckBlock 77 | else: 78 | block = BasicBlock 79 | # 1st conv before any dense block 80 | self.conv1 = nn.Conv2d(3, in_planes, kernel_size=3, stride=1, 81 | padding=1, bias=False) 82 | # 1st block 83 | self.block1 = DenseBlock(n, in_planes, growth_rate, block, dropRate) 84 | in_planes = int(in_planes+n*growth_rate) 85 | self.trans1 = TransitionBlock(in_planes, int(math.floor(in_planes*reduction)), dropRate=dropRate) 86 | in_planes = int(math.floor(in_planes*reduction)) 87 | # 2nd block 88 | self.block2 = DenseBlock(n, in_planes, growth_rate, block, dropRate) 89 | in_planes = int(in_planes+n*growth_rate) 90 | self.trans2 = TransitionBlock(in_planes, int(math.floor(in_planes*reduction)), dropRate=dropRate) 91 | in_planes = int(math.floor(in_planes*reduction)) 92 | # 3rd block 93 | self.block3 = DenseBlock(n, in_planes, growth_rate, block, dropRate) 94 | in_planes = int(in_planes+n*growth_rate) 95 | # global average pooling and classifier 96 | self.bn1 = nn.BatchNorm2d(in_planes) 97 | self.relu = nn.ReLU(inplace=True) 98 | self.fc = nn.Linear(in_planes, num_classes) 99 | self.in_planes = in_planes 100 | 101 | for m in self.modules(): 102 | if isinstance(m, nn.Conv2d): 103 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 104 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 105 | elif isinstance(m, nn.BatchNorm2d): 106 | m.weight.data.fill_(1) 107 | m.bias.data.zero_() 108 | elif isinstance(m, nn.Linear): 109 | m.bias.data.zero_() 110 | def forward(self, x): 111 | out = self.conv1(x) 112 | out = self.trans1(self.block1(out)) 113 | out = self.trans2(self.block2(out)) 114 | out = self.block3(out) 115 | out = self.relu(self.bn1(out)) 116 | out = F.avg_pool2d(out, 8) 117 | out = out.view(-1, self.in_planes) 118 | return self.fc(out) 119 | 120 | def DenseNet190(): 121 | return DenseNet3(190, 10, growth_rate=40) 122 | -------------------------------------------------------------------------------- /models/densenet_efficient_multi_gpu.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import math 3 | import torch 4 | from torch.autograd import Function, Variable 5 | from torch.nn import Parameter, Module 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.backends.cudnn as cudnn 9 | from collections import OrderedDict 10 | from operator import mul 11 | from functools import reduce 12 | 13 | 14 | def create_multi_gpu_storage(size=1024): 15 | multi_storage = [] 16 | device_cnt = torch.cuda.device_count() 17 | for device_no in range(device_cnt): 18 | with torch.cuda.device(device_no): 19 | multi_storage.append(torch.Storage(size).cuda()) 20 | return multi_storage 21 | 22 | 23 | class _SharedAllocation(object): 24 | def __init__(self, storage): 25 | self.multi_storage = storage 26 | self.storage = self.multi_storage[0] 27 | 28 | def type(self, t): 29 | self.storage = self.storage.type(t) 30 | 31 | def type_as(self, obj): 32 | new_sto = [] 33 | if isinstance(obj, Variable): 34 | for sto in self.multi_storage: 35 | new_sto.append(sto.type(obj.data.storage().type())) 36 | elif isinstance(obj, torch._TensorBase): 37 | for sto in self.multi_storage: 38 | new_sto.append(sto.type(obj.storage().type())) 39 | else: 40 | for sto in self.multi_storage: 41 | new_sto.append(sto.type(obj.type())) 42 | self.multi_storage = new_sto 43 | 44 | def change_device(self, id): 45 | return self.multi_storage[id] 46 | 47 | def resize_(self, size): 48 | for device_no, sto in enumerate(self.multi_storage): 49 | if sto.size() < size: 50 | with torch.cuda.device(device_no): # this line is crucial!! 51 | sto.resize_(size) 52 | return self 53 | 54 | 55 | class EfficientDensenetBottleneck(Module): 56 | """ 57 | An optimized layer which encapsulates the batch normalization, ReLU, and 58 | convolution operations within the bottleneck of a DenseNet layer. 59 | 60 | This layer uses shared memory allocations to store the outputs of the 61 | concatenation and batch normalization features. Because the shared memory 62 | is not permanent, these features are recomputed during the backward pass.
63 | """ 64 | def __init__(self, shared_alloc, num_input_channels, num_output_channels): 65 | super(EfficientDensenetBottleneck, self).__init__() 66 | self.shared_alloc = shared_alloc 67 | self.num_input_channels = num_input_channels 68 | self.norm_weight = Parameter(torch.Tensor(num_input_channels)) 69 | self.norm_bias = Parameter(torch.Tensor(num_input_channels)) 70 | self.register_buffer('norm_running_mean', torch.zeros(num_input_channels)) 71 | self.register_buffer('norm_running_var', torch.ones(num_input_channels)) 72 | self.conv_weight = Parameter(torch.Tensor(num_output_channels, num_input_channels, 1, 1)) 73 | self._reset_parameters() 74 | 75 | def _reset_parameters(self): 76 | self.norm_running_mean.zero_() 77 | self.norm_running_var.fill_(1) 78 | self.norm_weight.data.uniform_() 79 | self.norm_bias.data.zero_() 80 | stdv = 1. / math.sqrt(self.num_input_channels * 1 * 1) 81 | self.conv_weight.data.uniform_(-stdv, stdv) 82 | 83 | def forward(self, inputs): 84 | if isinstance(inputs, Variable): 85 | inputs = [inputs] 86 | fn = _EfficientDensenetBottleneckFn(self.shared_alloc, 87 | self.norm_running_mean, self.norm_running_var, 88 | stride=1, padding=0, dilation=1, groups=1, 89 | training=self.training, momentum=0.1, eps=1e-5) 90 | return fn(self.norm_weight, self.norm_bias, self.conv_weight, *inputs) 91 | 92 | 93 | class _DenseLayer(Module): 94 | def __init__(self, shared_alloc, num_input_features, growth_rate, bn_size, drop_rate): 95 | super(_DenseLayer, self).__init__() 96 | self.shared_alloc = shared_alloc 97 | self.drop_rate = drop_rate 98 | self.bn_size = bn_size 99 | 100 | if bn_size > 0: 101 | self.efficient = EfficientDensenetBottleneck(shared_alloc, 102 | num_input_features, bn_size * growth_rate) 103 | self.bn = nn.BatchNorm2d(bn_size * growth_rate) 104 | self.relu = nn.ReLU(inplace=True) 105 | self.conv = nn.Conv2d(bn_size * growth_rate, growth_rate, 106 | kernel_size=3, stride=1, padding=1, bias=False) 107 | else: 108 | self.efficient = EfficientDensenetBottleneck(shared_alloc, 109 | num_input_features, growth_rate) 110 | self.conv1 = nn.Conv2d(num_input_features, growth_rate, 111 | kernel_size=3, stride=1, padding=1, bias=False) 112 | 113 | def forward(self, x): 114 | if isinstance(x, Variable): 115 | prev_features = [x] 116 | else: 117 | prev_features = x 118 | out = self.efficient(prev_features) 119 | # out = self.conv1(out) 120 | if self.bn_size > 0: 121 | out = self.bn(out) 122 | out = self.relu(out) 123 | out = self.conv(out) 124 | return out 125 | 126 | 127 | class _DenseBlock(Module): 128 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, storage): 129 | super(_DenseBlock, self).__init__() 130 | self.storage = storage 131 | self.final_num_features = num_input_features + (growth_rate * num_layers) 132 | self.shared_alloc = _SharedAllocation(storage) 133 | self.register_buffer('CatBN_output_buffer', self.storage) 134 | 135 | print('bnsize _DenseBlock', bn_size) 136 | 137 | super(_DenseBlock, self).__init__() 138 | for i in range(num_layers): 139 | layer = _DenseLayer(self.shared_alloc, 140 | num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate) 141 | self.add_module('denselayer%d' % (i + 1), layer) 142 | 143 | def forward(self, x): 144 | # Update storage type 145 | self.shared_alloc.type_as(x) 146 | 147 | # Resize storage 148 | final_size = list(x.size()) 149 | final_size[1] = self.final_num_features 150 | final_storage_size = reduce(mul, final_size, 1) 151 | self.shared_alloc.resize_(final_storage_size) 152 | 153 
| outputs = [x] 154 | for module in self.children(): 155 | outputs.append(module.forward(outputs)) 156 | return torch.cat(outputs, dim=1) 157 | 158 | 159 | class TransitionBlock(nn.Module): 160 | def __init__(self, in_planes, out_planes, dropRate=0.0): 161 | super(TransitionBlock, self).__init__() 162 | self.bn1 = nn.BatchNorm2d(in_planes) 163 | self.relu = nn.ReLU(inplace=True) 164 | self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, 165 | padding=0, bias=False) 166 | self.droprate = dropRate 167 | 168 | def forward(self, x): 169 | out = self.conv1(self.relu(self.bn1(x))) 170 | if self.droprate > 0: 171 | out = F.dropout(out, p=self.droprate, inplace=False, training=self.training) 172 | return F.avg_pool2d(out, 2) 173 | 174 | 175 | class DenseNetEfficientMulti(Module): 176 | r"""Densenet-BC model class, based on 177 | `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_ 178 | 179 | This model uses shared memory allocations for the outputs of batch norm and 180 | concat operations, as described in `"Memory-Efficient Implementation of DenseNets" <https://arxiv.org/pdf/1707.06990.pdf>`_. 181 | 182 | Args: 183 | growth_rate (int) - how many filters to add each layer (`k` in paper) 184 | block_config (list of 3 ints) - how many layers in each pooling block 185 | num_init_features (int) - the number of filters to learn in the first convolution layer 186 | bn_size (int) - multiplicative factor for number of bottleneck layers 187 | (i.e. bn_size * k features in the bottleneck layer) 188 | drop_rate (float) - dropout rate after each dense layer 189 | num_classes (int) - number of classification classes 190 | """ 191 | def __init__(self, growth_rate=12, block_config=(16, 16, 16), compression=0.5, 192 | num_init_features=24, bn_size=4, drop_rate=0., avgpool_size=8, 193 | num_classes=10): 194 | 195 | super(DenseNetEfficientMulti, self).__init__() 196 | assert 0 < compression <= 1, 'compression of densenet should be between 0 and 1' 197 | self.avgpool_size = avgpool_size 198 | 199 | # First convolution 200 | self.features = nn.Sequential(OrderedDict([ 201 | ('conv0', nn.Conv2d(3, num_init_features, kernel_size=3, stride=1, padding=1, bias=False)), 202 | ])) 203 | 204 | # Each dense block 205 | storage = create_multi_gpu_storage() 206 | num_features = num_init_features 207 | for i, num_layers in enumerate(block_config): 208 | block = _DenseBlock(num_layers=num_layers, 209 | num_input_features=num_features, 210 | bn_size=bn_size, growth_rate=growth_rate, 211 | drop_rate=drop_rate, storage=storage) 212 | self.features.add_module('denseblock_%d' % (i + 1), block) 213 | num_features = num_features + num_layers * growth_rate 214 | if i != len(block_config) - 1: 215 | trans = TransitionBlock(in_planes=num_features, 216 | out_planes=int(num_features * compression), 217 | dropRate=drop_rate) 218 | self.features.add_module('transition_%d' % (i + 1), trans) 219 | num_features = int(num_features * compression) 220 | 221 | # Final batch norm 222 | self.features.add_module('norm_final', nn.BatchNorm2d(num_features)) 223 | 224 | # Linear layer 225 | self.classifier = nn.Linear(num_features, num_classes) 226 | 227 | def forward(self, x): 228 | features = self.features(x) 229 | out = F.relu(features, inplace=True) 230 | out = F.avg_pool2d(out, kernel_size=self.avgpool_size).view( 231 | features.size(0), -1) 232 | out = self.classifier(out) 233 | return out 234 | 235 | def DenseNet190(): 236 | return DenseNetEfficientMulti(growth_rate=40, block_config=(31, 31, 31), num_classes=10) 237 | 238 | # Begin gross code :/ 239 | # Here's where we define the
internals of the efficient bottleneck layer 240 | 241 | class _EfficientCat(object): 242 | def __init__(self, storage): 243 | self.storage = storage 244 | 245 | def forward(self, *inputs): 246 | # Get size of new variable 247 | self.all_num_channels = [input.size(1) for input in inputs] 248 | size = list(inputs[0].size()) 249 | for num_channels in self.all_num_channels[1:]: 250 | size[1] += num_channels 251 | 252 | # Create variable, using existing storage 253 | cur_device_id = inputs[0].get_device() 254 | res = type(inputs[0])(self.storage.change_device(cur_device_id)).resize_(size) 255 | 256 | assert inputs[0].get_device() == res.get_device(), \ 257 | "input and output are not on the same chip!" 258 | torch.cat(inputs, dim=1, out=res) 259 | return res 260 | 261 | def backward(self, grad_output): 262 | # Return a table of tensors pointing to same storage 263 | res = [] 264 | index = 0 265 | for num_channels in self.all_num_channels: 266 | new_index = num_channels + index 267 | res.append(grad_output[:, index:new_index]) 268 | index = new_index 269 | 270 | return tuple(res) 271 | 272 | 273 | class _EfficientBatchNorm(object): 274 | def __init__(self, storage, running_mean, running_var, 275 | training=False, momentum=0.1, eps=1e-5): 276 | self.storage = storage 277 | self.running_mean = running_mean 278 | self.running_var = running_var 279 | self.training = training 280 | self.momentum = momentum 281 | self.eps = eps 282 | 283 | def forward(self, weight, bias, input): 284 | # Assert we're using cudnn 285 | for i in ([weight, bias, input]): 286 | if i is not None and not(cudnn.is_acceptable(i)): 287 | raise Exception('You must be using CUDNN to use _EfficientBatchNorm') 288 | 289 | # Create save variables 290 | self.save_mean = self.running_mean.new() 291 | self.save_mean.resize_as_(self.running_mean) 292 | self.save_var = self.running_var.new() 293 | self.save_var.resize_as_(self.running_var) 294 | 295 | # Do forward pass - store in input variable 296 | cur_device_id = weight.get_device() 297 | res = type(input)(self.storage.change_device(cur_device_id)).resize_as_(input) 298 | assert weight.get_device() == res.get_device(), \ 299 | "input and output should be on the same chip!"
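        # Note: `res` is allocated on top of the block-wide shared storage rather than
        # in its own buffer, so the normalized output that cuDNN writes below is only
        # valid until a later layer reuses the same storage; the backward pass therefore
        # recomputes it (see the EfficientDensenetBottleneck docstring above).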
300 | 301 | torch._C._cudnn_batch_norm_forward(input, res, 302 | weight, bias, 303 | self.running_mean, self.running_var, 304 | self.save_mean, self.save_var, 305 | self.training, 306 | self.momentum, 307 | self.eps) 308 | return res 309 | 310 | def recompute_forward(self, weight, bias, input): 311 | # Do forward pass - store in input variable 312 | cur_device_id = input.get_device() 313 | res = type(input)(self.storage.change_device(cur_device_id)).resize_as_(input) 314 | 315 | torch._C._cudnn_batch_norm_forward(input, res, 316 | weight, bias, 317 | self.running_mean, self.running_var, 318 | self.save_mean, self.save_var, 319 | self.training, self.momentum, self.eps) 320 | 321 | return res 322 | 323 | def backward(self, weight, bias, input, grad_output): 324 | # Create grad variables 325 | grad_weight = weight.new() 326 | grad_weight.resize_as_(weight) 327 | grad_bias = bias.new() 328 | grad_bias.resize_as_(bias) 329 | 330 | # Run backwards pass - result stored in grad_output 331 | grad_input = grad_output 332 | torch._C._cudnn_batch_norm_backward(input, grad_output, 333 | grad_input, grad_weight, grad_bias, 334 | weight, self.running_mean, self.running_var, 335 | self.save_mean, self.save_var, 336 | self.training, self.eps) 337 | 338 | # Unpack grad_output 339 | res = tuple([grad_weight, grad_bias, grad_input]) 340 | return res 341 | 342 | 343 | class _EfficientReLU(object): 344 | def __init__(self): 345 | pass 346 | 347 | def forward(self, input): 348 | backend = torch._thnn.type2backend[type(input)] 349 | output = input 350 | backend.Threshold_updateOutput( 351 | backend.library_state, 352 | input, 353 | output, 354 | 0, 355 | 0, 356 | True 357 | ) 358 | return output 359 | 360 | def backward(self, input, grad_output): 361 | grad_input = grad_output 362 | grad_input.masked_fill_(input <= 0, 0) 363 | return grad_input 364 | 365 | 366 | class _EfficientConv2d(object): 367 | def __init__(self, stride=1, padding=0, dilation=1, groups=1): 368 | self.stride = stride 369 | self.padding = padding 370 | self.dilation = dilation 371 | self.groups = groups 372 | 373 | def _output_size(self, input, weight): 374 | channels = weight.size(0) 375 | output_size = (input.size(0), channels) 376 | for d in range(input.dim() - 2): 377 | in_size = input.size(d + 2) 378 | pad = self.padding 379 | kernel = self.dilation * (weight.size(d + 2) - 1) + 1 380 | stride = self.stride 381 | output_size += ((in_size + (2 * pad) - kernel) // stride + 1,) 382 | if not all(map(lambda s: s > 0, output_size)): 383 | raise ValueError("convolution input is too small (output would be {})".format( 384 | 'x'.join(map(str, output_size)))) 385 | return output_size 386 | 387 | def forward(self, weight, bias, input): 388 | # Assert we're using cudnn 389 | for i in ([weight, bias, input]): 390 | if i is not None and not(cudnn.is_acceptable(i)): 391 | raise Exception('You must be using CUDNN to use _EfficientConv2d') 392 | 393 | res = input.new(*self._output_size(input, weight)) 394 | self._cudnn_info = torch._C._cudnn_convolution_full_forward( 395 | input, weight, bias, res, 396 | (self.padding, self.padding), 397 | (self.stride, self.stride), 398 | (self.dilation, self.dilation), 399 | self.groups, cudnn.benchmark 400 | ) 401 | 402 | return res 403 | 404 | def backward(self, weight, bias, input, grad_output): 405 | grad_input = input.new() 406 | grad_input.resize_as_(input) 407 | torch._C._cudnn_convolution_backward_data( 408 | grad_output, grad_input, weight, self._cudnn_info, 409 | cudnn.benchmark) 410 | 411 | grad_weight =
weight.new().resize_as_(weight) 412 | torch._C._cudnn_convolution_backward_filter(grad_output, input, grad_weight, self._cudnn_info, 413 | cudnn.benchmark) 414 | 415 | if bias is not None: 416 | grad_bias = bias.new().resize_as_(bias) 417 | torch._C._cudnn_convolution_backward_bias(grad_output, grad_bias, self._cudnn_info) 418 | else: 419 | grad_bias = None 420 | 421 | return grad_weight, grad_bias, grad_input 422 | 423 | 424 | 425 | class _EfficientDensenetBottleneckFn(Function): 426 | """ 427 | The autograd function which performs the efficient bottleneck operations. 428 | Each of the sub-operations -- concatenation, batch normalization, ReLU, 429 | and convolution -- is abstracted into its own class 430 | """ 431 | def __init__(self, shared_alloc, 432 | running_mean, running_var, 433 | stride=1, padding=0, dilation=1, groups=1, 434 | training=False, momentum=0.1, eps=1e-5): 435 | super(_EfficientDensenetBottleneckFn, self).__init__() 436 | 437 | self.efficient_cat = _EfficientCat(shared_alloc) 438 | self.efficient_batch_norm = _EfficientBatchNorm(shared_alloc, running_mean, 439 | running_var, training, momentum, eps) 440 | self.efficient_relu = _EfficientReLU() 441 | 442 | self.efficient_conv = _EfficientConv2d(stride, padding, dilation, groups) 443 | 444 | 445 | # Buffers to store old versions of bn statistics 446 | self.prev_running_mean = self.efficient_batch_norm.running_mean.new() 447 | self.prev_running_mean.resize_as_(self.efficient_batch_norm.running_mean) 448 | self.prev_running_var = self.efficient_batch_norm.running_var.new() 449 | self.prev_running_var.resize_as_(self.efficient_batch_norm.running_var) 450 | self.curr_running_mean = self.efficient_batch_norm.running_mean.new() 451 | self.curr_running_mean.resize_as_(self.efficient_batch_norm.running_mean) 452 | self.curr_running_var = self.efficient_batch_norm.running_var.new() 453 | self.curr_running_var.resize_as_(self.efficient_batch_norm.running_var) 454 | 455 | def forward(self, bn_weight, bn_bias, conv_weight, *inputs): 456 | self.prev_running_mean.copy_(self.efficient_batch_norm.running_mean) 457 | self.prev_running_var.copy_(self.efficient_batch_norm.running_var) 458 | 459 | bn_input = self.efficient_cat.forward(*inputs) 460 | bn_output = self.efficient_batch_norm.forward(bn_weight, bn_bias, bn_input) 461 | relu_output = self.efficient_relu.forward(bn_output) 462 | bias = None 463 | conv_output = self.efficient_conv.forward(conv_weight, None, relu_output) 464 | 465 | self.bn_weight = bn_weight 466 | self.bn_bias = bn_bias 467 | self.conv_weight = conv_weight 468 | self.inputs = inputs 469 | return conv_output 470 | 471 | def backward(self, grad_output): 472 | # Turn off bn training status, and temporarily reset statistics 473 | 474 | training = self.efficient_batch_norm.training 475 | self.curr_running_mean.copy_(self.efficient_batch_norm.running_mean) 476 | self.curr_running_var.copy_(self.efficient_batch_norm.running_var) 477 | # self.efficient_batch_norm.training = False 478 | self.efficient_batch_norm.running_mean.copy_(self.prev_running_mean) 479 | self.efficient_batch_norm.running_var.copy_(self.prev_running_var) 480 | 481 | # Recompute concat and BN 482 | cat_output = self.efficient_cat.forward(*self.inputs) 483 | bn_output = self.efficient_batch_norm.forward(self.bn_weight, self.bn_bias, cat_output) 484 | relu_output = self.efficient_relu.forward(bn_output) 485 | 486 | # Conv backward 487 | conv_weight_grad, _, conv_grad_output = self.efficient_conv.backward( 488 | self.conv_weight, None, relu_output,
grad_output) 489 | 490 | # ReLU backward 491 | relu_grad_output = self.efficient_relu.backward(bn_output, conv_grad_output) 492 | 493 | # BN backward 494 | cat_output = self.efficient_cat.forward(*self.inputs) # recompute cat_output because computing bn_output overwrote the storage (L481) 495 | # the multi-GPU version differs slightly from the single-GPU one in that 496 | # we use only one shared_allocation for both BN and Cat 497 | self.efficient_batch_norm.running_mean.copy_(self.curr_running_mean) 498 | self.efficient_batch_norm.running_var.copy_(self.curr_running_var) 499 | bn_weight_grad, bn_bias_grad, bn_grad_output = self.efficient_batch_norm.backward( 500 | self.bn_weight, self.bn_bias, cat_output, relu_grad_output) 501 | 502 | # Input backward 503 | grad_inputs = self.efficient_cat.backward(bn_grad_output) 504 | # Reset bn training status and statistics 505 | self.efficient_batch_norm.training = training 506 | self.efficient_batch_norm.running_mean.copy_(self.curr_running_mean) 507 | self.efficient_batch_norm.running_var.copy_(self.curr_running_var) 508 | 509 | return tuple([bn_weight_grad, bn_bias_grad, conv_weight_grad] + list(grad_inputs)) 510 | -------------------------------------------------------------------------------- /models/googlenet.py: -------------------------------------------------------------------------------- 1 | '''GoogLeNet with PyTorch.''' 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from torch.autograd import Variable 7 | 8 | 9 | class Inception(nn.Module): 10 | def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes): 11 | super(Inception, self).__init__() 12 | # 1x1 conv branch 13 | self.b1 = nn.Sequential( 14 | nn.Conv2d(in_planes, n1x1, kernel_size=1), 15 | nn.BatchNorm2d(n1x1), 16 | nn.ReLU(True), 17 | ) 18 | 19 | # 1x1 conv -> 3x3 conv branch 20 | self.b2 = nn.Sequential( 21 | nn.Conv2d(in_planes, n3x3red, kernel_size=1), 22 | nn.BatchNorm2d(n3x3red), 23 | nn.ReLU(True), 24 | nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1), 25 | nn.BatchNorm2d(n3x3), 26 | nn.ReLU(True), 27 | ) 28 | 29 | # 1x1 conv -> 5x5 conv branch 30 | self.b3 = nn.Sequential( 31 | nn.Conv2d(in_planes, n5x5red, kernel_size=1), 32 | nn.BatchNorm2d(n5x5red), 33 | nn.ReLU(True), 34 | nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1), 35 | nn.BatchNorm2d(n5x5), 36 | nn.ReLU(True), 37 | nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1), 38 | nn.BatchNorm2d(n5x5), 39 | nn.ReLU(True), 40 | ) 41 | 42 | # 3x3 pool -> 1x1 conv branch 43 | self.b4 = nn.Sequential( 44 | nn.MaxPool2d(3, stride=1, padding=1), 45 | nn.Conv2d(in_planes, pool_planes, kernel_size=1), 46 | nn.BatchNorm2d(pool_planes), 47 | nn.ReLU(True), 48 | ) 49 | 50 | def forward(self, x): 51 | y1 = self.b1(x) 52 | y2 = self.b2(x) 53 | y3 = self.b3(x) 54 | y4 = self.b4(x) 55 | return torch.cat([y1,y2,y3,y4], 1) 56 | 57 | 58 | class GoogLeNet(nn.Module): 59 | def __init__(self): 60 | super(GoogLeNet, self).__init__() 61 | self.pre_layers = nn.Sequential( 62 | nn.Conv2d(3, 192, kernel_size=3, padding=1), 63 | nn.BatchNorm2d(192), 64 | nn.ReLU(True), 65 | ) 66 | 67 | self.a3 = Inception(192, 64, 96, 128, 16, 32, 32) 68 | self.b3 = Inception(256, 128, 128, 192, 32, 96, 64) 69 | 70 | self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) 71 | 72 | self.a4 = Inception(480, 192, 96, 208, 16, 48, 64) 73 | self.b4 = Inception(512, 160, 112, 224, 24, 64, 64) 74 | self.c4 = Inception(512, 128, 128, 256, 24, 64, 64) 75 | self.d4 = Inception(512, 112, 144, 288, 32, 64, 64) 76 | self.e4 =
Inception(528, 256, 160, 320, 32, 128, 128) 77 | 78 | self.a5 = Inception(832, 256, 160, 320, 32, 128, 128) 79 | self.b5 = Inception(832, 384, 192, 384, 48, 128, 128) 80 | 81 | self.avgpool = nn.AvgPool2d(8, stride=1) 82 | self.linear = nn.Linear(1024, 10) 83 | 84 | def forward(self, x): 85 | out = self.pre_layers(x) 86 | out = self.a3(out) 87 | out = self.b3(out) 88 | out = self.maxpool(out) 89 | out = self.a4(out) 90 | out = self.b4(out) 91 | out = self.c4(out) 92 | out = self.d4(out) 93 | out = self.e4(out) 94 | out = self.maxpool(out) 95 | out = self.a5(out) 96 | out = self.b5(out) 97 | out = self.avgpool(out) 98 | out = out.view(out.size(0), -1) 99 | out = self.linear(out) 100 | return out 101 | 102 | # net = GoogLeNet() 103 | # x = torch.randn(1,3,32,32) 104 | # y = net(Variable(x)) 105 | # print(y.size()) 106 | -------------------------------------------------------------------------------- /models/lenet.py: -------------------------------------------------------------------------------- 1 | '''LeNet in PyTorch.''' 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class LeNet(nn.Module): 6 | def __init__(self): 7 | super(LeNet, self).__init__() 8 | self.conv1 = nn.Conv2d(3, 6, 5) 9 | self.conv2 = nn.Conv2d(6, 16, 5) 10 | self.fc1 = nn.Linear(16*5*5, 120) 11 | self.fc2 = nn.Linear(120, 84) 12 | self.fc3 = nn.Linear(84, 10) 13 | 14 | def forward(self, x): 15 | out = F.relu(self.conv1(x)) 16 | out = F.max_pool2d(out, 2) 17 | out = F.relu(self.conv2(out)) 18 | out = F.max_pool2d(out, 2) 19 | out = out.view(out.size(0), -1) 20 | out = F.relu(self.fc1(out)) 21 | out = F.relu(self.fc2(out)) 22 | out = self.fc3(out) 23 | return out 24 | -------------------------------------------------------------------------------- /models/mobilenet.py: -------------------------------------------------------------------------------- 1 | '''MobileNet in PyTorch. 2 | 3 | See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" 4 | for more details. 
/models/lenet.py:
--------------------------------------------------------------------------------
1 | '''LeNet in PyTorch.'''
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class LeNet(nn.Module):
6 |     def __init__(self):
7 |         super(LeNet, self).__init__()
8 |         self.conv1 = nn.Conv2d(3, 6, 5)
9 |         self.conv2 = nn.Conv2d(6, 16, 5)
10 |         self.fc1 = nn.Linear(16*5*5, 120)
11 |         self.fc2 = nn.Linear(120, 84)
12 |         self.fc3 = nn.Linear(84, 10)
13 |
14 |     def forward(self, x):
15 |         out = F.relu(self.conv1(x))
16 |         out = F.max_pool2d(out, 2)
17 |         out = F.relu(self.conv2(out))
18 |         out = F.max_pool2d(out, 2)
19 |         out = out.view(out.size(0), -1)
20 |         out = F.relu(self.fc1(out))
21 |         out = F.relu(self.fc2(out))
22 |         out = self.fc3(out)
23 |         return out
24 |
--------------------------------------------------------------------------------
/models/mobilenet.py:
--------------------------------------------------------------------------------
1 | '''MobileNet in PyTorch.
2 |
3 | See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications"
4 | for more details.
5 | '''
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 | from torch.autograd import Variable
11 |
12 |
13 | class Block(nn.Module):
14 |     '''Depthwise conv + Pointwise conv'''
15 |     def __init__(self, in_planes, out_planes, stride=1):
16 |         super(Block, self).__init__()
17 |         self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
18 |         self.bn1 = nn.BatchNorm2d(in_planes)
19 |         self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
20 |         self.bn2 = nn.BatchNorm2d(out_planes)
21 |
22 |     def forward(self, x):
23 |         out = F.relu(self.bn1(self.conv1(x)))
24 |         out = F.relu(self.bn2(self.conv2(out)))
25 |         return out
26 |
27 |
28 | class MobileNet(nn.Module):
29 |     # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
30 |     cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]
31 |
32 |     def __init__(self, num_classes=10):
33 |         super(MobileNet, self).__init__()
34 |         self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
35 |         self.bn1 = nn.BatchNorm2d(32)
36 |         self.layers = self._make_layers(in_planes=32)
37 |         self.linear = nn.Linear(1024, num_classes)
38 |
39 |     def _make_layers(self, in_planes):
40 |         layers = []
41 |         for x in self.cfg:
42 |             out_planes = x if isinstance(x, int) else x[0]
43 |             stride = 1 if isinstance(x, int) else x[1]
44 |             layers.append(Block(in_planes, out_planes, stride))
45 |             in_planes = out_planes
46 |         return nn.Sequential(*layers)
47 |
48 |     def forward(self, x):
49 |         out = F.relu(self.bn1(self.conv1(x)))
50 |         out = self.layers(out)
51 |         out = F.avg_pool2d(out, 2)
52 |         out = out.view(out.size(0), -1)
53 |         out = self.linear(out)
54 |         return out
55 |
56 |
57 | def test():
58 |     net = MobileNet()
59 |     x = torch.randn(1,3,32,32)
60 |     y = net(Variable(x))
61 |     print(y.size())
62 |
63 | # test()
64 |
--------------------------------------------------------------------------------
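Why the depthwise/pointwise split is cheap (illustrative arithmetic, ignoring BN and bias): a Block factors one k x k convolution into a per-channel k x k conv plus a 1x1 channel-mixing conv.

cin, cout, k = 128, 256, 3
depthwise = k * k * cin        # one k x k filter per input channel: 1152
pointwise = cin * cout         # 1x1 channel mixing: 32768
standard = k * k * cin * cout  # a plain 3x3 conv: 294912
print(standard / (depthwise + pointwise))  # ~8.7x fewer weights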
/models/resnet.py:
--------------------------------------------------------------------------------
1 | '''ResNet in PyTorch.
2 |
3 | The BasicBlock and Bottleneck modules are from the original ResNet paper:
4 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
5 |     Deep Residual Learning for Image Recognition. arXiv:1512.03385
6 |
7 | The PreActBlock and PreActBottleneck modules are from the later paper:
8 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
9 |     Identity Mappings in Deep Residual Networks. arXiv:1603.05027
10 | '''
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 |
15 | from torch.autograd import Variable
16 |
17 |
18 | def conv3x3(in_planes, out_planes, stride=1):
19 |     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
20 |
21 |
22 | class BasicBlock(nn.Module):
23 |     expansion = 1
24 |
25 |     def __init__(self, in_planes, planes, stride=1):
26 |         super(BasicBlock, self).__init__()
27 |         self.conv1 = conv3x3(in_planes, planes, stride)
28 |         self.bn1 = nn.BatchNorm2d(planes)
29 |         self.conv2 = conv3x3(planes, planes)
30 |         self.bn2 = nn.BatchNorm2d(planes)
31 |
32 |         self.shortcut = nn.Sequential()
33 |         if stride != 1 or in_planes != self.expansion*planes:
34 |             self.shortcut = nn.Sequential(
35 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
36 |                 nn.BatchNorm2d(self.expansion*planes)
37 |             )
38 |
39 |     def forward(self, x):
40 |         out = F.relu(self.bn1(self.conv1(x)))
41 |         out = self.bn2(self.conv2(out))
42 |         out += self.shortcut(x)
43 |         out = F.relu(out)
44 |         return out
45 |
46 |
47 | class PreActBlock(nn.Module):
48 |     '''Pre-activation version of the BasicBlock.'''
49 |     expansion = 1
50 |
51 |     def __init__(self, in_planes, planes, stride=1):
52 |         super(PreActBlock, self).__init__()
53 |         self.bn1 = nn.BatchNorm2d(in_planes)
54 |         self.conv1 = conv3x3(in_planes, planes, stride)
55 |         self.bn2 = nn.BatchNorm2d(planes)
56 |         self.conv2 = conv3x3(planes, planes)
57 |
58 |         self.shortcut = nn.Sequential()
59 |         if stride != 1 or in_planes != self.expansion*planes:
60 |             self.shortcut = nn.Sequential(
61 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
62 |             )
63 |
64 |     def forward(self, x):
65 |         out = F.relu(self.bn1(x))
66 |         shortcut = self.shortcut(out)
67 |         out = self.conv1(out)
68 |         out = self.conv2(F.relu(self.bn2(out)))
69 |         out += shortcut
70 |         return out
71 |
72 |
73 | class Bottleneck(nn.Module):
74 |     expansion = 4
75 |
76 |     def __init__(self, in_planes, planes, stride=1):
77 |         super(Bottleneck, self).__init__()
78 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
79 |         self.bn1 = nn.BatchNorm2d(planes)
80 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
81 |         self.bn2 = nn.BatchNorm2d(planes)
82 |         self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
83 |         self.bn3 = nn.BatchNorm2d(self.expansion*planes)
84 |
85 |         self.shortcut = nn.Sequential()
86 |         if stride != 1 or in_planes != self.expansion*planes:
87 |             self.shortcut = nn.Sequential(
88 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
89 |                 nn.BatchNorm2d(self.expansion*planes)
90 |             )
91 |
92 |     def forward(self, x):
93 |         out = F.relu(self.bn1(self.conv1(x)))
94 |         out = F.relu(self.bn2(self.conv2(out)))
95 |         out = self.bn3(self.conv3(out))
96 |         out += self.shortcut(x)
97 |         out = F.relu(out)
98 |         return out
99 |
100 |
101 | class PreActBottleneck(nn.Module):
102 |     '''Pre-activation version of the original Bottleneck module.'''
103 |     expansion = 4
104 |
105 |     def __init__(self, in_planes, planes, stride=1):
106 |         super(PreActBottleneck, self).__init__()
107 |         self.bn1 = nn.BatchNorm2d(in_planes)
108 |         self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
109 |         self.bn2 = nn.BatchNorm2d(planes)
110 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
111 |         self.bn3 = nn.BatchNorm2d(planes)
112 |         self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
113 |
114 |         self.shortcut = nn.Sequential()
115 |         if stride != 1 or in_planes != self.expansion*planes:
116 |             self.shortcut = nn.Sequential(
117 |                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
118 |             )
119 |
120 |     def forward(self, x):
121 |         out = F.relu(self.bn1(x))
122 |         shortcut = self.shortcut(out)
123 |         out = self.conv1(out)
124 |         out = self.conv2(F.relu(self.bn2(out)))
125 |         out = self.conv3(F.relu(self.bn3(out)))
126 |         out += shortcut
127 |         return out
128 |
129 |
130 | class ResNet(nn.Module):
131 |     def __init__(self, block, num_blocks, num_classes=10):
132 |         super(ResNet, self).__init__()
133 |         self.in_planes = 64
134 |
135 |         self.conv1 = conv3x3(3,64)
136 |         self.bn1 = nn.BatchNorm2d(64)
137 |         self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
138 |         self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
139 |         self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
140 |         self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
141 |         self.linear = nn.Linear(512*block.expansion, num_classes)
142 |
143 |     def _make_layer(self, block, planes, num_blocks, stride):
144 |         strides = [stride] + [1]*(num_blocks-1)
145 |         layers = []
146 |         for stride in strides:
147 |             layers.append(block(self.in_planes, planes, stride))
148 |             self.in_planes = planes * block.expansion
149 |         return nn.Sequential(*layers)
150 |
151 |     def forward(self, x, lin=0, lout=5):
152 |         out = x
153 |         if lin < 1 and lout > -1:
154 |             out = self.conv1(out)
155 |             out = self.bn1(out)
156 |             out = F.relu(out)
157 |         if lin < 2 and lout > 0:
158 |             out = self.layer1(out)
159 |         if lin < 3 and lout > 1:
160 |             out = self.layer2(out)
161 |         if lin < 4 and lout > 2:
162 |             out = self.layer3(out)
163 |         if lin < 5 and lout > 3:
164 |             out = self.layer4(out)
165 |         if lout > 4:
166 |             out = F.avg_pool2d(out, 4)
167 |             out = out.view(out.size(0), -1)
168 |             out = self.linear(out)
169 |         return out
170 |
171 |
172 | def ResNet18():
173 |     return ResNet(PreActBlock, [2,2,2,2])  # note: uses the pre-activation block, unlike the deeper variants below
174 |
175 | def ResNet34():
176 |     return ResNet(BasicBlock, [3,4,6,3])
177 |
178 | def ResNet50():
179 |     return ResNet(Bottleneck, [3,4,6,3])
180 |
181 | def ResNet101():
182 |     return ResNet(Bottleneck, [3,4,23,3])
183 |
184 | def ResNet152():
185 |     return ResNet(Bottleneck, [3,8,36,3])
186 |
187 |
188 | def test():
189 |     net = ResNet18()
190 |     y = net(Variable(torch.randn(1,3,32,32)))
191 |     print(y.size())
192 |
193 | # test()
194 |
--------------------------------------------------------------------------------
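Spatial bookkeeping for the 32x32 CIFAR input (an illustrative check, not part of the file): only layer2-layer4 downsample, so 4x4 feature maps remain before F.avg_pool2d(out, 4) collapses them to 1x1 and the flattened vector has 512*block.expansion features.

size = 32
for stride in (1, 2, 2, 2):  # layer1 .. layer4
    size //= stride
assert size == 4  # hence avg_pool2d(out, 4) -> 1x1 per channel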
/models/resnext.py:
--------------------------------------------------------------------------------
1 | '''ResNeXt in PyTorch.
2 |
3 | See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.
4 | '''
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 |
9 | from torch.autograd import Variable
10 |
11 |
12 | class Block(nn.Module):
13 |     '''Grouped convolution block.'''
14 |     expansion = 2
15 |
16 |     def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
17 |         super(Block, self).__init__()
18 |         group_width = cardinality * bottleneck_width
19 |         self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False)
20 |         self.bn1 = nn.BatchNorm2d(group_width)
21 |         self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)
22 |         self.bn2 = nn.BatchNorm2d(group_width)
23 |         self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=False)
24 |         self.bn3 = nn.BatchNorm2d(self.expansion*group_width)
25 |
26 |         self.shortcut = nn.Sequential()
27 |         if stride != 1 or in_planes != self.expansion*group_width:
28 |             self.shortcut = nn.Sequential(
29 |                 nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=False),
30 |                 nn.BatchNorm2d(self.expansion*group_width)
31 |             )
32 |
33 |     def forward(self, x):
34 |         out = F.relu(self.bn1(self.conv1(x)))
35 |         out = F.relu(self.bn2(self.conv2(out)))
36 |         out = self.bn3(self.conv3(out))
37 |         out += self.shortcut(x)
38 |         out = F.relu(out)
39 |         return out
40 |
41 |
42 | class ResNeXt(nn.Module):
43 |     def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
44 |         super(ResNeXt, self).__init__()
45 |         self.cardinality = cardinality
46 |         self.bottleneck_width = bottleneck_width
47 |         self.in_planes = 64
48 |
49 |         self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
50 |         self.bn1 = nn.BatchNorm2d(64)
51 |         self.layer1 = self._make_layer(num_blocks[0], 1)
52 |         self.layer2 = self._make_layer(num_blocks[1], 2)
53 |         self.layer3 = self._make_layer(num_blocks[2], 2)
54 |         # self.layer4 = self._make_layer(num_blocks[3], 2)
55 |         self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes)
56 |
57 |     def _make_layer(self, num_blocks, stride):
58 |         strides = [stride] + [1]*(num_blocks-1)
59 |         layers = []
60 |         for stride in strides:
61 |             layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride))
62 |             self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width
63 |         # Double bottleneck_width after each stage.
64 |         self.bottleneck_width *= 2
65 |         return nn.Sequential(*layers)
66 |
67 |     def forward(self, x):
68 |         out = F.relu(self.bn1(self.conv1(x)))
69 |         out = self.layer1(out)
70 |         out = self.layer2(out)
71 |         out = self.layer3(out)
72 |         # out = self.layer4(out)
73 |         out = F.avg_pool2d(out, 8)
74 |         out = out.view(out.size(0), -1)
75 |         out = self.linear(out)
76 |         return out
77 |
78 |
79 | def ResNeXt29_2x64d():
80 |     return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64)
81 |
82 | def ResNeXt29_4x64d():
83 |     return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64)
84 |
85 | def ResNeXt29_8x64d():
86 |     return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64)
87 |
88 | def ResNeXt29_32x4d():
89 |     return ResNeXt(num_blocks=[3,3,3], cardinality=32, bottleneck_width=4)
90 |
91 | def test_resnext():
92 |     net = ResNeXt29_2x64d()
93 |     x = torch.randn(1,3,32,32)
94 |     y = net(Variable(x))
95 |     print(y.size())
96 |
97 | # test_resnext()
98 |
--------------------------------------------------------------------------------
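How the classifier input comes out to cardinality*bottleneck_width*8 for, say, ResNeXt29_32x4d (illustrative arithmetic): bottleneck_width doubles per stage, and each Block emits expansion * cardinality * bottleneck_width channels.

cardinality, bottleneck_width, expansion = 32, 4, 2
widths = []
for _ in range(3):  # the three stages built above
    widths.append(expansion * cardinality * bottleneck_width)
    bottleneck_width *= 2
assert widths == [256, 512, 1024]  # 1024 == 32 * 4 * 8, the nn.Linear input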
/models/vgg.py:
--------------------------------------------------------------------------------
1 | '''VGG11/13/16/19 in PyTorch.'''
2 | import torch
3 | import torch.nn as nn
4 | from torch.autograd import Variable
5 |
6 |
7 | cfg = {
8 |     'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
9 |     'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
10 |     'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
11 |     'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
12 | }
13 |
14 |
15 | class VGG(nn.Module):
16 |     def __init__(self, vgg_name):
17 |         super(VGG, self).__init__()
18 |         self.features = self._make_layers(cfg[vgg_name])
19 |         self.classifier = nn.Linear(512, 10)
20 |
21 |     def forward(self, x):
22 |         out = self.features(x)
23 |         out = out.view(out.size(0), -1)
24 |         out = self.classifier(out)
25 |         return out
26 |
27 |     def _make_layers(self, cfg):
28 |         layers = []
29 |         in_channels = 3
30 |         for x in cfg:
31 |             if x == 'M':
32 |                 layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
33 |             else:
34 |                 layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
35 |                            nn.BatchNorm2d(x),
36 |                            nn.ReLU(inplace=True)]
37 |                 in_channels = x
38 |         layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
39 |         return nn.Sequential(*layers)
40 |
41 | # net = VGG('VGG11')
42 | # x = torch.randn(2,3,32,32)
43 | # print(net(Variable(x)).size())
44 |
--------------------------------------------------------------------------------
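A quick walk over a config (illustrative, with cfg['VGG11'] inlined): every variant ends with 512 channels and five 'M' pools, which is why the classifier is nn.Linear(512, 10) for 32x32 inputs.

size, channels = 32, 3
for x in [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M']:
    if x == 'M':
        size //= 2   # each maxpool halves the spatial extent
    else:
        channels = x
assert (channels, size) == (512, 1)  # flattened feature vector: 512 * 1 * 1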
/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3 -u
2 | # Copyright (c) 2017-present, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the license found in the LICENSE file in
6 | # the root directory of this source tree.
7 | from __future__ import print_function
8 |
9 | import argparse
10 | import csv
11 | import os
12 |
13 | import numpy as np
14 | import torch
15 | from torch.autograd import Variable
16 | import torch.backends.cudnn as cudnn
17 | import torch.nn as nn
18 | import torch.optim as optim
19 | import torchvision.transforms as transforms
20 | import torchvision.datasets as datasets
21 |
22 | import models
23 | from utils import progress_bar
24 |
25 | parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
26 | parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
27 | parser.add_argument('--resume', '-r', action='store_true',
28 |                     help='resume from checkpoint')
29 | parser.add_argument('--model', default="ResNet18", type=str,
30 |                     help='model type (default: ResNet18)')
31 | parser.add_argument('--name', default='0', type=str, help='name of run')
32 | parser.add_argument('--seed', default=0, type=int, help='random seed')
33 | parser.add_argument('--batch-size', default=128, type=int, help='batch size')
34 | parser.add_argument('--epoch', default=200, type=int,
35 |                     help='total epochs to run')
36 | parser.add_argument('--no-augment', dest='augment', action='store_false',
37 |                     help='disable standard augmentation (enabled by default)')
38 | parser.add_argument('--decay', default=1e-4, type=float, help='weight decay')
39 | parser.add_argument('--alpha', default=1., type=float,
40 |                     help='mixup interpolation coefficient (default: 1)')
41 | args = parser.parse_args()
42 |
43 | use_cuda = torch.cuda.is_available()
44 |
45 | best_acc = 0  # best test accuracy
46 | start_epoch = 0  # start from epoch 0 or last checkpoint epoch
47 |
48 | if args.seed != 0:
49 |     torch.manual_seed(args.seed)
50 |
51 | # Data
52 | print('==> Preparing data..')
53 | if args.augment:
54 |     transform_train = transforms.Compose([
55 |         transforms.RandomCrop(32, padding=4),
56 |         transforms.RandomHorizontalFlip(),
57 |         transforms.ToTensor(),
58 |         transforms.Normalize((0.4914, 0.4822, 0.4465),
59 |                              (0.2023, 0.1994, 0.2010)),
60 |     ])
61 | else:
62 |     transform_train = transforms.Compose([
63 |         transforms.ToTensor(),
64 |         transforms.Normalize((0.4914, 0.4822, 0.4465),
65 |                              (0.2023, 0.1994, 0.2010)),
66 |     ])
67 |
68 |
69 | transform_test = transforms.Compose([
70 |     transforms.ToTensor(),
71 |     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
72 | ])
73 |
74 | trainset = datasets.CIFAR10(root='~/data', train=True, download=False,
75 |                             transform=transform_train)
76 | trainloader = torch.utils.data.DataLoader(trainset,
77 |                                           batch_size=args.batch_size,
78 |                                           shuffle=True, num_workers=8)
79 |
80 | testset = datasets.CIFAR10(root='~/data', train=False, download=False,
81 |                            transform=transform_test)
82 | testloader = torch.utils.data.DataLoader(testset, batch_size=100,
83 |                                          shuffle=False, num_workers=8)
84 |
85 |
86 | # Model
87 | if args.resume:
88 |     # Load checkpoint.
89 |     print('==> Resuming from checkpoint..')
90 |     assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
91 |     checkpoint = torch.load('./checkpoint/ckpt.t7' + args.name + '_'
92 |                             + str(args.seed))
93 |     net = checkpoint['net']
94 |     best_acc = checkpoint['acc']
95 |     start_epoch = checkpoint['epoch'] + 1
96 |     rng_state = checkpoint['rng_state']
97 |     torch.set_rng_state(rng_state)
98 | else:
99 |     print('==> Building model..')
100 |     net = models.__dict__[args.model]()
101 |
102 | if not os.path.isdir('results'):
103 |     os.mkdir('results')
104 | logname = ('results/log_' + net.__class__.__name__ + '_' + args.name + '_'
105 |            + str(args.seed) + '.csv')
106 |
107 | if use_cuda:
108 |     net.cuda()
109 |     net = torch.nn.DataParallel(net)
110 |     print(torch.cuda.device_count())
111 |     cudnn.benchmark = True
112 |     print('Using CUDA..')
113 |
114 | criterion = nn.CrossEntropyLoss()
115 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
116 |                       weight_decay=args.decay)
117 |
118 |
119 | def mixup_data(x, y, alpha=1.0, use_cuda=True):
120 |     '''Returns mixed inputs, pairs of targets, and lambda'''
121 |     if alpha > 0:
122 |         lam = np.random.beta(alpha, alpha)
123 |     else:
124 |         lam = 1
125 |
126 |     batch_size = x.size()[0]
127 |     if use_cuda:
128 |         index = torch.randperm(batch_size).cuda()
129 |     else:
130 |         index = torch.randperm(batch_size)
131 |
132 |     mixed_x = lam * x + (1 - lam) * x[index, :]
133 |     y_a, y_b = y, y[index]
134 |     return mixed_x, y_a, y_b, lam
135 |
136 |
137 | def mixup_criterion(criterion, pred, y_a, y_b, lam):
138 |     return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)
139 |
140 |
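# Worked example of the two helpers above (illustrative numbers only):
# with lam = 0.7, mixed_x[i] = 0.7 * x[i] + 0.3 * x[index[i]], and if
# criterion(pred, y_a) = 0.5 and criterion(pred, y_b) = 2.0, the blended
# loss is 0.7 * 0.5 + 0.3 * 2.0 = 0.95.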
141 | def train(epoch):
142 |     print('\nEpoch: %d' % epoch)
143 |     net.train()
144 |     train_loss = 0
145 |     reg_loss = 0
146 |     correct = 0
147 |     total = 0
148 |     for batch_idx, (inputs, targets) in enumerate(trainloader):
149 |         if use_cuda:
150 |             inputs, targets = inputs.cuda(), targets.cuda()
151 |
152 |         inputs, targets_a, targets_b, lam = mixup_data(inputs, targets,
153 |                                                        args.alpha, use_cuda)
154 |         inputs, targets_a, targets_b = map(Variable, (inputs,
155 |                                                       targets_a, targets_b))
156 |         outputs = net(inputs)
157 |         loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
158 |         train_loss += loss.data[0]
159 |         _, predicted = torch.max(outputs.data, 1)
160 |         total += targets.size(0)
161 |         correct += (lam * predicted.eq(targets_a.data).cpu().sum().float()
162 |                     + (1 - lam) * predicted.eq(targets_b.data).cpu().sum().float())
163 |
164 |         optimizer.zero_grad()
165 |         loss.backward()
166 |         optimizer.step()
167 |
168 |         progress_bar(batch_idx, len(trainloader),
169 |                      'Loss: %.3f | Reg: %.5f | Acc: %.3f%% (%d/%d)'
170 |                      % (train_loss/(batch_idx+1), reg_loss/(batch_idx+1),
171 |                         100.*correct/total, correct, total))
172 |     return (train_loss/(batch_idx+1), reg_loss/(batch_idx+1), 100.*correct/total)  # batch_idx+1 == number of batches
173 |
174 |
175 | def test(epoch):
176 |     global best_acc
177 |     net.eval()
178 |     test_loss = 0
179 |     correct = 0
180 |     total = 0
181 |     for batch_idx, (inputs, targets) in enumerate(testloader):
182 |         if use_cuda:
183 |             inputs, targets = inputs.cuda(), targets.cuda()
184 |         inputs, targets = Variable(inputs, volatile=True), Variable(targets)
185 |         outputs = net(inputs)
186 |         loss = criterion(outputs, targets)
187 |
188 |         test_loss += loss.data[0]
189 |         _, predicted = torch.max(outputs.data, 1)
190 |         total += targets.size(0)
191 |         correct += predicted.eq(targets.data).cpu().sum()
192 |
193 |         progress_bar(batch_idx, len(testloader),
194 |                      'Loss: %.3f | Acc: %.3f%% (%d/%d)'
195 |                      % (test_loss/(batch_idx+1), 100.*correct/total,
196 |                         correct, total))
197 |     acc = 100.*correct/total
198 |     if epoch == start_epoch + args.epoch - 1 or acc > best_acc:
199 |         checkpoint(acc, epoch)
200 |     if acc > best_acc:
201 |         best_acc = acc
202 |     return (test_loss/(batch_idx+1), 100.*correct/total)
203 |
204 |
205 | def checkpoint(acc, epoch):
206 |     # Save checkpoint.
207 |     print('Saving..')
208 |     state = {
209 |         'net': net,
210 |         'acc': acc,
211 |         'epoch': epoch,
212 |         'rng_state': torch.get_rng_state()
213 |     }
214 |     if not os.path.isdir('checkpoint'):
215 |         os.mkdir('checkpoint')
216 |     torch.save(state, './checkpoint/ckpt.t7' + args.name + '_'
217 |                + str(args.seed))
218 |
219 |
220 | def adjust_learning_rate(optimizer, epoch):
221 |     """decrease the learning rate at epochs 100 and 150"""
222 |     lr = args.lr
223 |     if epoch >= 100:
224 |         lr /= 10
225 |     if epoch >= 150:
226 |         lr /= 10
227 |     for param_group in optimizer.param_groups:
228 |         param_group['lr'] = lr
229 |
230 |
231 | if not os.path.exists(logname):
232 |     with open(logname, 'w') as logfile:
233 |         logwriter = csv.writer(logfile, delimiter=',')
234 |         logwriter.writerow(['epoch', 'train loss', 'reg loss', 'train acc',
235 |                             'test loss', 'test acc'])
236 |
237 | for epoch in range(start_epoch, args.epoch):
238 |     train_loss, reg_loss, train_acc = train(epoch)
239 |     test_loss, test_acc = test(epoch)
240 |     adjust_learning_rate(optimizer, epoch)
241 |     with open(logname, 'a') as logfile:
242 |         logwriter = csv.writer(logfile, delimiter=',')
243 |         logwriter.writerow([epoch, train_loss, reg_loss, train_acc, test_loss,
244 |                             test_acc])
245 |
--------------------------------------------------------------------------------
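The hand-rolled schedule in adjust_learning_rate (0.1 until epoch 100, then 0.01, then 0.001 from epoch 150) matches torch's built-in MultiStepLR; a sketch with a stand-in model, not the script's own objects:

from torch import nn, optim

optimizer = optim.SGD(nn.Linear(2, 2).parameters(), lr=0.1, momentum=0.9)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1)
# calling scheduler.step() once per epoch plays the role of adjust_learning_rate()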
/utils.py:
--------------------------------------------------------------------------------
1 | '''Some helper functions for PyTorch, including:
2 |     - get_mean_and_std: calculate the mean and std value of dataset.
3 |     - init_params: net parameter initialization.
4 |     - progress_bar: progress bar mimicking xlua.progress.
5 | '''
6 | import os
7 | import sys
8 | import time
9 | import math
10 |
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.init as init
14 |
15 |
16 | def get_mean_and_std(dataset):
17 |     '''Compute the mean and std value of dataset.'''
18 |     dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2)
19 |     mean = torch.zeros(3)
20 |     std = torch.zeros(3)
21 |     print('==> Computing mean and std..')
22 |     for inputs, targets in dataloader:
23 |         for i in range(3):
24 |             mean[i] += inputs[:,i,:,:].mean()
25 |             std[i] += inputs[:,i,:,:].std()
26 |     mean.div_(len(dataset))
27 |     std.div_(len(dataset))
28 |     return mean, std
29 |
30 | def init_params(net):
31 |     '''Init layer parameters.'''
32 |     for m in net.modules():
33 |         if isinstance(m, nn.Conv2d):
34 |             init.kaiming_normal(m.weight, mode='fan_out')
35 |             if m.bias is not None:
36 |                 init.constant(m.bias, 0)
37 |         elif isinstance(m, nn.BatchNorm2d):
38 |             init.constant(m.weight, 1)
39 |             init.constant(m.bias, 0)
40 |         elif isinstance(m, nn.Linear):
41 |             init.normal(m.weight, std=1e-3)
42 |             if m.bias is not None:
43 |                 init.constant(m.bias, 0)
44 |
45 |
46 | _, term_width = os.popen('stty size', 'r').read().split()
47 | term_width = int(term_width)
48 |
49 | TOTAL_BAR_LENGTH = 86.
50 | last_time = time.time()
51 | begin_time = last_time
52 | def progress_bar(current, total, msg=None):
53 |     global last_time, begin_time
54 |     if current == 0:
55 |         begin_time = time.time()  # Reset for new bar.
56 |
57 |     cur_len = int(TOTAL_BAR_LENGTH*current/total)
58 |     rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1
59 |
60 |     sys.stdout.write(' [')
61 |     for i in range(cur_len):
62 |         sys.stdout.write('=')
63 |     sys.stdout.write('>')
64 |     for i in range(rest_len):
65 |         sys.stdout.write('.')
66 |     sys.stdout.write(']')
67 |
68 |     cur_time = time.time()
69 |     step_time = cur_time - last_time
70 |     last_time = cur_time
71 |     tot_time = cur_time - begin_time
72 |
73 |     L = []
74 |     L.append('  Step: %s' % format_time(step_time))
75 |     L.append(' | Tot: %s' % format_time(tot_time))
76 |     if msg:
77 |         L.append(' | ' + msg)
78 |
79 |     msg = ''.join(L)
80 |     sys.stdout.write(msg)
81 |     for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3):
82 |         sys.stdout.write(' ')
83 |
84 |     # Go back to the center of the bar.
85 |     for i in range(term_width-int(TOTAL_BAR_LENGTH/2)):
86 |         sys.stdout.write('\b')
87 |     sys.stdout.write(' %d/%d ' % (current+1, total))
88 |
89 |     if current < total-1:
90 |         sys.stdout.write('\r')
91 |     else:
92 |         sys.stdout.write('\n')
93 |     sys.stdout.flush()
94 |
95 | def format_time(seconds):
96 |     days = int(seconds / 3600/24)
97 |     seconds = seconds - days*3600*24
98 |     hours = int(seconds / 3600)
99 |     seconds = seconds - hours*3600
100 |     minutes = int(seconds / 60)
101 |     seconds = seconds - minutes*60
102 |     secondsf = int(seconds)
103 |     seconds = seconds - secondsf
104 |     millis = int(seconds*1000)
105 |
106 |     f = ''
107 |     i = 1
108 |     if days > 0:
109 |         f += str(days) + 'D'
110 |         i += 1
111 |     if hours > 0 and i <= 2:
112 |         f += str(hours) + 'h'
113 |         i += 1
114 |     if minutes > 0 and i <= 2:
115 |         f += str(minutes) + 'm'
116 |         i += 1
117 |     if secondsf > 0 and i <= 2:
118 |         f += str(secondsf) + 's'
119 |         i += 1
120 |     if millis > 0 and i <= 2:
121 |         f += str(millis) + 'ms'
122 |         i += 1
123 |     if f == '':
124 |         f = '0ms'
125 |     return f
126 |
--------------------------------------------------------------------------------
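For reference, a few hand-checked format_time outputs (at most two units are printed; note that importing utils runs the stty probe at module load, so this sketch assumes a real terminal session):

from utils import format_time

assert format_time(0.1234) == '123ms'
assert format_time(63.5) == '1m3s'
assert format_time(3661.0) == '1h1m'
assert format_time(90000) == '1D1h'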