├── LICENSE
├── README.md
├── classify.py
├── data_transforms.py
├── dataset.py
├── dla.py
├── dla_up.py
├── folder.py
├── lib
│   ├── Makefile
│   ├── build.py
│   ├── dense
│   │   ├── __init__.py
│   │   ├── batch_norm
│   │   │   ├── __init__.py
│   │   │   └── _batch_norm.so
│   │   └── batchnormp_kernel.so
│   ├── eaconv
│   │   ├── __init__.py
│   │   ├── build.py
│   │   ├── functions
│   │   │   ├── __init__.py
│   │   │   └── eaconv.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── eaconv.py
│   │   │   └── util.py
│   │   └── src
│   │       ├── EAConv2d_cuda.c
│   │       ├── EAConv2d_cuda.h
│   │       ├── EAConv2d_kernel.cu
│   │       ├── EAConv2d_kernel.h
│   │       ├── conv_params.cu
│   │       ├── conv_params.h
│   │       ├── cuda_check.h
│   │       ├── handle.cu
│   │       └── handle.h
│   ├── functions
│   │   ├── __init__.py
│   │   └── batchnormp.py
│   ├── make_eaconv.sh
│   ├── modules
│   │   ├── __init__.py
│   │   └── batchnormsync.py
│   ├── src
│   │   ├── batchnormp.c
│   │   ├── batchnormp.h
│   │   ├── batchnormp_cuda.c
│   │   ├── batchnormp_cuda.h
│   │   ├── batchnormp_cuda_kernel.cu
│   │   ├── batchnormp_cuda_kernel.h
│   │   └── generic
│   │       └── batchnormp_cuda.cu
│   └── test.py
├── scripts
│   ├── pre-commit.sh
│   └── setup.sh
└── segment.py
/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Fisher Yu 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Classification 2 | 3 | Train image classification on ImageNet 4 | 5 | To train with mostly default settings: 6 | 7 | ``` 8 | python3 classify.py train -a dla34 9 | 10 | ``` 11 | 12 | With the data settings specified explicitly: 13 | ``` 14 | python3 classify.py train -a dla34 --data-name imagenet \ 15 | --classes 1000 -j 4 --epochs 120 --start-epoch 0 --batch-size 256 \ 16 | --crop-size 224 --scale-size 256 17 | ``` 18 | 19 | If you want to train on a dataset that is not already defined in `dataset.py`, please specify a new data name and put `info.json` in the data folder.
`info.json` contains a dictionary with the required values `mean` and `std`, which are the mean and standard deviation of the images in the new dataset. A full set of options can be found in [`dataset.py`](dataset.py#L6). The other useful fields are `eigval` and `eigvec`, which are the eigenvalues and eigenvectors of the image pixel variations in the dataset. A minimal `info.json` looks like: 20 | 21 | ``` 22 | { 23 | "mean": [0.485, 0.456, 0.406], 24 | "std": [0.229, 0.224, 0.225] 25 | } 26 | ``` 27 | 28 | If the new dataset contains 2 classes, the command can start with: 29 | 30 | ``` 31 | python3 classify.py train -a dla34 --data-name new_data \ 32 | --classes 2 33 | ``` 34 | 35 | If you want to start your training with models pretrained on ImageNet and fine-tune the model with learning rate 0.01, you can run: 36 | 37 | ``` 38 | python3 classify.py train -a dla34 --data-name new_data \ 39 | --classes 2 --pretrained imagenet --lr 0.01 40 | ``` 41 | 42 | ## Segmentation and Boundary Prediction 43 | 44 | The segmentation and boundary prediction data format is the same as in 45 | [DRN](https://github.com/fyu/drn#prepare-data). 46 | 47 | To use `--bn-sync`, please include `lib` in `PYTHONPATH`. 48 | 49 | Cityscapes 50 | 51 | ``` 52 | python3 segment.py train -d -c 19 -s 832 --arch dla102up \ 53 | --scale 0 --batch-size 16 --lr 0.01 --momentum 0.9 --lr-mode poly \ 54 | --epochs 500 --bn-sync --random-scale 2 --random-rotate 10 \ 55 | --random-color --pretrained-base imagenet 56 | ``` 57 | 58 | `--bn-sync` is not necessary for CamVid and boundary prediction on GPUs with 12 GB of memory. 59 | 60 | CamVid 61 | 62 | ``` 63 | python3 segment.py train -d -c 11 -s 448 --arch dla102up \ 64 | --scale 0 --batch-size 16 --epochs 1200 --lr 0.01 --momentum 0.9 \ 65 | --step 800 --pretrained-base imagenet --random-scale 2 --random-rotate 10 \ 66 | --random-color --save-freq 50 67 | ``` 68 | 69 | BSDS 70 | 71 | ``` 72 | python3 segment.py train -d -c 2 -s 416 --arch dla102up \ 73 | --scale 0 --batch-size 16 --epochs 1200 --lr 0.01 --momentum 0.9 \ 74 | --step 800 --pretrained-base imagenet --random-rotate 180 --random-color \ 75 | --save-freq 50 --edge-weight 10 --bn-sync 76 | ``` 77 | 78 | PASCAL Boundary 79 | 80 | ``` 81 | python3 segment.py train -d -c 2 -s 480 --arch dla102up \ 82 | --scale 0 --batch-size 32 --epochs 400 --lr 0.01 --momentum 0.9 \ 83 | --step 200 --pretrained-base imagenet --random-rotate 10 --random-color \ 84 | --save-freq 25 --edge-weight 10 85 | ``` 86 | 87 | ## FAQ 88 | 89 | *How many GPUs does the program require for training?* 90 | 91 | We tested all the training on GPUs with at least 12 GB of memory. We usually tried to use the fewest GPUs that could fit the batch size, so the actual number of required GPUs differs between models, depending on model size. Some training runs may require 8 GPUs, such as training `dla102up` on the Cityscapes dataset.
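
For a new dataset, the `mean` and `std` required in `info.json` (see above) have to be estimated from the training images. The following is a minimal sketch for doing so; it is not part of this repository, and the `new_data/train/*/*.jpg` layout is only an assumption matching the `ImageFolder` convention:

```
# Sketch (not part of this repo): estimate per-channel mean/std for info.json.
import glob
import json

import numpy as np
from PIL import Image

pixel_sum = np.zeros(3)
pixel_sqsum = np.zeros(3)
count = 0
for path in glob.glob('new_data/train/*/*.jpg'):  # directory layout assumed
    img = np.asarray(Image.open(path).convert('RGB'), dtype=np.float64) / 255
    flat = img.reshape(-1, 3)
    pixel_sum += flat.sum(axis=0)
    pixel_sqsum += (flat ** 2).sum(axis=0)
    count += flat.shape[0]

mean = pixel_sum / count
std = np.sqrt(pixel_sqsum / count - mean ** 2)
with open('new_data/info.json', 'w') as f:
    json.dump({'mean': mean.tolist(), 'std': std.tolist()}, f)
```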
-------------------------------------------------------------------------------- /classify.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import shutil 3 | import time 4 | 5 | import numpy as np 6 | import os 7 | 8 | import sys 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.parallel 12 | import torch.backends.cudnn as cudnn 13 | import torch.optim 14 | import torch.utils.data 15 | import torchvision.transforms as transforms 16 | import torchvision.datasets as datasets 17 | 18 | import dla 19 | import dataset 20 | 21 | from folder import ImageFolder 22 | import data_transforms 23 | 24 | model_names = sorted(name for name in dla.__dict__ 25 | if name.islower() and not name.startswith("__") 26 | and callable(dla.__dict__[name])) 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser(description='DLA ImageNet Training') 31 | parser.add_argument('cmd', choices=['train', 'test']) 32 | parser.add_argument('data', metavar='DIR', 33 | help='path to dataset') 34 | parser.add_argument('--data-name', default='imagenet', 35 | help='Name of the dataset') 36 | parser.add_argument('--arch', '-a', metavar='ARCH', default='dla34', 37 | choices=model_names, 38 | help='model architecture: ' + ' | '.join(model_names) + 39 | ' (default: dla34)') 40 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 41 | help='number of data loading workers (default: 4)') 42 | parser.add_argument('--epochs', default=120, type=int, metavar='N', 43 | help='number of total epochs to run') 44 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 45 | help='manual epoch number (useful on restarts)') 46 | parser.add_argument('-b', '--batch-size', default=256, type=int, 47 | metavar='N', help='mini-batch size (default: 256)') 48 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 49 | metavar='LR', help='initial learning rate') 50 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 51 | help='momentum') 52 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, 53 | metavar='W', help='weight decay (default: 1e-4)') 54 | parser.add_argument('--print-freq', '-p', default=10, type=int, 55 | metavar='N', help='print frequency (default: 10)') 56 | parser.add_argument('--check-freq', default=1, type=int, 57 | help='checkpoint frequency (default: 1)') 58 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 59 | help='path to latest checkpoint (default: none)') 60 | parser.add_argument('-e', '--evaluate', dest='evaluate', 61 | action='store_true', 62 | help='evaluate model on validation set') 63 | parser.add_argument('--pretrained', dest='pretrained', default=None, 64 | help='use pre-trained model for ' 65 | 'the specified dataset.') 66 | parser.add_argument('--classes', default=1000, type=int, 67 | help='Number of classes in the model') 68 | parser.add_argument('--lr-adjust', dest='lr_adjust', 69 | choices=['step'], default='step') 70 | parser.add_argument('--crop-size', dest='crop_size', type=int, default=224) 71 | parser.add_argument('--scale-size', dest='scale_size', type=int, 72 | default=256) 73 | parser.add_argument('--crop-10', dest='crop_10', action='store_true') 74 | parser.add_argument('--down-ratio', dest='down_ratio', type=int, default=8, 75 | help='model downsampling ratio') 76 | parser.add_argument('--step-ratio', dest='step_ratio', default=0.1, 77 | type=float) 78 | parser.add_argument('--no-cuda',
action='store_true', default=False, 79 | help='disables CUDA training') 80 | parser.add_argument('--random-color', action='store_true', default=False) 81 | parser.add_argument('--min-area-ratio', default=0.08, type=float) 82 | parser.add_argument('--aspect-ratio', type=float, default=4./3) 83 | args = parser.parse_args() 84 | args.cuda = not args.no_cuda and torch.cuda.is_available() 85 | 86 | print(' '.join(sys.argv)) 87 | 88 | return args 89 | 90 | 91 | def main(): 92 | args = parse_args() 93 | print(args) 94 | if args.cmd == 'train': 95 | run_training(args) 96 | elif args.cmd == 'test': 97 | test_model(args) 98 | 99 | 100 | def run_training(args): 101 | model = dla.__dict__[args.arch]( 102 | pretrained=args.pretrained, num_classes=args.classes, 103 | pool_size=args.crop_size // 32) 104 | model = torch.nn.DataParallel(model) 105 | 106 | best_prec1 = 0 107 | 108 | # optionally resume from a checkpoint 109 | if args.resume: 110 | if os.path.isfile(args.resume): 111 | print("=> loading checkpoint '{}'".format(args.resume)) 112 | checkpoint = torch.load(args.resume) 113 | args.start_epoch = checkpoint['epoch'] 114 | best_prec1 = checkpoint['best_prec1'] 115 | model.load_state_dict(checkpoint['state_dict']) 116 | print("=> loaded checkpoint '{}' (epoch {})" 117 | .format(args.resume, checkpoint['epoch'])) 118 | else: 119 | print("=> no checkpoint found at '{}'".format(args.resume)) 120 | 121 | cudnn.benchmark = True 122 | 123 | data = dataset.get_data(args.data_name) 124 | if data is None: 125 | data = dataset.load_dataset_info(args.data, data_name=args.data_name) 126 | if data is None: 127 | raise ValueError(('{} is not pre-defined in dataset.py and info.json ' 128 | 'does not exist in {}').format(args.data_name, args.data)) 129 | 130 | # Data loading code 131 | traindir = os.path.join(args.data, 'train') 132 | valdir = os.path.join(args.data, 'val') 133 | normalize = data_transforms.Normalize(mean=data.mean, std=data.std) 134 | tt = [data_transforms.RandomResizedCrop( 135 | args.crop_size, min_area_ratio=args.min_area_ratio, 136 | aspect_ratio=args.aspect_ratio)] 137 | if data.eigval is not None and data.eigvec is not None \ 138 | and args.random_color: 139 | lighting = data_transforms.Lighting(0.1, data.eigval, data.eigvec) 140 | jitter = data_transforms.RandomJitter(0.4, 0.4, 0.4) 141 | tt.extend([jitter, lighting]) 142 | tt.extend([data_transforms.RandomHorizontalFlip(), 143 | data_transforms.ToTensor(), 144 | normalize]) 145 | 146 | train_loader = torch.utils.data.DataLoader( 147 | datasets.ImageFolder(traindir, data_transforms.Compose(tt)), 148 | batch_size=args.batch_size, shuffle=True, 149 | num_workers=args.workers, pin_memory=True) 150 | 151 | val_loader = torch.utils.data.DataLoader( 152 | datasets.ImageFolder(valdir, transforms.Compose([ 153 | transforms.Resize(args.scale_size), 154 | transforms.CenterCrop(args.crop_size), 155 | transforms.ToTensor(), 156 | normalize 157 | ])), 158 | batch_size=args.batch_size, shuffle=False, 159 | num_workers=args.workers, pin_memory=True) 160 | 161 | # define loss function (criterion) and optimizer 162 | criterion = nn.CrossEntropyLoss() 163 | 164 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 165 | momentum=args.momentum, 166 | weight_decay=args.weight_decay) 167 | 168 | if args.cuda: 169 | model = model.cuda() 170 | criterion = criterion.cuda() 171 | 172 | if args.evaluate: 173 | validate(args, val_loader, model, criterion) 174 | return 175 | 176 | for epoch in range(args.start_epoch, args.epochs): 177 | adjust_learning_rate(args,
optimizer, epoch) 178 | 179 | # train for one epoch 180 | train(args, train_loader, model, criterion, optimizer, epoch) 181 | 182 | # evaluate on validation set 183 | prec1 = validate(args, val_loader, model, criterion) 184 | 185 | # remember best prec@1 and save checkpoint 186 | is_best = prec1 > best_prec1 187 | best_prec1 = max(prec1, best_prec1) 188 | checkpoint_path = 'checkpoint_latest.pth.tar' 189 | save_checkpoint({ 190 | 'epoch': epoch + 1, 191 | 'arch': args.arch, 192 | 'state_dict': model.state_dict(), 193 | 'best_prec1': best_prec1, 194 | }, is_best, filename=checkpoint_path) 195 | if (epoch + 1) % args.check_freq == 0: 196 | history_path = 'checkpoint_{:03d}.pth.tar'.format(epoch + 1) 197 | shutil.copyfile(checkpoint_path, history_path) 198 | 199 | 200 | def test_model(args): 201 | # create model 202 | model = dla.__dict__[args.arch](pretrained=args.pretrained, 203 | pool_size=args.crop_size // 32) 204 | model = torch.nn.DataParallel(model) 205 | 206 | # optionally resume from a checkpoint 207 | if args.resume: 208 | if os.path.isfile(args.resume): 209 | print("=> loading checkpoint '{}'".format(args.resume)) 210 | checkpoint = torch.load(args.resume) 211 | args.start_epoch = checkpoint['epoch'] 212 | best_prec1 = checkpoint['best_prec1'] 213 | model.load_state_dict(checkpoint['state_dict']) 214 | print("=> loaded checkpoint '{}' (epoch {} prec {:.03f}) " 215 | .format(args.resume, checkpoint['epoch'], best_prec1)) 216 | else: 217 | print("=> no checkpoint found at '{}'".format(args.resume)) 218 | 219 | cudnn.benchmark = True 220 | 221 | data = dataset.get_data(args.data_name) 222 | if data is None: 223 | data = dataset.load_dataset_info(args.data, data_name=args.data_name) 224 | if data is None: 225 | raise ValueError(('{} is not pre-defined in dataset.py and info.json ' 226 | 'does not exist in {}').format(args.data_name, args.data)) 227 | # Data loading code 228 | valdir = os.path.join(args.data, 'val') 229 | normalize = transforms.Normalize(mean=data.mean, std=data.std) 230 | 231 | if args.crop_10: 232 | t = transforms.Compose([ 233 | transforms.Resize(args.scale_size), 234 | transforms.ToTensor(), 235 | normalize]) 236 | else: 237 | t = transforms.Compose([ 238 | transforms.Resize(args.scale_size), 239 | transforms.CenterCrop(args.crop_size), 240 | transforms.ToTensor(), 241 | normalize]) 242 | val_loader = torch.utils.data.DataLoader( 243 | ImageFolder(valdir, t, out_name=args.crop_10), 244 | batch_size=args.batch_size, shuffle=False, 245 | num_workers=args.workers, pin_memory=True) 246 | 247 | # define loss function (criterion) and optimizer 248 | criterion = nn.CrossEntropyLoss() 249 | 250 | if args.cuda: 251 | model = model.cuda() 252 | criterion = criterion.cuda() 253 | 254 | if args.crop_10: 255 | validate_10(args, val_loader, model, 256 | '{}_i_{}_c_10.txt'.format(args.arch, args.start_epoch)) 257 | else: 258 | validate(args, val_loader, model, criterion) 259 | 260 | 261 | def train(args, train_loader, model, criterion, optimizer, epoch): 262 | batch_time = AverageMeter() 263 | data_time = AverageMeter() 264 | losses = AverageMeter() 265 | top1 = AverageMeter() 266 | top5 = AverageMeter() 267 | 268 | # switch to train mode 269 | model.train() 270 | 271 | end = time.time() 272 | for i, (input, target) in enumerate(train_loader): 273 | # measure data loading time 274 | data_time.update(time.time() - end) 275 | if args.cuda: 276 | target = target.cuda(async=True) 277 | input_var = torch.autograd.Variable(input) 278 | target_var = torch.autograd.Variable(target) 279 | 280 | # 
compute output 281 | output = model(input_var) 282 | loss = criterion(output, target_var) 283 | 284 | # measure accuracy and record loss 285 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 286 | losses.update(loss.data[0], input.size(0)) 287 | top1.update(prec1[0], input.size(0)) 288 | top5.update(prec5[0], input.size(0)) 289 | 290 | # compute gradient and do SGD step 291 | optimizer.zero_grad() 292 | loss.backward() 293 | optimizer.step() 294 | 295 | # measure elapsed time 296 | batch_time.update(time.time() - end) 297 | end = time.time() 298 | 299 | if i % args.print_freq == 0: 300 | print('Epoch: [{0}][{1}/{2}]\t' 301 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 302 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 303 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 304 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 305 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( 306 | epoch, i, len(train_loader), batch_time=batch_time, 307 | data_time=data_time, loss=losses, top1=top1, top5=top5)) 308 | 309 | 310 | def validate(args, val_loader, model, criterion): 311 | batch_time = AverageMeter() 312 | losses = AverageMeter() 313 | top1 = AverageMeter() 314 | top5 = AverageMeter() 315 | 316 | # switch to evaluate mode 317 | model.eval() 318 | 319 | end = time.time() 320 | for i, (input, target) in enumerate(val_loader): 321 | if args.cuda: 322 | target = target.cuda(async=True) 323 | input_var = torch.autograd.Variable(input, volatile=True) 324 | target_var = torch.autograd.Variable(target, volatile=True) 325 | 326 | # compute output 327 | output = model(input_var) 328 | loss = criterion(output, target_var) 329 | 330 | # measure accuracy and record loss 331 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 332 | losses.update(loss.data[0], input.size(0)) 333 | top1.update(prec1[0], input.size(0)) 334 | top5.update(prec5[0], input.size(0)) 335 | 336 | # measure elapsed time 337 | batch_time.update(time.time() - end) 338 | end = time.time() 339 | 340 | if i % args.print_freq == 0: 341 | print('Test: [{0}/{1}]\t' 342 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 343 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 344 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 345 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( 346 | i, len(val_loader), batch_time=batch_time, loss=losses, 347 | top1=top1, top5=top5)) 348 | 349 | print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}' 350 | .format(top1=top1, top5=top5)) 351 | 352 | return top1.avg 353 | 354 | 355 | def sample_10(image, crop_dims): 356 | # Dimensions and center. 
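# (added note) sample_10 implements the standard 10-crop evaluation protocol:
# it takes the four corner crops and the center crop of size crop_dims,
# then appends a horizontally mirrored copy of each of the five crops.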
357 | image = image.numpy() 358 | im_shape = np.array(image.shape[2:]) 359 | crop_dims = np.array(crop_dims) 360 | im_center = im_shape[:2] / 2.0 361 | 362 | # Make crop coordinates 363 | h_indices = (0, im_shape[0] - crop_dims[0]) 364 | w_indices = (0, im_shape[1] - crop_dims[1]) 365 | crops_ix = np.empty((5, 4), dtype=int) 366 | curr = 0 367 | for i in h_indices: 368 | for j in w_indices: 369 | crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1]) 370 | curr += 1 371 | crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate([ 372 | -crop_dims / 2.0, 373 | crop_dims / 2.0 374 | ]) 375 | crops_ix = np.tile(crops_ix, (2, 1)) 376 | 377 | # Extract crops 378 | crops = np.empty((10, image.shape[1], crop_dims[0], crop_dims[1]), 379 | dtype=np.float32) 380 | ix = 0 381 | for crop in crops_ix: 382 | crops[ix] = image[0, :, crop[0]:crop[2], crop[1]:crop[3]] 383 | ix += 1 384 | crops[ix-5:ix] = crops[ix-5:ix, :, :, ::-1] # flip for mirrors 385 | return torch.from_numpy(crops) 386 | 387 | 388 | def validate_10(args, data_loader, model, out_path): 389 | batch_time = AverageMeter() 390 | losses = AverageMeter() 391 | top1 = AverageMeter() 392 | top5 = AverageMeter() 393 | 394 | # switch to evaluate mode 395 | model.eval() 396 | 397 | sm = nn.functional.softmax 398 | criterion = nn.NLLLoss() 399 | out_fp = open(out_path, 'w') 400 | end = time.time() 401 | for i, (input, target, name) in enumerate(data_loader): 402 | assert input.size(0) == 1 403 | input = sample_10(input, (224, 224)) 404 | if args.cuda: 405 | target = target.cuda(async=True) 406 | input_var = torch.autograd.Variable(input, volatile=True) 407 | target_var = torch.autograd.Variable(target, volatile=True) 408 | 409 | # compute output 410 | output = model(input_var) 411 | output = sm(output) 412 | output = torch.mean(output, 0) 413 | loss = criterion(output, target_var) 414 | 415 | # measure accuracy and record loss 416 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 417 | losses.update(loss.data[0], input.size(0)) 418 | top1.update(prec1[0], input.size(0)) 419 | top5.update(prec5[0], input.size(0)) 420 | 421 | _, pred = output.topk(10, 1, True, True) 422 | pred = pred.view(-1).data.cpu().numpy() 423 | output = output.view(-1).data.cpu().numpy() 424 | print(name[0], ','.join("{},{:.03f}".format(pred[i], output[pred[i]]) 425 | for i in range(10)), 426 | sep=',', file=out_fp, flush=True) 427 | 428 | # measure elapsed time 429 | batch_time.update(time.time() - end) 430 | end = time.time() 431 | 432 | if i % args.print_freq == 0: 433 | print('Test: [{0}/{1}]\t' 434 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 435 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 436 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 437 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( 438 | i, len(data_loader), batch_time=batch_time, loss=losses, 439 | top1=top1, top5=top5)) 440 | out_fp.close() 441 | 442 | 443 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 444 | torch.save(state, filename) 445 | if is_best: 446 | shutil.copyfile(filename, 'model_best.pth.tar') 447 | 448 | 449 | class AverageMeter(object): 450 | """Computes and stores the average and current value""" 451 | def __init__(self): 452 | self.reset() 453 | 454 | def reset(self): 455 | self.val = 0 456 | self.avg = 0 457 | self.sum = 0 458 | self.count = 0 459 | 460 | def update(self, val, n=1): 461 | self.val = val 462 | self.sum += val * n 463 | self.count += n 464 | self.avg = self.sum / self.count 465 | 466 | 467 | def adjust_learning_rate(args, 
optimizer, epoch): 468 | """Sets the learning rate to the initial LR decayed 469 | by `step_ratio` (0.1 by default) every 30 epochs""" 470 | if args.lr_adjust == 'step': 471 | lr = args.lr * (args.step_ratio ** (epoch // 30)) 472 | else: 473 | raise ValueError() 474 | print('Epoch [{}] Learning rate: {:0.6f}'.format(epoch, lr)) 475 | for param_group in optimizer.param_groups: 476 | param_group['lr'] = lr 477 | 478 | 479 | def accuracy(output, target, topk=(1,)): 480 | """Computes the precision@k for the specified values of k""" 481 | maxk = max(topk) 482 | batch_size = target.size(0) 483 | 484 | _, pred = output.topk(maxk, 1, True, True) 485 | pred = pred.t() 486 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 487 | 488 | res = [] 489 | for k in topk: 490 | correct_k = correct[:k].view(-1).float().sum(0) 491 | res.append(correct_k.mul_(100.0 / batch_size)) 492 | return res 493 | 494 | 495 | if __name__ == '__main__': 496 | main() 497 | -------------------------------------------------------------------------------- /data_transforms.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numbers 3 | import pdb 4 | import random 5 | 6 | import numpy as np 7 | from PIL import Image, ImageOps, ImageEnhance 8 | import torch 9 | 10 | 11 | def resize(img, size, interpolation=Image.BILINEAR): 12 | """Resize the input PIL Image to the given size. 13 | """ 14 | 15 | if isinstance(size, int): 16 | w, h = img.size 17 | if (w <= h and w == size) or (h <= w and h == size): 18 | return img 19 | if w < h: 20 | ow = size 21 | oh = int(size * h / w) 22 | return img.resize((ow, oh), interpolation) 23 | else: 24 | oh = size 25 | ow = int(size * w / h) 26 | return img.resize((ow, oh), interpolation) 27 | else: 28 | return img.resize(size[::-1], interpolation) 29 | 30 | 31 | def crop(img, i, j, h, w): 32 | """Crop the given PIL Image. 33 | Args: 34 | img (PIL Image): Image to be cropped. 35 | i: Upper pixel coordinate. 36 | j: Left pixel coordinate. 37 | h: Height of the cropped image. 38 | w: Width of the cropped image. 39 | Returns: 40 | PIL Image: Cropped image. 41 | """ 42 | return img.crop((j, i, j + w, i + h)) 43 | 44 | 45 | def resized_crop(img, i, j, h, w, size, interpolation=Image.BILINEAR): 46 | """Crop the given PIL Image and resize it to desired size. 47 | Notably used in RandomResizedCrop. 48 | Args: 49 | img (PIL Image): Image to be cropped. 50 | i: Upper pixel coordinate. 51 | j: Left pixel coordinate. 52 | h: Height of the cropped image. 53 | w: Width of the cropped image. 54 | size (sequence or int): Desired output size. 55 | interpolation (int, optional): Desired interpolation. Default is 56 | ``PIL.Image.BILINEAR``. 57 | Returns: 58 | PIL Image: Cropped image. 59 | """ 60 | img = crop(img, i, j, h, w) 61 | img = resize(img, size, interpolation) 62 | return img 63 | 64 | 65 | class RandomResizedCrop(object): 66 | """Crop the given PIL Image to random size and aspect ratio. 67 | A crop of random size of (0.08 to 1.0) of the original size and a random 68 | aspect ratio of 3/4 to 4/3 of the original aspect ratio is made. This crop 69 | is finally resized to given size. 70 | This is popularly used to train the Inception networks.
71 | Args: 72 | size: expected output size of each edge 73 | interpolation: Default: PIL.Image.BILINEAR 74 | """ 75 | 76 | def __init__(self, size, interpolation=Image.BILINEAR, 77 | min_area_ratio=0.08, aspect_ratio=4./3): 78 | self.size = (size, size) 79 | self.interpolation = interpolation 80 | self.min_area_ratio = min_area_ratio 81 | self.aspect_ratio = aspect_ratio 82 | 83 | def get_params(self, img): 84 | """Get parameters for ``crop`` for a random sized crop. 85 | Args: 86 | img (PIL Image): Image to be cropped. 87 | Returns: 88 | tuple: params (i, j, h, w) to be passed to ``crop`` for a random 89 | sized crop. 90 | """ 91 | for attempt in range(10): 92 | area = img.size[0] * img.size[1] 93 | target_area = random.uniform(self.min_area_ratio, 1.0) * area 94 | aspect_ratio = random.uniform( 95 | 1 / self.aspect_ratio, self.aspect_ratio) 96 | 97 | w = int(round(math.sqrt(target_area * aspect_ratio))) 98 | h = int(round(math.sqrt(target_area / aspect_ratio))) 99 | 100 | if random.random() < 0.5: 101 | w, h = h, w 102 | 103 | if w <= img.size[0] and h <= img.size[1]: 104 | i = random.randint(0, img.size[1] - h) 105 | j = random.randint(0, img.size[0] - w) 106 | return i, j, h, w 107 | 108 | # Fallback 109 | w = min(img.size[0], img.size[1]) 110 | i = (img.size[1] - w) // 2 111 | j = (img.size[0] - w) // 2 112 | return i, j, w, w 113 | 114 | def __call__(self, img, *args): 115 | """ 116 | Args: 117 | img (PIL Image): Image to be cropped. 118 | Returns: 119 | PIL Image: Randomly cropped and resized image. 120 | """ 121 | i, j, h, w = self.get_params(img) 122 | return (resized_crop(img, i, j, h, w, self.size, self.interpolation), 123 | *args) 124 | 125 | 126 | class RandomCrop(object): 127 | def __init__(self, size): 128 | if isinstance(size, numbers.Number): 129 | self.size = (int(size), int(size)) 130 | else: 131 | self.size = size 132 | 133 | def __call__(self, image, label, *args): 134 | assert label is None or image.size == label.size 135 | 136 | w, h = image.size 137 | tw, th = self.size 138 | top = bottom = left = right = 0 139 | if w < tw: 140 | left = (tw - w) // 2 141 | right = tw - w - left 142 | if h < th: 143 | top = (th - h) // 2 144 | bottom = th - h - top 145 | if left > 0 or right > 0 or top > 0 or bottom > 0: 146 | label = pad_image( 147 | 'constant', label, top, bottom, left, right, value=255) 148 | image = pad_image( 149 | 'reflection', image, top, bottom, left, right) 150 | w, h = image.size 151 | if w == tw and h == th: 152 | return (image, label, *args) 153 | x1 = random.randint(0, w - tw) 154 | y1 = random.randint(0, h - th) 155 | results = [image.crop((x1, y1, x1 + tw, y1 + th))] 156 | if label is not None: 157 | results.append(label.crop((x1, y1, x1 + tw, y1 + th))) 158 | results.extend(args) 159 | return results 160 | 161 | 162 | class RandomScale(object): 163 | def __init__(self, scale): 164 | if isinstance(scale, numbers.Number): 165 | scale = [1 / scale, scale] 166 | self.scale = scale 167 | 168 | def __call__(self, image, label): 169 | ratio = random.uniform(self.scale[0], self.scale[1]) 170 | w, h = image.size 171 | tw = int(ratio * w) 172 | th = int(ratio * h) 173 | if ratio == 1: 174 | return image, label 175 | elif ratio < 1: 176 | interpolation = Image.ANTIALIAS 177 | else: 178 | interpolation = Image.CUBIC 179 | return image.resize((tw, th), interpolation), \ 180 | label.resize((tw, th), Image.NEAREST) 181 | 182 | 183 | class RandomRotate(object): 184 | """Rotates the given PIL.Image (and the matching label, if any) 185 | by a random angle drawn uniformly from [-angle, angle] degrees. 186 | The image is reflection-padded before rotation and cropped back 187 | to its original size afterwards. 188 | """ 189 | 190 | def __init__(self, angle): 191 | self.angle = angle 192 | 193 | def __call__(self, image, label=None, *args): 194 | assert label is None or image.size == label.size 195 | 196 | w, h = image.size 197 | p = max((h, w)) 198 | angle = random.randint(0, self.angle * 2) - self.angle 199 | 200 | if label is not None: 201 | label = pad_image('constant', label, h, h, w, w, value=255) 202 | label = label.rotate(angle, resample=Image.NEAREST) 203 | label = label.crop((w, h, w + w, h + h)) 204 | 205 | image = pad_image('reflection', image, h, h, w, w) 206 | image = image.rotate(angle, resample=Image.BILINEAR) 207 | image = image.crop((w, h, w + w, h + h)) 208 | return image, label 209 | 210 | 211 | class RandomHorizontalFlip(object): 212 | """Randomly horizontally flips the given PIL.Image with a probability of 0.5 213 | """ 214 | 215 | def __call__(self, image, label=None): 216 | if random.random() < 0.5: 217 | image = image.transpose(Image.FLIP_LEFT_RIGHT) 218 | if label: 219 | label = label.transpose(Image.FLIP_LEFT_RIGHT) 220 | if label: 221 | return image, label 222 | else: 223 | return image, 224 | 225 | 226 | class Normalize(object): 227 | """Given mean: (R, G, B) and std: (R, G, B), 228 | will normalize each channel of the torch.*Tensor, i.e. 229 | channel = (channel - mean) / std 230 | """ 231 | 232 | def __init__(self, mean, std): 233 | self.mean = torch.FloatTensor(mean) 234 | self.std = torch.FloatTensor(std) 235 | 236 | def __call__(self, image, label=None): 237 | for t, m, s in zip(image, self.mean, self.std): 238 | t.sub_(m).div_(s) 239 | if label is None: 240 | return image 241 | else: 242 | return image, label 243 | 244 | 245 | def pad_reflection(image, top, bottom, left, right): 246 | if top == 0 and bottom == 0 and left == 0 and right == 0: 247 | return image 248 | h, w = image.shape[:2] 249 | next_top = next_bottom = next_left = next_right = 0 250 | if top > h - 1: 251 | next_top = top - h + 1 252 | top = h - 1 253 | if bottom > h - 1: 254 | next_bottom = bottom - h + 1 255 | bottom = h - 1 256 | if left > w - 1: 257 | next_left = left - w + 1 258 | left = w - 1 259 | if right > w - 1: 260 | next_right = right - w + 1 261 | right = w - 1 262 | new_shape = list(image.shape) 263 | new_shape[0] += top + bottom 264 | new_shape[1] += left + right 265 | new_image = np.empty(new_shape, dtype=image.dtype) 266 | new_image[top:top+h, left:left+w] = image 267 | new_image[:top, left:left+w] = image[top:0:-1, :] 268 | new_image[top+h:, left:left+w] = image[-1:-bottom-1:-1, :] 269 | new_image[:, :left] = new_image[:, left*2:left:-1] 270 | new_image[:, left+w:] = new_image[:, -right-1:-right*2-1:-1] 271 | return pad_reflection(new_image, next_top, next_bottom, 272 | next_left, next_right) 273 | 274 | 275 | def pad_constant(image, top, bottom, left, right, value): 276 | if top == 0 and bottom == 0 and left == 0 and right == 0: 277 | return image 278 | h, w = image.shape[:2] 279 | new_shape = list(image.shape) 280 | new_shape[0] += top + bottom 281 | new_shape[1] += left + right 282 | new_image = np.empty(new_shape, dtype=image.dtype) 283 | new_image.fill(value) 284 | new_image[top:top+h, left:left+w] = image 285 | return new_image 286 | 287 | 288 | def pad_image(mode, image, top, bottom, left, right, value=0): 289 | if mode == 'reflection': 290 | return Image.fromarray( 291 | pad_reflection(np.asarray(image), top, bottom, left, right))
292 | elif mode == 'constant': 293 | return Image.fromarray( 294 | pad_constant(np.asarray(image), top, bottom, left, right, value)) 295 | else: 296 | raise ValueError('Unknown mode {}'.format(mode)) 297 | 298 | 299 | class Pad(object): 300 | """Pads the given PIL.Image on all sides with the given "pad" value""" 301 | 302 | def __init__(self, padding, fill=0): 303 | assert isinstance(padding, numbers.Number) 304 | assert isinstance(fill, numbers.Number) or isinstance(fill, str) or \ 305 | isinstance(fill, tuple) 306 | self.padding = padding 307 | self.fill = fill 308 | 309 | def __call__(self, image, label=None, *args): 310 | if label is not None: 311 | label = pad_image( 312 | 'constant', label, 313 | self.padding, self.padding, self.padding, self.padding, 314 | value=255) 315 | if self.fill == -1: 316 | image = pad_image( 317 | 'reflection', image, 318 | self.padding, self.padding, self.padding, self.padding) 319 | else: 320 | image = pad_image( 321 | 'constant', image, 322 | self.padding, self.padding, self.padding, self.padding, 323 | value=self.fill) 324 | return (image, label, *args) 325 | 326 | 327 | class PadToSize(object): 328 | """Pads the given PIL.Image on all sides with the given "pad" value""" 329 | 330 | def __init__(self, side, fill=-1): 331 | assert isinstance(side, numbers.Number) 332 | assert isinstance(fill, numbers.Number) or isinstance(fill, str) or \ 333 | isinstance(fill, tuple) 334 | self.side = side 335 | self.fill = fill 336 | 337 | def __call__(self, image, label=None, *args): 338 | w, h = image.size 339 | s = self.side 340 | assert s >= w and s >= h 341 | top, left = (s - h) // 2, (s - w) // 2 342 | bottom = s - h - top 343 | right = s - w - left 344 | if label is not None: 345 | label = pad_image('constant', label, top, bottom, left, right, 346 | value=255) 347 | if self.fill == -1: 348 | image = pad_image('reflection', image, top, bottom, left, right) 349 | else: 350 | image = pad_image('constant', image, top, bottom, left, right, 351 | value=self.fill) 352 | return (image, label, *args) 353 | 354 | 355 | class PadImage(object): 356 | def __init__(self, padding, fill=0): 357 | assert isinstance(padding, numbers.Number) 358 | assert isinstance(fill, numbers.Number) or isinstance(fill, str) or \ 359 | isinstance(fill, tuple) 360 | self.padding = padding 361 | self.fill = fill 362 | 363 | def __call__(self, image, label=None, *args): 364 | if self.fill == -1: 365 | image = pad_image( 366 | 'reflection', image, 367 | self.padding, self.padding, self.padding, self.padding) 368 | else: 369 | image = ImageOps.expand(image, border=self.padding, fill=self.fill) 370 | return (image, label, *args) 371 | 372 | class ToTensor(object): 373 | """Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 374 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range 375 | [0.0, 1.0].
376 | """ 377 | 378 | def __call__(self, pic, label=None): 379 | if isinstance(pic, np.ndarray): 380 | # handle numpy array 381 | img = torch.from_numpy(pic) 382 | else: 383 | # handle PIL Image 384 | img = torch.ByteTensor( 385 | torch.ByteStorage.from_buffer(pic.tobytes())) 386 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 387 | if pic.mode == 'YCbCr': 388 | nchannel = 3 389 | else: 390 | nchannel = len(pic.mode) 391 | img = img.view(pic.size[1], pic.size[0], nchannel) 392 | # put it from HWC to CHW format 393 | # yikes, this transpose takes 80% of the loading time/CPU 394 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 395 | img = img.float().div(255) 396 | if label is None: 397 | return (img,) 398 | else: 399 | return img, torch.LongTensor(np.array(label, dtype=np.int)) 400 | 401 | 402 | class Compose(object): 403 | """Composes several transforms together. 404 | """ 405 | 406 | def __init__(self, transforms): 407 | self.transforms = transforms 408 | 409 | def __call__(self, *args): 410 | for t in self.transforms: 411 | args = t(*args) 412 | return args 413 | 414 | 415 | class Lighting(object): 416 | def __init__(self, alphastd, eigval, eigvec): 417 | self.alphastd = alphastd 418 | self.eigval = np.array(eigval) 419 | self.eigvec = np.array(eigvec) 420 | 421 | def __call__(self, image, *args): 422 | if self.alphastd == 0: 423 | return (image, *args) 424 | alpha = np.random.randn(3) * self.alphastd 425 | rgb = (self.eigvec @ np.diag(alpha * self.eigval)).sum(axis=1).\ 426 | round().astype(np.int32) 427 | image = np.asarray(image) 428 | image_type = image.dtype 429 | image = Image.fromarray( 430 | np.clip(image.astype(np.int32) + rgb, 0, 255).astype(image_type)) 431 | return (image, *args) 432 | 433 | 434 | class RandomBrightness(object): 435 | def __init__(self, var=0.4): 436 | self.var = var 437 | 438 | def __call__(self, image, *args): 439 | alpha = 1.0 + np.random.uniform(-self.var, self.var) 440 | image = ImageEnhance.Brightness(image).enhance(alpha) 441 | return (image, *args) 442 | 443 | 444 | class RandomColor(object): 445 | def __init__(self, var=0.4): 446 | self.var = var 447 | 448 | def __call__(self, image, *args): 449 | alpha = 1.0 + np.random.uniform(-self.var, self.var) 450 | image = ImageEnhance.Color(image).enhance(alpha) 451 | return (image, *args) 452 | 453 | 454 | class RandomContrast(object): 455 | def __init__(self, var=0.4): 456 | self.var = var 457 | 458 | def __call__(self, image, *args): 459 | alpha = 1.0 + np.random.uniform(-self.var, self.var) 460 | image = ImageEnhance.Contrast(image).enhance(alpha) 461 | return (image, *args) 462 | 463 | 464 | class RandomSharpness(object): 465 | def __init__(self, var=0.4): 466 | self.var = var 467 | 468 | def __call__(self, image, *args): 469 | alpha = 1.0 + np.random.uniform(-self.var, self.var) 470 | image = ImageEnhance.Sharpness(image).enhance(alpha) 471 | return (image, *args) 472 | 473 | 474 | class RandomChannel(object): 475 | def __init__(self): 476 | pass 477 | 478 | def __call__(self, image, *args): 479 | order = np.random.permutation(range(3)) 480 | image = np.asarray(image) 481 | out_image = np.empty(image.shape, dtype=image.dtype) 482 | for i in range(3): 483 | out_image[:, :, i] = image[:, :, order[i]] 484 | return (Image.fromarray(out_image), *args) 485 | 486 | 487 | class RandomJitter(object): 488 | def __init__(self, brightness, contrast, sharpness): 489 | self.jitter_funcs = [] 490 | if brightness > 0: 491 | self.jitter_funcs.append(RandomBrightness(brightness)) 492 | if contrast > 0: 
493 | self.jitter_funcs.append(RandomContrast(contrast)) 494 | if sharpness > 0: 495 | self.jitter_funcs.append(RandomSharpness(sharpness)) 496 | 497 | def __call__(self, image, *args): 498 | if len(self.jitter_funcs) == 0: 499 | return (image, *args) 500 | order = np.random.permutation(range(len(self.jitter_funcs))) 501 | for i in range(len(order)): 502 | image = self.jitter_funcs[order[i]](image)[0] 503 | return (image, *args) 504 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import json 3 | from os.path import exists, join 4 | 5 | 6 | Dataset = namedtuple('Dataset', ['model_hash', 'classes', 'mean', 'std', 7 | 'eigval', 'eigvec', 'name']) 8 | 9 | imagenet = Dataset(name='imagenet', 10 | classes=1000, 11 | mean=[0.485, 0.456, 0.406], 12 | std=[0.229, 0.224, 0.225], 13 | eigval=[55.46, 4.794, 1.148], 14 | eigvec=[[-0.5675, 0.7192, 0.4009], 15 | [-0.5808, -0.0045, -0.8140], 16 | [-0.5836, -0.6948, 0.4203]], 17 | model_hash={'dla34': 'ba72cf86', 18 | 'dla46_c': '2bfd52c3', 19 | 'dla46x_c': 'd761bae7', 20 | 'dla60x_c': 'b870c45c', 21 | 'dla60': '24839fc4', 22 | 'dla60x': 'd15cacda', 23 | 'dla102': 'd94d9790', 24 | 'dla102x': 'ad62be81', 25 | 'dla102x2': '262837b6', 26 | 'dla169': '0914e092'}) 27 | 28 | 29 | def get_data(data_name): 30 | try: 31 | return globals()[data_name] 32 | except KeyError: 33 | return None 34 | 35 | 36 | def load_dataset_info(data_dir, data_name='new_data'): 37 | info_path = join(data_dir, 'info.json') 38 | if not exists(info_path): 39 | return None 40 | info = json.load(open(info_path, 'r')) 41 | assert 'mean' in info and 'std' in info, \ 42 | 'mean and std are required for a dataset' 43 | data = Dataset(name=data_name, classes=0, 44 | mean=None, 45 | std=None, 46 | eigval=None, 47 | eigvec=None, 48 | model_hash=dict()) 49 | return data._replace(**info) 50 | -------------------------------------------------------------------------------- /dla.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import math 5 | from os.path import join 6 | 7 | import torch 8 | from torch import nn 9 | import torch.utils.model_zoo as model_zoo 10 | 11 | import dataset 12 | 13 | BatchNorm = nn.BatchNorm2d 14 | 15 | WEB_ROOT = 'http://dl.yf.io/dla/models' 16 | 17 | 18 | def get_model_url(data, name): 19 | return join(WEB_ROOT, data.name, 20 | '{}-{}.pth'.format(name, data.model_hash[name])) 21 | 22 | 23 | def conv3x3(in_planes, out_planes, stride=1): 24 | "3x3 convolution with padding" 25 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 26 | padding=1, bias=False) 27 | 28 | 29 | class BasicBlock(nn.Module): 30 | def __init__(self, inplanes, planes, stride=1, dilation=1): 31 | super(BasicBlock, self).__init__() 32 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, 33 | stride=stride, padding=dilation, 34 | bias=False, dilation=dilation) 35 | self.bn1 = BatchNorm(planes) 36 | self.relu = nn.ReLU(inplace=True) 37 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, 38 | stride=1, padding=dilation, 39 | bias=False, dilation=dilation) 40 | self.bn2 = BatchNorm(planes) 41 | self.stride = stride 42 | 43 | def forward(self, x, residual=None): 44 | if residual is None: 45 | residual = x 46 | 47 | out = self.conv1(x) 48 | out = self.bn1(out) 49 | out = self.relu(out) 50 | 51 | out = self.conv2(out) 52 | 
out = self.bn2(out) 53 | 54 | out += residual 55 | out = self.relu(out) 56 | 57 | return out 58 | 59 | 60 | class Bottleneck(nn.Module): 61 | expansion = 2 62 | 63 | def __init__(self, inplanes, planes, stride=1, dilation=1): 64 | super(Bottleneck, self).__init__() 65 | expansion = Bottleneck.expansion 66 | bottle_planes = planes // expansion 67 | self.conv1 = nn.Conv2d(inplanes, bottle_planes, 68 | kernel_size=1, bias=False) 69 | self.bn1 = BatchNorm(bottle_planes) 70 | self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, 71 | stride=stride, padding=dilation, 72 | bias=False, dilation=dilation) 73 | self.bn2 = BatchNorm(bottle_planes) 74 | self.conv3 = nn.Conv2d(bottle_planes, planes, 75 | kernel_size=1, bias=False) 76 | self.bn3 = BatchNorm(planes) 77 | self.relu = nn.ReLU(inplace=True) 78 | self.stride = stride 79 | 80 | def forward(self, x, residual=None): 81 | if residual is None: 82 | residual = x 83 | 84 | out = self.conv1(x) 85 | out = self.bn1(out) 86 | out = self.relu(out) 87 | 88 | out = self.conv2(out) 89 | out = self.bn2(out) 90 | out = self.relu(out) 91 | 92 | out = self.conv3(out) 93 | out = self.bn3(out) 94 | 95 | out += residual 96 | out = self.relu(out) 97 | 98 | return out 99 | 100 | 101 | class BottleneckX(nn.Module): 102 | expansion = 2 103 | cardinality = 32 104 | 105 | def __init__(self, inplanes, planes, stride=1, dilation=1): 106 | super(BottleneckX, self).__init__() 107 | cardinality = BottleneckX.cardinality 108 | # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0))) 109 | # bottle_planes = dim * cardinality 110 | bottle_planes = planes * cardinality // 32 111 | self.conv1 = nn.Conv2d(inplanes, bottle_planes, 112 | kernel_size=1, bias=False) 113 | self.bn1 = BatchNorm(bottle_planes) 114 | self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, 115 | stride=stride, padding=dilation, bias=False, 116 | dilation=dilation, groups=cardinality) 117 | self.bn2 = BatchNorm(bottle_planes) 118 | self.conv3 = nn.Conv2d(bottle_planes, planes, 119 | kernel_size=1, bias=False) 120 | self.bn3 = BatchNorm(planes) 121 | self.relu = nn.ReLU(inplace=True) 122 | self.stride = stride 123 | 124 | def forward(self, x, residual=None): 125 | if residual is None: 126 | residual = x 127 | 128 | out = self.conv1(x) 129 | out = self.bn1(out) 130 | out = self.relu(out) 131 | 132 | out = self.conv2(out) 133 | out = self.bn2(out) 134 | out = self.relu(out) 135 | 136 | out = self.conv3(out) 137 | out = self.bn3(out) 138 | 139 | out += residual 140 | out = self.relu(out) 141 | 142 | return out 143 | 144 | 145 | class Root(nn.Module): 146 | def __init__(self, in_channels, out_channels, kernel_size, residual): 147 | super(Root, self).__init__() 148 | self.conv = nn.Conv2d( 149 | in_channels, out_channels, kernel_size, 150 | stride=1, bias=False, padding=(kernel_size - 1) // 2) 151 | self.bn = BatchNorm(out_channels) 152 | self.relu = nn.ReLU(inplace=True) 153 | self.residual = residual 154 | 155 | def forward(self, *x): 156 | children = x 157 | x = self.conv(torch.cat(x, 1)) 158 | x = self.bn(x) 159 | if self.residual: 160 | x += children[0] 161 | x = self.relu(x) 162 | 163 | return x 164 | 165 | 166 | class Tree(nn.Module): 167 | def __init__(self, levels, block, in_channels, out_channels, stride=1, 168 | level_root=False, root_dim=0, root_kernel_size=1, 169 | dilation=1, root_residual=False): 170 | super(Tree, self).__init__() 171 | if root_dim == 0: 172 | root_dim = 2 * out_channels 173 | if level_root: 174 | root_dim += in_channels 175 | if levels == 1: 176 
| self.tree1 = block(in_channels, out_channels, stride, 177 | dilation=dilation) 178 | self.tree2 = block(out_channels, out_channels, 1, 179 | dilation=dilation) 180 | else: 181 | self.tree1 = Tree(levels - 1, block, in_channels, out_channels, 182 | stride, root_dim=0, 183 | root_kernel_size=root_kernel_size, 184 | dilation=dilation, root_residual=root_residual) 185 | self.tree2 = Tree(levels - 1, block, out_channels, out_channels, 186 | root_dim=root_dim + out_channels, 187 | root_kernel_size=root_kernel_size, 188 | dilation=dilation, root_residual=root_residual) 189 | if levels == 1: 190 | self.root = Root(root_dim, out_channels, root_kernel_size, 191 | root_residual) 192 | self.level_root = level_root 193 | self.root_dim = root_dim 194 | self.downsample = None 195 | self.project = None 196 | self.levels = levels 197 | if stride > 1: 198 | self.downsample = nn.MaxPool2d(stride, stride=stride) 199 | if in_channels != out_channels: 200 | self.project = nn.Sequential( 201 | nn.Conv2d(in_channels, out_channels, 202 | kernel_size=1, stride=1, bias=False), 203 | BatchNorm(out_channels) 204 | ) 205 | 206 | def forward(self, x, residual=None, children=None): 207 | children = [] if children is None else children 208 | bottom = self.downsample(x) if self.downsample else x 209 | residual = self.project(bottom) if self.project else bottom 210 | if self.level_root: 211 | children.append(bottom) 212 | x1 = self.tree1(x, residual) 213 | if self.levels == 1: 214 | x2 = self.tree2(x1) 215 | x = self.root(x2, x1, *children) 216 | else: 217 | children.append(x1) 218 | x = self.tree2(x1, children=children) 219 | return x 220 | 221 | 222 | class DLA(nn.Module): 223 | def __init__(self, levels, channels, num_classes=1000, 224 | block=BasicBlock, residual_root=False, return_levels=False, 225 | pool_size=7, linear_root=False): 226 | super(DLA, self).__init__() 227 | self.channels = channels 228 | self.return_levels = return_levels 229 | self.num_classes = num_classes 230 | self.base_layer = nn.Sequential( 231 | nn.Conv2d(3, channels[0], kernel_size=7, stride=1, 232 | padding=3, bias=False), 233 | BatchNorm(channels[0]), 234 | nn.ReLU(inplace=True)) 235 | self.level0 = self._make_conv_level( 236 | channels[0], channels[0], levels[0]) 237 | self.level1 = self._make_conv_level( 238 | channels[0], channels[1], levels[1], stride=2) 239 | self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, 240 | level_root=False, 241 | root_residual=residual_root) 242 | self.level3 = Tree(levels[3], block, channels[2], channels[3], 2, 243 | level_root=True, root_residual=residual_root) 244 | self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, 245 | level_root=True, root_residual=residual_root) 246 | self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, 247 | level_root=True, root_residual=residual_root) 248 | 249 | self.avgpool = nn.AvgPool2d(pool_size) 250 | self.fc = nn.Conv2d(channels[-1], num_classes, kernel_size=1, 251 | stride=1, padding=0, bias=True) 252 | 253 | for m in self.modules(): 254 | if isinstance(m, nn.Conv2d): 255 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 256 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 257 | elif isinstance(m, BatchNorm): 258 | m.weight.data.fill_(1) 259 | m.bias.data.zero_() 260 | 261 | def _make_level(self, block, inplanes, planes, blocks, stride=1): 262 | downsample = None 263 | if stride != 1 or inplanes != planes: 264 | downsample = nn.Sequential( 265 | nn.MaxPool2d(stride, stride=stride), 266 | nn.Conv2d(inplanes, planes, 267 | kernel_size=1, stride=1, bias=False), 268 | BatchNorm(planes), 269 | ) 270 | 271 | layers = [] 272 | layers.append(block(inplanes, planes, stride, downsample=downsample)) 273 | for i in range(1, blocks): 274 | layers.append(block(inplanes, planes)) 275 | 276 | return nn.Sequential(*layers) 277 | 278 | def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): 279 | modules = [] 280 | for i in range(convs): 281 | modules.extend([ 282 | nn.Conv2d(inplanes, planes, kernel_size=3, 283 | stride=stride if i == 0 else 1, 284 | padding=dilation, bias=False, dilation=dilation), 285 | BatchNorm(planes), 286 | nn.ReLU(inplace=True)]) 287 | inplanes = planes 288 | return nn.Sequential(*modules) 289 | 290 | def forward(self, x): 291 | y = [] 292 | x = self.base_layer(x) 293 | for i in range(6): 294 | x = getattr(self, 'level{}'.format(i))(x) 295 | y.append(x) 296 | if self.return_levels: 297 | return y 298 | else: 299 | x = self.avgpool(x) 300 | x = self.fc(x) 301 | x = x.view(x.size(0), -1) 302 | 303 | return x 304 | 305 | def load_pretrained_model(self, data_name, name): 306 | assert data_name in dataset.__dict__, \ 307 | 'No pretrained model for {}'.format(data_name) 308 | data = dataset.__dict__[data_name] 309 | fc = self.fc 310 | if self.num_classes != data.classes: 311 | self.fc = nn.Conv2d( 312 | self.channels[-1], data.classes, 313 | kernel_size=1, stride=1, padding=0, bias=True) 314 | try: 315 | model_url = get_model_url(data, name) 316 | except KeyError: 317 | raise ValueError( 318 | '{} trained on {} does not exist.'.format(name, data.name)) 319 | self.load_state_dict(model_zoo.load_url(model_url)) 320 | self.fc = fc 321 | 322 | 323 | def dla34(pretrained=None, **kwargs): # DLA-34 324 | model = DLA([1, 1, 1, 2, 2, 1], 325 | [16, 32, 64, 128, 256, 512], 326 | block=BasicBlock, **kwargs) 327 | if pretrained is not None: 328 | model.load_pretrained_model(pretrained, 'dla34') 329 | return model 330 | 331 | 332 | def dla46_c(pretrained=None, **kwargs): # DLA-46-C 333 | Bottleneck.expansion = 2 334 | model = DLA([1, 1, 1, 2, 2, 1], 335 | [16, 32, 64, 64, 128, 256], 336 | block=Bottleneck, **kwargs) 337 | if pretrained is not None: 338 | model.load_pretrained_model(pretrained, 'dla46_c') 339 | return model 340 | 341 | 342 | def dla46x_c(pretrained=None, **kwargs): # DLA-X-46-C 343 | BottleneckX.expansion = 2 344 | model = DLA([1, 1, 1, 2, 2, 1], 345 | [16, 32, 64, 64, 128, 256], 346 | block=BottleneckX, **kwargs) 347 | if pretrained is not None: 348 | model.load_pretrained_model(pretrained, 'dla46x_c') 349 | return model 350 | 351 | 352 | def dla60x_c(pretrained=None, **kwargs): # DLA-X-60-C 353 | BottleneckX.expansion = 2 354 | model = DLA([1, 1, 1, 2, 3, 1], 355 | [16, 32, 64, 64, 128, 256], 356 | block=BottleneckX, **kwargs) 357 | if pretrained is not None: 358 | model.load_pretrained_model(pretrained, 'dla60x_c') 359 | return model 360 | 361 | 362 | def dla60(pretrained=None, **kwargs): # DLA-60 363 | Bottleneck.expansion = 2 364 | model = DLA([1, 1, 1, 2, 3, 1], 365 | [16, 32, 128, 256, 512, 1024], 366 | block=Bottleneck, **kwargs) 367 | if pretrained is not None: 368 | model.load_pretrained_model(pretrained, 'dla60') 369 | 
return model 370 | 371 | 372 | def dla60x(pretrained=None, **kwargs): # DLA-X-60 373 | BottleneckX.expansion = 2 374 | model = DLA([1, 1, 1, 2, 3, 1], 375 | [16, 32, 128, 256, 512, 1024], 376 | block=BottleneckX, **kwargs) 377 | if pretrained is not None: 378 | model.load_pretrained_model(pretrained, 'dla60x') 379 | return model 380 | 381 | 382 | def dla102(pretrained=None, **kwargs): # DLA-102 383 | Bottleneck.expansion = 2 384 | model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], 385 | block=Bottleneck, residual_root=True, **kwargs) 386 | if pretrained is not None: 387 | model.load_pretrained_model(pretrained, 'dla102') 388 | return model 389 | 390 | 391 | def dla102x(pretrained=None, **kwargs): # DLA-X-102 392 | BottleneckX.expansion = 2 393 | model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], 394 | block=BottleneckX, residual_root=True, **kwargs) 395 | if pretrained is not None: 396 | model.load_pretrained_model(pretrained, 'dla102x') 397 | return model 398 | 399 | 400 | def dla102x2(pretrained=None, **kwargs): # DLA-X-102 64 401 | BottleneckX.cardinality = 64 402 | model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], 403 | block=BottleneckX, residual_root=True, **kwargs) 404 | if pretrained is not None: 405 | model.load_pretrained_model(pretrained, 'dla102x2') 406 | return model 407 | 408 | 409 | def dla169(pretrained=None, **kwargs): # DLA-169 410 | Bottleneck.expansion = 2 411 | model = DLA([1, 1, 2, 3, 5, 1], [16, 32, 128, 256, 512, 1024], 412 | block=Bottleneck, residual_root=True, **kwargs) 413 | if pretrained is not None: 414 | model.load_pretrained_model(pretrained, 'dla169') 415 | return model 416 | -------------------------------------------------------------------------------- /dla_up.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | 7 | import dla 8 | 9 | BatchNorm = nn.BatchNorm2d 10 | 11 | 12 | def set_bn(bn): 13 | global BatchNorm 14 | BatchNorm = bn 15 | dla.BatchNorm = bn 16 | 17 | 18 | class Identity(nn.Module): 19 | def __init__(self): 20 | super(Identity, self).__init__() 21 | 22 | def forward(self, x): 23 | return x 24 | 25 | 26 | def fill_up_weights(up): 27 | w = up.weight.data 28 | f = math.ceil(w.size(2) / 2) 29 | c = (2 * f - 1 - f % 2) / (2. 
* f) 30 | for i in range(w.size(2)): 31 | for j in range(w.size(3)): 32 | w[:, 0, i, j] = \ 33 | (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) 34 | 35 | class IDAUp(nn.Module): 36 | def __init__(self, node_kernel, out_dim, channels, up_factors): 37 | super(IDAUp, self).__init__() 38 | self.channels = channels 39 | self.out_dim = out_dim 40 | for i, c in enumerate(channels): 41 | if c == out_dim: 42 | proj = Identity() 43 | else: 44 | proj = nn.Sequential( 45 | nn.Conv2d(c, out_dim, 46 | kernel_size=1, stride=1, bias=False), 47 | BatchNorm(out_dim), 48 | nn.ReLU(inplace=True)) 49 | f = int(up_factors[i]) 50 | if f == 1: 51 | up = Identity() 52 | else: 53 | up = nn.ConvTranspose2d( 54 | out_dim, out_dim, f * 2, stride=f, padding=f // 2, 55 | output_padding=0, groups=out_dim, bias=False) 56 | fill_up_weights(up) 57 | setattr(self, 'proj_' + str(i), proj) 58 | setattr(self, 'up_' + str(i), up) 59 | 60 | for i in range(1, len(channels)): 61 | node = nn.Sequential( 62 | nn.Conv2d(out_dim * 2, out_dim, 63 | kernel_size=node_kernel, stride=1, 64 | padding=node_kernel // 2, bias=False), 65 | BatchNorm(out_dim), 66 | nn.ReLU(inplace=True)) 67 | setattr(self, 'node_' + str(i), node) 68 | 69 | for m in self.modules(): 70 | if isinstance(m, nn.Conv2d): 71 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 72 | m.weight.data.normal_(0, math.sqrt(2. / n)) 73 | elif isinstance(m, BatchNorm): 74 | m.weight.data.fill_(1) 75 | m.bias.data.zero_() 76 | 77 | def forward(self, layers): 78 | assert len(self.channels) == len(layers), \ 79 | '{} vs {} layers'.format(len(self.channels), len(layers)) 80 | layers = list(layers) 81 | for i, l in enumerate(layers): 82 | upsample = getattr(self, 'up_' + str(i)) 83 | project = getattr(self, 'proj_' + str(i)) 84 | layers[i] = upsample(project(l)) 85 | x = layers[0] 86 | y = [] 87 | for i in range(1, len(layers)): 88 | node = getattr(self, 'node_' + str(i)) 89 | x = node(torch.cat([x, layers[i]], 1)) 90 | y.append(x) 91 | return x, y 92 | 93 | 94 | class DLAUp(nn.Module): 95 | def __init__(self, channels, scales=(1, 2, 4, 8, 16), in_channels=None): 96 | super(DLAUp, self).__init__() 97 | if in_channels is None: 98 | in_channels = channels 99 | self.channels = channels 100 | channels = list(channels) 101 | scales = np.array(scales, dtype=int) 102 | for i in range(len(channels) - 1): 103 | j = -i - 2 104 | setattr(self, 'ida_{}'.format(i), 105 | IDAUp(3, channels[j], in_channels[j:], 106 | scales[j:] // scales[j])) 107 | scales[j + 1:] = scales[j] 108 | in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] 109 | 110 | def forward(self, layers): 111 | layers = list(layers) 112 | assert len(layers) > 1 113 | for i in range(len(layers) - 1): 114 | ida = getattr(self, 'ida_{}'.format(i)) 115 | x, y = ida(layers[-i - 2:]) 116 | layers[-i - 1:] = y 117 | return x 118 | 119 | 120 | class DLASeg(nn.Module): 121 | def __init__(self, base_name, classes, 122 | pretrained_base=None, down_ratio=2): 123 | super(DLASeg, self).__init__() 124 | assert down_ratio in [2, 4, 8, 16] 125 | self.first_level = int(np.log2(down_ratio)) 126 | self.base = dla.__dict__[base_name](pretrained=pretrained_base, 127 | return_levels=True) 128 | channels = self.base.channels 129 | scales = [2 ** i for i in range(len(channels[self.first_level:]))] 130 | self.dla_up = DLAUp(channels[self.first_level:], scales=scales) 131 | self.fc = nn.Sequential( 132 | nn.Conv2d(channels[self.first_level], classes, kernel_size=1, 133 | stride=1, padding=0, bias=True) 134 | ) 135 | up_factor = 2 ** 
self.first_level 136 | if up_factor > 1: 137 | up = nn.ConvTranspose2d(classes, classes, up_factor * 2, 138 | stride=up_factor, padding=up_factor // 2, 139 | output_padding=0, groups=classes, 140 | bias=False) 141 | fill_up_weights(up) 142 | up.weight.requires_grad = False 143 | else: 144 | up = Identity() 145 | self.up = up 146 | self.softmax = nn.LogSoftmax(dim=1) 147 | 148 | for m in self.fc.modules(): 149 | if isinstance(m, nn.Conv2d): 150 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 151 | m.weight.data.normal_(0, math.sqrt(2. / n)) 152 | elif isinstance(m, BatchNorm): 153 | m.weight.data.fill_(1) 154 | m.bias.data.zero_() 155 | 156 | def forward(self, x): 157 | x = self.base(x) 158 | x = self.dla_up(x[self.first_level:]) 159 | x = self.fc(x) 160 | y = self.softmax(self.up(x)) 161 | return y, x 162 | 163 | def optim_parameters(self, memo=None): 164 | for param in self.base.parameters(): 165 | yield param 166 | for param in self.dla_up.parameters(): 167 | yield param 168 | for param in self.fc.parameters(): 169 | yield param 170 | 171 | 172 | def dla34up(classes, pretrained_base=None, **kwargs): 173 | model = DLASeg('dla34', classes, pretrained_base=pretrained_base, **kwargs) 174 | return model 175 | 176 | 177 | def dla60up(classes, pretrained_base=None, **kwargs): 178 | model = DLASeg('dla60', classes, pretrained_base=pretrained_base, **kwargs) 179 | return model 180 | 181 | 182 | def dla102up(classes, pretrained_base=None, **kwargs): 183 | model = DLASeg('dla102', classes, 184 | pretrained_base=pretrained_base, **kwargs) 185 | return model 186 | 187 | 188 | def dla169up(classes, pretrained_base=None, **kwargs): 189 | model = DLASeg('dla169', classes, 190 | pretrained_base=pretrained_base, **kwargs) 191 | return model 192 | -------------------------------------------------------------------------------- /folder.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | 3 | import torch 4 | import torch.utils.data as data 5 | 6 | from PIL import Image 7 | from PIL import ImageFilter 8 | import os 9 | import os.path 10 | 11 | IMG_EXTENSIONS = [ 12 | '.jpg', '.JPG', '.jpeg', '.JPEG', 13 | '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', 14 | ] 15 | 16 | 17 | def is_image_file(filename): 18 | return any(filename.endswith(extension) for extension in IMG_EXTENSIONS) 19 | 20 | 21 | def find_classes(dir): 22 | classes = [d for d in os.listdir(dir) 23 | if os.path.isdir(os.path.join(dir, d))] 24 | classes.sort() 25 | class_to_idx = {classes[i]: i for i in range(len(classes))} 26 | return classes, class_to_idx 27 | 28 | 29 | def make_dataset(dir, class_to_idx): 30 | images = [] 31 | for target in os.listdir(dir): 32 | d = os.path.join(dir, target) 33 | if not os.path.isdir(d): 34 | continue 35 | 36 | for root, _, fnames in sorted(os.walk(d)): 37 | for fname in fnames: 38 | if is_image_file(fname): 39 | path = os.path.join(root, fname) 40 | item = (path, class_to_idx[target]) 41 | images.append(item) 42 | images.sort(key=lambda t: os.path.split(t[0])[1]) 43 | return images 44 | 45 | 46 | def default_loader(path): 47 | return Image.open(path).convert('RGB') 48 | 49 | 50 | class ImageFolder(data.Dataset): 51 | 52 | def __init__(self, root, transform=None, target_transform=None, 53 | loader=default_loader, out_name=False, out_image_size=False): 54 | classes, class_to_idx = find_classes(root) 55 | imgs = make_dataset(root, class_to_idx) 56 | if len(imgs) == 0: 57 | raise (RuntimeError( 58 | "Found 0 images in subfolders 
of: " + root + "\n" 59 | "Supported image extensions are: " + ",".join(IMG_EXTENSIONS))) 60 | 61 | self.root = root 62 | self.imgs = imgs 63 | self.classes = classes 64 | self.class_to_idx = class_to_idx 65 | self.transform = transform 66 | self.target_transform = target_transform 67 | self.loader = loader 68 | self.out_name = out_name 69 | self.out_image_size = out_image_size 70 | 71 | def __getitem__(self, index): 72 | path, target = self.imgs[index] 73 | img = self.loader(path) 74 | img_size = torch.LongTensor(img.size) 75 | # img = img.filter(ImageFilter.GaussianBlur(7)) 76 | if self.transform is not None: 77 | img = self.transform(img) 78 | if self.target_transform is not None: 79 | target = self.target_transform(target) 80 | 81 | output = [img, target] 82 | if self.out_name: 83 | output.append(os.path.split(path)[1]) 84 | if self.out_image_size: 85 | output.append(img_size) 86 | 87 | return tuple(output) 88 | 89 | def __len__(self): 90 | return len(self.imgs) 91 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | PYTORCH_LIB_DIR := /home/fy/pytorch/torch/lib 2 | 3 | 4 | PYTHON := python3 5 | NVCC_COMPILE := nvcc -c -o 6 | RM_RF := rm -rf 7 | 8 | # Library compilation rules. 9 | NVCC_FLAGS := -x cu -Xcompiler -fPIC -shared 10 | 11 | # File structure. 12 | BUILD_DIR := dense 13 | INCLUDE_DIRS := TH THC THCUNN include include/TH 14 | TORCH_FFI_BUILD := build.py 15 | BN_KERNEL := $(BUILD_DIR)/batchnormp_kernel.so 16 | TORCH_FFI_TARGET := $(BUILD_DIR)/batch_norm/_batch_norm.so 17 | 18 | INCLUDE_FLAGS := $(foreach d, $(INCLUDE_DIRS), -I$(PYTORCH_LIB_DIR)/$d) 19 | 20 | all: $(TORCH_FFI_TARGET) 21 | 22 | $(TORCH_FFI_TARGET): $(BN_KERNEL) $(TORCH_FFI_BUILD) 23 | $(PYTHON) $(TORCH_FFI_BUILD) 24 | 25 | $(BUILD_DIR)/batchnormp_kernel.so: src/batchnormp_cuda_kernel.cu 26 | @mkdir -p $(BUILD_DIR) 27 | $(NVCC_COMPILE) $@ $? 
$(NVCC_FLAGS) $(INCLUDE_FLAGS) -Isrc 28 | 29 | clean: 30 | $(RM_RF) $(BUILD_DIR) -------------------------------------------------------------------------------- /lib/build.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import torch 4 | from torch.utils.ffi import create_extension 5 | 6 | this_file = os.path.dirname(__file__) 7 | 8 | sources = ['src/batchnormp.c'] 9 | headers = ['src/batchnormp.h'] 10 | defines = [] 11 | with_cuda = False 12 | 13 | abs_path = os.path.dirname(os.path.realpath(__file__)) 14 | extra_objects = [os.path.join(abs_path, 'dense/batchnormp_kernel.so')] 15 | extra_objects += glob.glob('/usr/local/cuda/lib64/*.a') 16 | 17 | if torch.cuda.is_available(): 18 | print('Including CUDA code.') 19 | sources += ['src/batchnormp_cuda.c'] 20 | headers += ['src/batchnormp_cuda.h'] 21 | defines += [('WITH_CUDA', None)] 22 | with_cuda = True 23 | 24 | ffi = create_extension( 25 | 'dense.batch_norm', 26 | headers=headers, 27 | sources=sources, 28 | define_macros=defines, 29 | relative_to=__file__, 30 | with_cuda=with_cuda, 31 | extra_objects=extra_objects) 32 | 33 | if __name__ == '__main__': 34 | ffi.build() -------------------------------------------------------------------------------- /lib/dense/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbdrive/dla/d477ceb8036ae402f3c7b59ead54963cf864903b/lib/dense/__init__.py -------------------------------------------------------------------------------- /lib/dense/batch_norm/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._batch_norm import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | locals[symbol] = _wrap_function(fn, _ffi) 10 | __all__.append(symbol) 11 | 12 | _import_symbols(locals()) 13 | -------------------------------------------------------------------------------- /lib/dense/batch_norm/_batch_norm.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbdrive/dla/d477ceb8036ae402f3c7b59ead54963cf864903b/lib/dense/batch_norm/_batch_norm.so -------------------------------------------------------------------------------- /lib/dense/batchnormp_kernel.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbdrive/dla/d477ceb8036ae402f3c7b59ead54963cf864903b/lib/dense/batchnormp_kernel.so -------------------------------------------------------------------------------- /lib/eaconv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbdrive/dla/d477ceb8036ae402f3c7b59ead54963cf864903b/lib/eaconv/__init__.py -------------------------------------------------------------------------------- /lib/eaconv/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.utils.ffi 4 | 5 | this_folder = os.path.dirname(os.path.abspath(__file__)) + '/' 6 | 7 | Headers = [] 8 | Sources = [] 9 | Defines = [] 10 | Objects = [] 11 | 12 | if torch.cuda.is_available() is True: 13 | Headers += ['src/EAConv2d_cuda.h'] 14 | Sources += ['src/EAConv2d_cuda.c'] 15 | Defines += [('WITH_CUDA', None)] 16 | Objects += 
['src/EAConv2d_kernel.o', 17 | 'src/handle.o', 18 | 'src/conv_params.o'] 19 | 20 | ffi = torch.utils.ffi.create_extension( 21 | name='_ext.eaconv2d', 22 | headers=Headers, 23 | sources=Sources, 24 | verbose=False, 25 | with_cuda=True, 26 | package=False, 27 | relative_to=this_folder, 28 | define_macros=Defines, 29 | extra_objects=[os.path.join(this_folder, Object) for Object in Objects] 30 | ) 31 | 32 | if __name__ == '__main__': 33 | ffi.build() 34 | -------------------------------------------------------------------------------- /lib/eaconv/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .eaconv import * 2 | -------------------------------------------------------------------------------- /lib/eaconv/functions/eaconv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from eaconv._ext import eaconv2d 4 | 5 | 6 | def _conv_output_dim(inputDim, pad, filterDim, dilation, stride): 7 | tmp = 1 + (inputDim + 2 * pad - (((filterDim - 1) * dilation) + 1)) 8 | if tmp % stride == 0\ 9 | and pad >= 0\ 10 | and stride >= 1\ 11 | and dilation >= 1\ 12 | and tmp > 0: 13 | return tmp // stride 14 | else: 15 | raise ValueError('Parameters of the kernel must be compatible ' 16 | 'with the dimensions of the input') 17 | 18 | 19 | class EAConv2dFunction(Function): 20 | 21 | def __init__(self, stride, padding, dilation, groups): 22 | super(EAConv2dFunction, self).__init__() 23 | self.stride = stride 24 | self.padding = padding 25 | self.dilation = dilation 26 | self.groups = groups 27 | 28 | def forward(self, *bias_inp_and_weight): 29 | num_inp = len(bias_inp_and_weight) // 2 30 | inp = bias_inp_and_weight[:num_inp] 31 | if len(bias_inp_and_weight) % 2 == 1: 32 | weight = bias_inp_and_weight[num_inp:-1] 33 | bias = bias_inp_and_weight[-1] 34 | else: 35 | weight = bias_inp_and_weight[num_inp:] 36 | bias = None 37 | 38 | self.saved_for_later = inp, weight, bias 39 | stride = self.stride 40 | padding = self.padding 41 | dilation = self.dilation 42 | groups = self.groups 43 | 44 | h = _conv_output_dim(inp[0].size(2), padding[0], 45 | weight[0].size(2), dilation[0], stride[0]) 46 | w = _conv_output_dim(inp[0].size(3), padding[1], 47 | weight[0].size(3), dilation[1], stride[1]) 48 | output = inp[0].new(inp[0].size(0), weight[0].size(0), h, w).zero_() 49 | if bias is not None: 50 | eaconv2d.EAConv2d_cuda_forward_bias(bias, output) 51 | for _inp, _weight in zip(inp, weight): 52 | if not isinstance(_inp, torch.cuda.FloatTensor): 53 | raise NotImplementedError 54 | eaconv2d.EAConv2d_cuda_forward(_inp, _weight, output, 55 | stride[0], stride[1], 56 | padding[0], padding[1], 57 | dilation[0], dilation[1], 58 | groups) 59 | return output 60 | 61 | def backward(self, gradOutput): 62 | gradOutput = gradOutput.contiguous() 63 | 64 | inp, weight, bias = self.saved_for_later 65 | 66 | stride = self.stride 67 | padding = self.padding 68 | dilation = self.dilation 69 | groups = self.groups 70 | 71 | if bias is not None: 72 | grad_bias = bias.new(*bias.size()).zero_() 73 | else: 74 | grad_bias = None 75 | 76 | grad_inp = [] 77 | grad_weight = [] 78 | for _inp in inp: 79 | grad_inp.append(_inp.new(*_inp.size()).zero_()) 80 | for _weight in weight: 81 | grad_weight.append(_weight.new(*_weight.size()).zero_()) 82 | 83 | if bias is not None: 84 | eaconv2d.EAConv2d_cuda_backward_bias(grad_bias, gradOutput) 85 | for _inp, _weight, _grad_inp, _grad_weight in zip(inp, weight, 86 | 
grad_inp, 87 | grad_weight): 88 | eaconv2d.EAConv2d_cuda_backward(_grad_inp, _grad_weight, 89 | gradOutput, 90 | _inp, _weight, 91 | stride[0], stride[1], 92 | padding[0], padding[1], 93 | dilation[0], dilation[1], 94 | groups) 95 | return_val = tuple(grad_inp) + tuple(grad_weight) 96 | if bias is not None: 97 | return_val = return_val + (grad_bias,) 98 | return return_val 99 | -------------------------------------------------------------------------------- /lib/eaconv/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .eaconv import * 2 | -------------------------------------------------------------------------------- /lib/eaconv/modules/eaconv.py: -------------------------------------------------------------------------------- 1 | import math 2 | import collections 3 | 4 | import torch 5 | from torch.nn.parameter import Parameter 6 | from torch.nn.modules.module import Module 7 | 8 | from .util import _pair 9 | from eaconv.functions import EAConv2dFunction 10 | 11 | 12 | def _check_input_dimensions(input_list): 13 | for i in range(1, len(input_list)): 14 | if input_list[i - 1].size(0) != input_list[i].size(0)\ 15 | or input_list[i - 1].size(2) != input_list[i].size(2)\ 16 | or input_list[i - 1].size(3) != input_list[i].size(3): 17 | return False 18 | return True 19 | 20 | 21 | class _EAConvNd(Module): 22 | def __init__(self, in_channels_list, out_channels, kernel_size, stride, 23 | padding, dilation, transposed, output_padding, groups, bias): 24 | super(_EAConvNd, self).__init__() 25 | in_channels = sum(in_channels_list) 26 | if in_channels % groups != 0: 27 | raise ValueError('in_channels must be divisible by groups') 28 | if out_channels % groups != 0: 29 | raise ValueError('out_channels must be divisible by groups') 30 | self.in_channels = in_channels 31 | self.in_channels_list = in_channels_list 32 | self.out_channels = out_channels 33 | self.kernel_size = kernel_size 34 | self.stride = stride 35 | self.padding = padding 36 | self.dilation = dilation 37 | self.transposed = transposed 38 | self.output_padding = output_padding 39 | self.groups = groups 40 | if transposed: 41 | raise NotImplementedError 42 | else: 43 | self.weight = torch.nn.ParameterList( 44 | [Parameter(torch.cuda.FloatTensor( 45 | out_channels, c // groups, *kernel_size)) 46 | for c in in_channels_list]) 47 | # for i, param in enumerate(self.weight): 48 | # setattr(self, 'weight' + str(i), param) 49 | if bias: 50 | self.bias = Parameter(torch.cuda.FloatTensor(out_channels)) 51 | else: 52 | self.register_parameter('bias', None) 53 | self.reset_parameters() 54 | 55 | def reset_parameters(self): 56 | n = self.in_channels 57 | for k in self.kernel_size: 58 | n *= k 59 | stdv = 1. / math.sqrt(n) 60 | for i in range(len(self.weight)): 61 | self.weight[i].data.uniform_(-stdv, stdv) 62 | if self.bias is not None: 63 | self.bias.data.uniform_(-stdv, stdv) 64 | 65 | 66 | class EAConv2d(_EAConvNd): 67 | '''A 2-dimensional convolution layer with efficient aggregation 68 | 69 | Overall, the APIs are the same as torch.nn.Conv2d with a few exceptions: 70 | 1. The in_channels argument is replaced with in_channels_list, 71 | which accepts either a single int or a list of ints. 72 | 2. This module accepts a variable number of inputs. The number of inputs 73 | must match the length of in_channels_list. 
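
    A minimal usage sketch (illustrative only: the shapes, and the CUDA
    placement that the underlying kernels require, are assumptions):

        x1 = torch.cuda.FloatTensor(2, 16, 56, 56).normal_()
        x2 = torch.cuda.FloatTensor(2, 32, 56, 56).normal_()
        conv = EAConv2d([16, 32], 64, kernel_size=3, padding=1)
        out = conv(x1, x2)  # matches Conv2d applied to torch.cat([x1, x2], 1)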
74 | '''
75 |
76 | def __init__(self, in_channels_list, out_channels, kernel_size, stride=1,
77 | padding=0, dilation=1, groups=1, bias=True):
78 | if groups != 1:
79 | raise NotImplementedError
80 | if not isinstance(in_channels_list, collections.Iterable):
81 | in_channels_list = (in_channels_list,)
82 | kernel_size = _pair(kernel_size)
83 | stride = _pair(stride)
84 | padding = _pair(padding)
85 | dilation = _pair(dilation)
86 | super(EAConv2d, self).__init__(
87 | in_channels_list, out_channels, kernel_size,
88 | stride, padding, dilation,
89 | False, _pair(0), groups, bias)
90 |
91 | def forward(self, *inp):
92 | if not _check_input_dimensions(inp):
93 | raise ValueError('all dimensions except the channel dimension '
94 | 'of the input tensors must coincide')
95 | eaconv2dfunc = EAConv2dFunction(self.stride, self.padding,
96 | self.dilation, self.groups)
97 | if self.bias is None:
98 | return eaconv2dfunc(*inp, *self.weight)
99 | else:
100 | return eaconv2dfunc(*inp, *self.weight, self.bias)
101 |
-------------------------------------------------------------------------------- /lib/eaconv/modules/util.py: --------------------------------------------------------------------------------
1 | import collections
2 | from itertools import repeat
3 |
4 |
5 | def _ntuple(n):
6 | def parse(x):
7 | if isinstance(x, collections.Iterable):
8 | return x
9 | return tuple(repeat(x, n))
10 | return parse
11 |
12 |
13 | _single = _ntuple(1)
14 | _pair = _ntuple(2)
15 | _triple = _ntuple(3)
16 | _quadruple = _ntuple(4)
17 |
-------------------------------------------------------------------------------- /lib/eaconv/src/EAConv2d_cuda.c: --------------------------------------------------------------------------------
1 | #include "EAConv2d_kernel.h"
2 | #include "handle.h"
3 |
4 | #include <cudnn.h>
5 |
6 | #include <THC/THC.h>
7 |
8 | extern THCState* state;
9 |
10 | void EAConv2d_cuda_forward_bias(THCudaTensor* bias,
11 | THCudaTensor* output) {
12 | cudnnHandle_t cudnn = getCudnnHandle();
13 | cudnnSetStream(cudnn, THCState_getCurrentStream(state));
14 | EAConv2d_cudnn_forward_bias(cudnn,
15 | THCudaTensor_data(state, bias),
16 | THCudaTensor_data(state, output),
17 | output->size[0],
18 | output->size[1],
19 | output->size[2],
20 | output->size[3]);
21 | }
22 |
23 | void EAConv2d_cuda_backward_bias(THCudaTensor* grad_bias,
24 | THCudaTensor* gradOutput) {
25 | cudnnHandle_t cudnn = getCudnnHandle();
26 | cudnnSetStream(cudnn, THCState_getCurrentStream(state));
27 | EAConv2d_cudnn_backward_bias(cudnn,
28 | THCudaTensor_data(state, grad_bias),
29 | THCudaTensor_data(state, gradOutput),
30 | gradOutput->size[0],
31 | gradOutput->size[1],
32 | gradOutput->size[2],
33 | gradOutput->size[3]);
34 | }
35 |
36 | int EAConv2d_cuda_forward(THCudaTensor* input,
37 | THCudaTensor* weight,
38 | THCudaTensor* output,
39 | int stride_x,
40 | int stride_y,
41 | int padding_x,
42 | int padding_y,
43 | int dilation_x,
44 | int dilation_y,
45 | int groups
46 | ) {
47 | cudnnHandle_t cudnn = getCudnnHandle();
48 | cudnnSetStream(cudnn, THCState_getCurrentStream(state));
49 | EAConv2d_cudnn_forward(state,
50 | cudnn,
51 | THCudaTensor_data(state, input),
52 | THCudaTensor_data(state, weight),
53 | THCudaTensor_data(state, output),
54 | stride_x,
55 | stride_y,
56 | padding_x,
57 | padding_y,
58 | dilation_x,
59 | dilation_y,
60 | groups,
61 | input->size[0],
62 | input->size[1],
63 | input->size[2],
64 | input->size[3],
65 | weight->size[0],
66 | weight->size[1],
67 | weight->size[2],
68 | weight->size[3],
69 | output->size[0],
70 | output->size[1],
71 |
output->size[2], 72 | output->size[3]); 73 | return 1; 74 | } 75 | 76 | int EAConv2d_cuda_backward(THCudaTensor* grad_input, 77 | THCudaTensor* grad_weight, 78 | THCudaTensor* gradOutput, 79 | THCudaTensor* input, 80 | THCudaTensor* weight, 81 | int stride_x, 82 | int stride_y, 83 | int padding_x, 84 | int padding_y, 85 | int dilation_x, 86 | int dilation_y, 87 | int groups 88 | ) { 89 | cudnnHandle_t cudnn = getCudnnHandle(); 90 | cudnnSetStream(cudnn, THCState_getCurrentStream(state)); 91 | EAConv2d_cudnn_backward(state, 92 | cudnn, 93 | THCudaTensor_data(state, grad_input), 94 | THCudaTensor_data(state, grad_weight), 95 | THCudaTensor_data(state, gradOutput), 96 | THCudaTensor_data(state, input), 97 | THCudaTensor_data(state, weight), 98 | stride_x, 99 | stride_y, 100 | padding_x, 101 | padding_y, 102 | dilation_x, 103 | dilation_y, 104 | groups, 105 | grad_input->size[0], 106 | grad_input->size[1], 107 | grad_input->size[2], 108 | grad_input->size[3], 109 | grad_weight->size[0], 110 | grad_weight->size[1], 111 | grad_weight->size[2], 112 | grad_weight->size[3], 113 | gradOutput->size[0], 114 | gradOutput->size[1], 115 | gradOutput->size[2], 116 | gradOutput->size[3]); 117 | return 1; 118 | } 119 | -------------------------------------------------------------------------------- /lib/eaconv/src/EAConv2d_cuda.h: -------------------------------------------------------------------------------- 1 | void EAConv2d_cuda_forward_bias(THCudaTensor* bias, 2 | THCudaTensor* output); 3 | 4 | void EAConv2d_cuda_backward_bias(THCudaTensor* grad_bias, 5 | THCudaTensor* gradOutput); 6 | 7 | int EAConv2d_cuda_forward(THCudaTensor* input, 8 | THCudaTensor* weight, 9 | THCudaTensor* output, 10 | int stride_x, 11 | int stride_y, 12 | int padding_x, 13 | int padding_y, 14 | int dilation_x, 15 | int dilation_y, 16 | int groups); 17 | 18 | int EAConv2d_cuda_backward(THCudaTensor* grad_input, 19 | THCudaTensor* grad_weight, 20 | THCudaTensor* gradOutput, 21 | THCudaTensor* input, 22 | THCudaTensor* weight, 23 | int stride_x, 24 | int stride_y, 25 | int padding_x, 26 | int padding_y, 27 | int dilation_x, 28 | int dilation_y, 29 | int groups); 30 | -------------------------------------------------------------------------------- /lib/eaconv/src/EAConv2d_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "eaconv/src/EAConv2d_kernel.h" 4 | #include "eaconv/src/cuda_check.h" 5 | #include "eaconv/src/handle.h" 6 | #include "eaconv/src/conv_params.h" 7 | 8 | struct Workspace { 9 | Workspace(THCState* state, size_t size) : 10 | state(state), size(size), data(NULL) { 11 | checkCUDA(THCudaMalloc(state, &data, size)); 12 | } 13 | Workspace(const Workspace&) = delete; 14 | Workspace(Workspace&&) = default; 15 | ~Workspace() { 16 | if (data) { 17 | THCudaFree(state, data); 18 | } 19 | } 20 | THCState* state; 21 | size_t size; 22 | void* data; 23 | }; 24 | 25 | #ifdef __cplusplus 26 | extern "C" { 27 | #endif 28 | 29 | void EAConv2d_cudnn_forward_bias(cudnnHandle_t cudnn, 30 | float *bias, float* output, 31 | int output_batch_size, 32 | int output_channels, 33 | int output_h, 34 | int output_w) { 35 | Convolution_Params params(1, 1, 36 | 0, 0, 37 | 1, 1, 38 | 3, 3, 3, 3, 39 | 3, 3, 3, 3, 40 | output_batch_size, 41 | output_channels, 42 | output_h, 43 | output_w); 44 | const float alpha = 1; 45 | checkCUDNN(cudnnAddTensor(cudnn, &alpha, 46 | params.bias_desc, bias, 47 | &alpha, 48 | params.output_desc, output)); 49 | } 50 | 51 | void 
EAConv2d_cudnn_backward_bias(cudnnHandle_t cudnn, 52 | float *grad_bias, float* gradOutput, 53 | int output_batch_size, 54 | int output_channels, 55 | int output_h, 56 | int output_w) { 57 | Convolution_Params params(1, 1, 58 | 0, 0, 59 | 1, 1, 60 | 3, 3, 3, 3, 61 | 3, 3, 3, 3, 62 | output_batch_size, 63 | output_channels, 64 | output_h, 65 | output_w); 66 | const float alpha = 1; 67 | checkCUDNN(cudnnConvolutionBackwardBias(cudnn, &alpha, 68 | params.output_desc, gradOutput, 69 | &alpha, 70 | params.bias_desc, grad_bias)); 71 | } 72 | 73 | void EAConv2d_cudnn_forward(THCState* state, 74 | cudnnHandle_t cudnn, 75 | float* input, 76 | float* weight, 77 | float* output, 78 | int stride_x, 79 | int stride_y, 80 | int padding_x, 81 | int padding_y, 82 | int dilation_x, 83 | int dilation_y, 84 | int groups, 85 | int input_batch_size, 86 | int input_channels, 87 | int input_h, 88 | int input_w, 89 | int kernel_out, 90 | int kernel_in, 91 | int kernel_h, 92 | int kernel_w, 93 | int output_batch_size, 94 | int output_channels, 95 | int output_h, 96 | int output_w) { 97 | Convolution_Params params(stride_x, 98 | stride_y, 99 | padding_x, 100 | padding_y, 101 | dilation_x, 102 | dilation_y, 103 | input_batch_size, 104 | input_channels, 105 | input_h, 106 | input_w, 107 | kernel_out, 108 | kernel_in, 109 | kernel_h, 110 | kernel_w, 111 | output_batch_size, 112 | output_channels, 113 | output_h, 114 | output_w); 115 | cudnnConvolutionFwdAlgo_t convolution_algorithm; 116 | checkCUDNN( 117 | cudnnGetConvolutionForwardAlgorithm(cudnn, 118 | params.input_desc, 119 | params.kernel_desc, 120 | params.conv_desc, 121 | params.output_desc, 122 | // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 123 | CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 124 | /*memoryLimitInBytes=*/0, 125 | &convolution_algorithm)); 126 | size_t workspace_bytes = 0; 127 | checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn, 128 | params.input_desc, 129 | params.kernel_desc, 130 | params.conv_desc, 131 | params.output_desc, 132 | convolution_algorithm, 133 | &workspace_bytes)); 134 | Workspace cur_ws(state, workspace_bytes); 135 | 136 | const float alpha = 1; 137 | checkCUDNN(cudnnConvolutionForward(cudnn, 138 | &alpha, 139 | params.input_desc, 140 | input, 141 | params.kernel_desc, 142 | weight, 143 | params.conv_desc, 144 | convolution_algorithm, 145 | cur_ws.data, 146 | cur_ws.size, 147 | &alpha, 148 | params.output_desc, 149 | output)); 150 | } 151 | 152 | void EAConv2d_cudnn_backward(THCState* state, 153 | cudnnHandle_t cudnn, 154 | float* grad_input, 155 | float* grad_weight, 156 | float* gradOutput, 157 | float* input, 158 | float* weight, 159 | int stride_x, 160 | int stride_y, 161 | int padding_x, 162 | int padding_y, 163 | int dilation_x, 164 | int dilation_y, 165 | int groups, 166 | int input_batch_size, 167 | int input_channels, 168 | int input_h, 169 | int input_w, 170 | int kernel_out, 171 | int kernel_in, 172 | int kernel_h, 173 | int kernel_w, 174 | int output_batch_size, 175 | int output_channels, 176 | int output_h, 177 | int output_w) { 178 | Convolution_Params params(stride_x, 179 | stride_y, 180 | padding_x, 181 | padding_y, 182 | dilation_x, 183 | dilation_y, 184 | input_batch_size, 185 | input_channels, 186 | input_h, 187 | input_w, 188 | kernel_out, 189 | kernel_in, 190 | kernel_h, 191 | kernel_w, 192 | output_batch_size, 193 | output_channels, 194 | output_h, 195 | output_w); 196 | // backward filter 197 | cudnnConvolutionBwdFilterAlgo_t convolution_filter_algorithm; 198 | checkCUDNN( 199 | 
cudnnGetConvolutionBackwardFilterAlgorithm( 200 | cudnn, 201 | params.input_desc, 202 | params.output_desc, 203 | params.conv_desc, 204 | params.kernel_desc, 205 | // CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE, 206 | CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 207 | /*memoryLimitInBytes=*/0, 208 | &convolution_filter_algorithm)); 209 | size_t filter_workspace_bytes = 0; 210 | checkCUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize( 211 | cudnn, 212 | params.input_desc, 213 | params.output_desc, 214 | params.conv_desc, 215 | params.kernel_desc, 216 | convolution_filter_algorithm, 217 | &filter_workspace_bytes)); 218 | 219 | // backward data 220 | cudnnConvolutionBwdDataAlgo_t convolution_data_algorithm; 221 | checkCUDNN( 222 | cudnnGetConvolutionBackwardDataAlgorithm( 223 | cudnn, 224 | params.kernel_desc, 225 | params.output_desc, 226 | params.conv_desc, 227 | params.input_desc, 228 | // CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE, 229 | CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 230 | /*memoryLimitInBytes=*/0, 231 | &convolution_data_algorithm)); 232 | size_t data_workspace_bytes = 0; 233 | checkCUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize( 234 | cudnn, 235 | params.kernel_desc, 236 | params.output_desc, 237 | params.conv_desc, 238 | params.input_desc, 239 | convolution_data_algorithm, 240 | &data_workspace_bytes)); 241 | 242 | Workspace filter_ws(state, filter_workspace_bytes); 243 | Workspace data_ws(state, data_workspace_bytes); 244 | 245 | const float alpha = 1; 246 | checkCUDNN(cudnnConvolutionBackwardFilter(cudnn, 247 | &alpha, 248 | params.input_desc, 249 | input, 250 | params.output_desc, 251 | gradOutput, 252 | params.conv_desc, 253 | convolution_filter_algorithm, 254 | filter_ws.data, 255 | filter_ws.size, 256 | &alpha, 257 | params.kernel_desc, 258 | grad_weight)); 259 | checkCUDNN(cudnnConvolutionBackwardData(cudnn, 260 | &alpha, 261 | params.kernel_desc, 262 | weight, 263 | params.output_desc, 264 | gradOutput, 265 | params.conv_desc, 266 | convolution_data_algorithm, 267 | data_ws.data, 268 | data_ws.size, 269 | &alpha, 270 | params.input_desc, 271 | grad_input)); 272 | } 273 | 274 | #ifdef __cplusplus 275 | } 276 | #endif 277 | -------------------------------------------------------------------------------- /lib/eaconv/src/EAConv2d_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef EACONV_SRC_EACONV2D_KERNEL_H_ 2 | #define EACONV_SRC_EACONV2D_KERNEL_H_ 3 | 4 | #include 5 | 6 | #include "THC/THC.h" 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | void EAConv2d_cudnn_forward_bias(cudnnHandle_t cudnn, 13 | float *bias, float* output, 14 | int output_batch_size, 15 | int output_channels, 16 | int output_h, 17 | int output_w); 18 | 19 | void EAConv2d_cudnn_backward_bias(cudnnHandle_t cudnn, 20 | float *grad_bias, float* gradOutput, 21 | int output_batch_size, 22 | int output_channels, 23 | int output_h, 24 | int output_w); 25 | 26 | void EAConv2d_cudnn_forward(THCState* state, 27 | cudnnHandle_t cudnn, 28 | float* input, 29 | float* weight, 30 | float* output, 31 | int stride_x, 32 | int stride_y, 33 | int padding_x, 34 | int padding_y, 35 | int dilation_x, 36 | int dilation_y, 37 | int groups, 38 | int input_batch_size, 39 | int input_channels, 40 | int input_h, 41 | int input_w, 42 | int kernel_out, 43 | int kernel_in, 44 | int kernel_h, 45 | int kernel_w, 46 | int output_batch_size, 47 | int output_channels, 48 | int output_h, 49 | int output_w); 50 | 51 | void EAConv2d_cudnn_backward(THCState* state, 52 
| cudnnHandle_t cudnn, 53 | float* grad_input, 54 | float* grad_weight, 55 | float* gradOutput, 56 | float* input, 57 | float* weight, 58 | int stride_x, 59 | int stride_y, 60 | int padding_x, 61 | int padding_y, 62 | int dilation_x, 63 | int dilation_y, 64 | int groups, 65 | int input_batch_size, 66 | int input_channels, 67 | int input_h, 68 | int input_w, 69 | int kernel_out, 70 | int kernel_in, 71 | int kernel_h, 72 | int kernel_w, 73 | int output_batch_size, 74 | int output_channels, 75 | int output_h, 76 | int output_w); 77 | 78 | #ifdef __cplusplus 79 | } 80 | #endif 81 | 82 | #endif // EACONV_SRC_EACONV2D_KERNEL_H_ 83 | -------------------------------------------------------------------------------- /lib/eaconv/src/conv_params.cu: -------------------------------------------------------------------------------- 1 | #include "eaconv/src/cuda_check.h" 2 | #include "eaconv/src/conv_params.h" 3 | 4 | Convolution_Params::Convolution_Params(int stride_x, 5 | int stride_y, 6 | int padding_x, 7 | int padding_y, 8 | int dilation_x, 9 | int dilation_y, 10 | int input_batch_size, 11 | int input_channels, 12 | int input_h, 13 | int input_w, 14 | int kernel_out, 15 | int kernel_in, 16 | int kernel_h, 17 | int kernel_w, 18 | int output_batch_size, 19 | int output_channels, 20 | int output_h, 21 | int output_w) { 22 | checkCUDNN(cudnnCreateTensorDescriptor(&input_desc)); 23 | checkCUDNN(cudnnSetTensor4dDescriptor(input_desc, 24 | /*format=*/CUDNN_TENSOR_NCHW, 25 | /*dataType=*/CUDNN_DATA_FLOAT, 26 | /*batch_size=*/input_batch_size, 27 | /*channels=*/input_channels, 28 | /*image_height=*/input_h, 29 | /*image_width=*/input_w)); 30 | checkCUDNN(cudnnCreateTensorDescriptor(&output_desc)); 31 | checkCUDNN(cudnnSetTensor4dDescriptor(output_desc, 32 | /*format=*/CUDNN_TENSOR_NCHW, 33 | /*dataType=*/CUDNN_DATA_FLOAT, 34 | /*batch_size=*/output_batch_size, 35 | /*channels=*/output_channels, 36 | /*image_height=*/output_h, 37 | /*image_width=*/output_w)); 38 | checkCUDNN(cudnnCreateFilterDescriptor(&kernel_desc)); 39 | checkCUDNN(cudnnSetFilter4dDescriptor(kernel_desc, 40 | /*dataType=*/CUDNN_DATA_FLOAT, 41 | /*format=*/CUDNN_TENSOR_NCHW, 42 | /*out_channels=*/kernel_out, 43 | /*in_channels=*/kernel_in, 44 | /*kernel_height=*/kernel_h, 45 | /*kernel_width=*/kernel_w)); 46 | checkCUDNN(cudnnCreateTensorDescriptor(&bias_desc)); 47 | checkCUDNN(cudnnSetTensor4dDescriptor(bias_desc, 48 | /*format=*/CUDNN_TENSOR_NCHW, 49 | /*dataType=*/CUDNN_DATA_FLOAT, 50 | 1, 51 | /*channels=*/output_channels, 52 | 1, 53 | 1)); 54 | checkCUDNN(cudnnCreateConvolutionDescriptor(&conv_desc)); 55 | checkCUDNN(cudnnSetConvolution2dDescriptor( 56 | conv_desc, 57 | /*pad_height=*/padding_x, 58 | /*pad_width=*/padding_y, 59 | /*vertical_stride=*/stride_x, 60 | /*horizontal_stride=*/stride_y, 61 | /*dilation_height=*/dilation_x, 62 | /*dilation_width=*/dilation_y, 63 | /*mode=*/CUDNN_CROSS_CORRELATION, 64 | /*computeType=*/CUDNN_DATA_FLOAT)); 65 | checkCUDNN(cudnnSetConvolutionMathType(conv_desc, 66 | CUDNN_TENSOR_OP_MATH)); 67 | } 68 | 69 | Convolution_Params::~Convolution_Params() { 70 | cudnnDestroyTensorDescriptor(input_desc); 71 | cudnnDestroyTensorDescriptor(output_desc); 72 | cudnnDestroyFilterDescriptor(kernel_desc); 73 | cudnnDestroyTensorDescriptor(bias_desc); 74 | cudnnDestroyConvolutionDescriptor(conv_desc); 75 | } 76 | -------------------------------------------------------------------------------- /lib/eaconv/src/conv_params.h: -------------------------------------------------------------------------------- 1 | #ifndef 
EACONV_SRC_CONV_PARAMS_H_
2 | #define EACONV_SRC_CONV_PARAMS_H_
3 |
4 | #include <cudnn.h>
5 |
6 | class Convolution_Params {
7 | public:
8 | cudnnTensorDescriptor_t input_desc;
9 | cudnnTensorDescriptor_t output_desc;
10 | cudnnTensorDescriptor_t bias_desc;
11 | cudnnFilterDescriptor_t kernel_desc;
12 | cudnnConvolutionDescriptor_t conv_desc;
13 | Convolution_Params(int stride_x,
14 | int stride_y,
15 | int padding_x,
16 | int padding_y,
17 | int dilation_x,
18 | int dilation_y,
19 | int input_batch_size,
20 | int input_channels,
21 | int input_h,
22 | int input_w,
23 | int kernel_out,
24 | int kernel_in,
25 | int kernel_h,
26 | int kernel_w,
27 | int output_batch_size,
28 | int output_channels,
29 | int output_h,
30 | int output_w);
31 | ~Convolution_Params();
32 | };
33 |
34 | #endif  // EACONV_SRC_CONV_PARAMS_H_
35 |
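Convolution_Params is an RAII bundle: the constructor creates and configures all five cuDNN descriptors, and the destructor releases them. A hedged sketch of how the kernels size it (the concrete shapes here are invented for illustration):

```
// Illustrative only: descriptors for a 3x3, stride-1, pad-1 convolution
// mapping a 2x16x56x56 input to a 2x64x56x56 output.
Convolution_Params params(/*stride=*/1, 1,
                          /*padding=*/1, 1,
                          /*dilation=*/1, 1,
                          /*input NCHW:*/ 2, 16, 56, 56,
                          /*kernel out/in/h/w:*/ 64, 16, 3, 3,
                          /*output NCHW:*/ 2, 64, 56, 56);
// params.input_desc etc. are now ready to pass to cudnnConvolutionForward;
// all descriptors are destroyed when params leaves scope.
```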
-------------------------------------------------------------------------------- /lib/eaconv/src/cuda_check.h: --------------------------------------------------------------------------------
1 | // Copyright (c) 2016- Facebook, Inc (Adam Paszke)
2 | // Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
3 | // Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
4 | // Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
5 | // Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
6 | // Copyright (c) 2011-2013 NYU (Clement Farabet)
7 | // Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) // NOLINT
8 | // Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
9 | // Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) // NOLINT
10 | //
11 | // All rights reserved.
12 | //
13 | // Redistribution and use in source and binary forms, with or without
14 | // modification, are permitted provided that the following conditions are met:
15 | //
16 | // 1. Redistributions of source code must retain the above copyright
17 | //    notice, this list of conditions and the following disclaimer.
18 | //
19 | // 2. Redistributions in binary form must reproduce the above copyright
20 | //    notice, this list of conditions and the following disclaimer in the
21 | //    documentation and/or other materials provided with the distribution.
22 | //
23 | // 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America // NOLINT
24 | //    and IDIAP Research Institute nor the names of its contributors may be
25 | //    used to endorse or promote products derived from this software without
26 | //    specific prior written permission.
27 | //
28 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 | // POSSIBILITY OF SUCH DAMAGE.
39 |
40 | #ifndef EACONV_SRC_CUDA_CHECK_H_
41 | #define EACONV_SRC_CUDA_CHECK_H_
42 |
43 | #include <cudnn.h>
44 |
45 | #include <cuda_runtime.h>
46 | #include <stdexcept>
47 | #include <string>
48 |
49 | class cudnn_exception : public std::runtime_error {
50 | public:
51 | cudnnStatus_t status;
52 | cudnn_exception(cudnnStatus_t status, const char* msg)
53 | : std::runtime_error(msg)
54 | , status(status) {}
55 | cudnn_exception(cudnnStatus_t status, const std::string& msg)
56 | : std::runtime_error(msg)
57 | , status(status) {}
58 | };
59 |
60 | inline void checkCUDNN(cudnnStatus_t status) {
61 | if (status != CUDNN_STATUS_SUCCESS) {
62 | if (status == CUDNN_STATUS_NOT_SUPPORTED) {
63 | throw cudnn_exception(status, std::string(cudnnGetErrorString(status)) +
64 | ". This error may appear if you passed in a non-contiguous input.");
65 | }
66 | throw cudnn_exception(status, cudnnGetErrorString(status));
67 | }
68 | }
69 |
70 | inline void checkCUDA(cudaError_t error) {
71 | if (error != cudaSuccess) {
72 | std::string msg("CUDA error: ");
73 | msg += cudaGetErrorString(error);
74 | throw std::runtime_error(msg);
75 | }
76 | }
77 |
78 | #endif  // EACONV_SRC_CUDA_CHECK_H_
79 |
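checkCUDNN and checkCUDA turn status codes into exceptions, so every raw cuDNN/CUDA call in this library is wrapped at the call site. A short sketch of the pattern (the buffer size is arbitrary):

```
// Illustrative only: wrap each raw API call with the helpers above.
cudnnTensorDescriptor_t desc;
checkCUDNN(cudnnCreateTensorDescriptor(&desc));  // throws cudnn_exception on failure
void* buf = NULL;
checkCUDA(cudaMalloc(&buf, 1 << 20));            // throws std::runtime_error on failure
checkCUDA(cudaFree(buf));
checkCUDNN(cudnnDestroyTensorDescriptor(desc));
```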
-------------------------------------------------------------------------------- /lib/eaconv/src/handle.cu: --------------------------------------------------------------------------------
1 | // Copyright (c) 2016- Facebook, Inc (Adam Paszke)
2 | // Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
3 | // Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
4 | // Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
5 | // Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
6 | // Copyright (c) 2011-2013 NYU (Clement Farabet)
7 | // Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) // NOLINT
8 | // Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
9 | // Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) // NOLINT
10 | //
11 | // All rights reserved.
12 | //
13 | // Redistribution and use in source and binary forms, with or without
14 | // modification, are permitted provided that the following conditions are met:
15 | //
16 | // 1. Redistributions of source code must retain the above copyright
17 | //    notice, this list of conditions and the following disclaimer.
18 | //
19 | // 2. Redistributions in binary form must reproduce the above copyright
20 | //    notice, this list of conditions and the following disclaimer in the
21 | //    documentation and/or other materials provided with the distribution.
22 | //
23 | // 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America // NOLINT
24 | //    and IDIAP Research Institute nor the names of its contributors may be
25 | //    used to endorse or promote products derived from this software without
26 | //    specific prior written permission.
27 | //
28 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 | // POSSIBILITY OF SUCH DAMAGE.
39 |
40 | #include <unordered_map>
41 | #include <mutex>  // NOLINT
42 |
43 | #include "eaconv/src/handle.h"
44 | #include "eaconv/src/cuda_check.h"
45 |
46 | #ifdef __cplusplus
47 | extern "C" {
48 | #endif
49 |
50 | struct Handle {
51 | cudnnHandle_t handle;
52 | Handle() : handle(NULL) {
53 | checkCUDNN(cudnnCreate(&handle));
54 | }
55 | ~Handle() {
56 | if (handle) {
57 | cudnnDestroy(handle);
58 | }
59 | }
60 | };
61 |
62 | std::mutex mutex;
63 | std::unordered_map<int, Handle> handles;
64 |
65 | cudnnHandle_t getCudnnHandle(void) {
66 | int device;
67 | checkCUDA(cudaGetDevice(&device));
68 |
69 | std::lock_guard<std::mutex> guard(mutex);
70 | return handles[device].handle;
71 | }
72 |
73 | #ifdef __cplusplus
74 | }
75 | #endif
76 |
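handle.cu caches one lazily created cuDNN handle per GPU behind a mutex; getCudnnHandle is the only entry point. The call sites in EAConv2d_cuda.c all follow the same two-step pattern:

```
// Fetch the cached handle for the current device and bind it to the
// active THC stream before issuing cuDNN calls (as in EAConv2d_cuda.c).
cudnnHandle_t cudnn = getCudnnHandle();
cudnnSetStream(cudnn, THCState_getCurrentStream(state));
```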
-------------------------------------------------------------------------------- /lib/eaconv/src/handle.h: --------------------------------------------------------------------------------
1 | // Copyright (c) 2016- Facebook, Inc (Adam Paszke)
2 | // Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
3 | // Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
4 | // Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
5 | // Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
6 | // Copyright (c) 2011-2013 NYU (Clement Farabet)
7 | // Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) // NOLINT
8 | // Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
9 | // Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) // NOLINT
10 | //
11 | // All rights reserved.
12 | //
13 | // Redistribution and use in source and binary forms, with or without
14 | // modification, are permitted provided that the following conditions are met:
15 | //
16 | // 1. Redistributions of source code must retain the above copyright
17 | //    notice, this list of conditions and the following disclaimer.
18 | //
19 | // 2. Redistributions in binary form must reproduce the above copyright
20 | //    notice, this list of conditions and the following disclaimer in the
21 | //    documentation and/or other materials provided with the distribution.
22 | //
23 | // 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America // NOLINT
24 | //    and IDIAP Research Institute nor the names of its contributors may be
25 | //    used to endorse or promote products derived from this software without
26 | //    specific prior written permission.
27 | //
28 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 | // POSSIBILITY OF SUCH DAMAGE.
39 |
40 | #ifndef EACONV_SRC_HANDLE_H_
41 | #define EACONV_SRC_HANDLE_H_
42 |
43 | #include <cudnn.h>
44 |
45 | #ifdef __cplusplus
46 | extern "C" {
47 | #endif
48 |
49 | cudnnHandle_t getCudnnHandle(void);
50 |
51 | #ifdef __cplusplus
52 | }
53 | #endif
54 |
55 | #endif  // EACONV_SRC_HANDLE_H_
56 |
-------------------------------------------------------------------------------- /lib/functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbdrive/dla/d477ceb8036ae402f3c7b59ead54963cf864903b/lib/functions/__init__.py --------------------------------------------------------------------------------
/lib/functions/batchnormp.py: --------------------------------------------------------------------------------
1 | import pdb
2 |
3 | import numpy as np
4 |
5 | import torch
6 | from torch.autograd import Function
7 | from dense import batch_norm
8 |
9 | from queue import Queue
10 | from threading import Condition
11 |
12 | cum_queue = Queue()
13 | broadcast_queue = Queue()
14 | broadcast_cv = Condition()
15 |
16 |
17 | class BatchNormPFunction(Function):
18 | def __init__(self, running_mean, running_var, training,
19 | cum_queue, broadcast_queue, device_ids, sync,
20 | eps=1e-5, momentum=0.1, affine=True):
21 | self.affine = affine
22 | self.eps = eps
23 | self.momentum = momentum
24 | self.running_mean = running_mean
25 | self.running_var = running_var
26 | self.mean = None
27 | self.var = None
28 | self.training = training
29 | self.cum_queue = cum_queue
30 | self.broadcast_queue = broadcast_queue
31 | self.device_ids = device_ids
32 | self.sync = sync
33 |
34 | def forward(self, input, weight, bias):
35 | output = input.new()
36 | self.save_for_backward(input, weight, bias)
37 |
38 | # input_t = input.transpose(0, 1).double()
39 | # input_size = input_t.size()
40 | batch_size = int(input.size(0))
41 | # input_t.resize_(int(input_size[0]), int(np.prod(input_size[1:])))
42 | # self.mean = input_t.mean(dim=1)
43 |
44 | device_ids = self.device_ids
45 | # print('device', input.get_device(), flush=True)
46 | if input.is_cuda:
47 | # self.mean.copy_(torch.from_numpy(
48 | # self.cum_mean(input.get_device(),
49 | # self.mean.cpu().numpy(),
50 | # batch_size)))
51 | # var = input_t - torch.unsqueeze(self.mean, 1)
52 | # var *= var
53 | # var = var.mean(dim=1)
54 | # total_var = self.cum_mean(
55 | # input.get_device(), var.cpu().numpy(), batch_size)
56 | # self.std = input_t.new().resize_as_(self.mean).\
57 | # copy_(torch.from_numpy(total_var)).sqrt()
58 |
59 | mean_cuda = input.new().resize_(input.size(1))
60 | var_cuda = input.new().resize_(input.size(1))
61 | batch_norm.BatchNormalizationP_mean_cuda(input, mean_cuda)
62 |
63 | if len(device_ids) > 1 and self.sync and self.training:
64 | mean_cuda.copy_(torch.from_numpy(self.cum_mean(
65 | input.get_device(), mean_cuda.cpu().numpy(), batch_size)))
66 | batch_norm.BatchNormalizationP_var_cuda(input, mean_cuda, var_cuda)
67 | if len(device_ids) > 1 and self.sync and self.training:
68 | var_cuda.copy_(torch.from_numpy(self.cum_mean(
69 | input.get_device(), var_cuda.cpu().numpy(), batch_size)))
70 | else:
71 | # self.std = input_t.std(dim=1, unbiased=False)
72 | batch_norm.BatchNormalizationP_var_cuda(input, mean_cuda, var_cuda)
73 | self.mean = mean_cuda
74 | self.var = var_cuda
75 |
76 | if not input.is_cuda:
# CPU path: compute per-channel statistics over (N, H, W) here
# (input_t was previously referenced without being defined).
input_t = input.transpose(0, 1).contiguous().view(int(input.size(1)), -1)
self.mean = input_t.mean(dim=1)
77 | self.std = input_t.std(dim=1, unbiased=False)
78 | batch_norm.BatchNormalizationP_forward(
79 | input, output, weight, bias,
80 | self.running_mean, self.running_var, self.mean, self.std,
81 | self.training, self.momentum, self.eps)
82 | else:
83 | batch_norm.BatchNormalizationP_forward_cuda(
84 | input, output, weight, bias,
85 | self.running_mean, self.running_var, self.mean, self.var,
86 | self.training, self.momentum, self.eps)
87 | return output
88 |
89 | def cum_mean(self, this_device, this_mean, batch_size):
90 | cum_queue.put((batch_size, this_mean))
91 | total_mean = np.zeros(this_mean.shape, dtype=np.float64)
92 | total_batch_size = 0
93 | if this_device == self.device_ids[0]:
94 | for _ in self.device_ids:
95 | item = cum_queue.get()
96 | total_batch_size += item[0]
97 | total_mean += item[0] * item[1]
98 | cum_queue.task_done()
99 | total_mean /= total_batch_size
100 | broadcast_cv.acquire()
101 | for _ in range(len(self.device_ids) - 1):
102 | broadcast_queue.put(total_mean)
103 | broadcast_cv.notify_all()
104 | broadcast_cv.release()
105 | else:
106 | broadcast_cv.acquire()
107 | if broadcast_queue.qsize() == 0:
108 | broadcast_cv.wait()
109 | total_mean = broadcast_queue.get()
110 | broadcast_queue.task_done()
111 | broadcast_cv.release()
112 | # assert cum_queue.empty()
113 | broadcast_queue.join()
114 | return total_mean
115 |
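# For intuition, cum_mean above is a hand-rolled all-reduce: each device
# contributes a (batch_size, per-channel mean) pair; the first device
# averages them with batch-size weights and broadcasts the result.
# Standalone sketch of the same arithmetic (the pairs are invented):
#
#     stats = [(8, np.array([0.2, 0.4])), (4, np.array([0.5, 0.1]))]
#     total = sum(n for n, _ in stats)             # 12
#     mean = sum(n * m for n, m in stats) / total  # array([0.3, 0.3])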
116 | def backward(self, grad_output):
117 | input, weight, bias = self.saved_tensors
118 | grad_input = grad_output.new().resize_as_(input)
119 | grad_weight = grad_output.new().resize_as_(weight).zero_()
120 | grad_bias = grad_output.new().resize_as_(bias).zero_()
121 | if not grad_output.is_cuda:
122 | batch_norm.BatchNormalizationP_backward(
123 | input, grad_output, grad_input, grad_weight, grad_bias,
124 | weight, self.running_mean, self.running_var, self.mean,
125 | self.std, self.training, 1, self.eps)
126 | else:
127 | # grad_output_t = grad_output.transpose(0, 1).double()
128 | # batch_size = int(grad_output.size(0))
129 | # grad_output_t.resize_(int(grad_output_t.size(0)),
130 | # int(np.prod(grad_output_t.size()[1:])))
131 | # grad_output_mean = grad_output_t.mean(dim=1)
132 | # device_ids = self.device_ids
133 | # if len(device_ids) > 1 and self.sync:
134 | # grad_output_mean.copy_(torch.from_numpy(
135 | # self.cum_mean(grad_output.get_device(),
136 | # grad_output_mean.cpu().numpy(),
137 | # batch_size)))
138 | # grad_output_mean = grad_output_mean.float()
139 | #
140 | # input_t = input.transpose(0, 1).double()
141 | # input_size = input_t.size()
142 | # input_t.resize_(int(input_size[0]), int(np.prod(input_size[1:])))
143 | # dotP = (input_t - torch.unsqueeze(self.mean.double(), 1)) * \
144 | # grad_output_t
145 | # dotP = dotP.mean(dim=1)
146 | # if len(device_ids) > 1 and self.sync:
147 | # dotP.copy_(torch.from_numpy(
148 | # self.cum_mean(grad_output.get_device(),
149 | # dotP.cpu().numpy(),
150 | # batch_size)))
151 | # dotP = dotP.float()
152 |
153 | batch_size = int(grad_output.size(0))
154 | grad_output_mean_cuda = grad_output.new().resize_(grad_output.size(1))
155 | dotP_cuda = grad_output.new().resize_(
156 | grad_output.size(1))
157 | batch_norm.BatchNormalizationP_mean_grad_cuda(
158 | input, grad_output, self.running_mean,
159 | self.mean, grad_output_mean_cuda, dotP_cuda, self.training
160 | )
161 | if len(self.device_ids) > 1 and self.sync:
162 | grad_output_mean_cuda.copy_(torch.from_numpy(
163 | self.cum_mean(grad_output.get_device(),
164 | grad_output_mean_cuda.cpu().numpy(),
165 | batch_size)))
166 | dotP_cuda.copy_(torch.from_numpy(
167 | self.cum_mean(grad_output.get_device(),
168 | dotP_cuda.cpu().numpy(),
169 | batch_size)))
170 |
171 | # pdb.set_trace()
172 |
173 | batch_norm.BatchNormalizationP_backward_cuda(
174 | input, grad_output, grad_output_mean_cuda, dotP_cuda,
175 | grad_input, grad_weight, grad_bias,
176 | weight, self.running_mean, self.running_var,
177 | self.mean, self.var, self.training, 1, self.eps)
178 | return grad_input, grad_weight, grad_bias
179 |
-------------------------------------------------------------------------------- /lib/make_eaconv.sh: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | cd eaconv/src
4 | echo "Compiling Efficient Aggregation Convolution kernels by nvcc..."
5 | rm -f EAConv2d_kernel.o
6 | rm -rf ../_ext
7 |
8 | cd ../..
9 | nvcc -c -o eaconv/src/EAConv2d_kernel.o eaconv/src/EAConv2d_kernel.cu -x cu -Xcompiler -fPIC -std=c++11 \
10 | -I ~/anaconda3/lib/python3.6/site-packages/torch/lib/include/ \
11 | -I ~/anaconda3/lib/python3.6/site-packages/torch/lib/include/TH \
12 | -I ~/anaconda3/lib/python3.6/site-packages/torch/lib/include/THC
13 | nvcc -c -o eaconv/src/handle.o eaconv/src/handle.cu -x cu -Xcompiler -fPIC -std=c++11
14 | nvcc -c -o eaconv/src/conv_params.o eaconv/src/conv_params.cu -x cu -Xcompiler -fPIC -std=c++11
15 | cd eaconv/
16 | python3 build.py
17 |
-------------------------------------------------------------------------------- /lib/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbdrive/dla/d477ceb8036ae402f3c7b59ead54963cf864903b/lib/modules/__init__.py --------------------------------------------------------------------------------
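batchnormsync.py (next) wraps BatchNormPFunction in an nn.Module so it can stand in for nn.BatchNorm2d. A hedged usage sketch composing it with set_bn from dla_up.py (the class count and the choice to enable sync are assumptions):

```
# Illustrative only: swap synchronized BN into a DLA-up model.
import dla_up
from modules import batchnormsync

batchnormsync.BatchNormSync.sync = True
dla_up.set_bn(batchnormsync.BatchNormSync)  # replaces BatchNorm in dla and dla_up
model = dla_up.dla34up(classes=19)
```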
/lib/modules/batchnormsync.py: --------------------------------------------------------------------------------
1 | from queue import Queue
2 |
3 | import torch
4 | from torch.nn import Module
5 | from torch.nn.parameter import Parameter
6 | from functions.batchnormp import BatchNormPFunction
7 |
8 |
9 | class BatchNormSync(Module):
10 |
11 | sync = True
12 | checking_mode = False
13 |
14 | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True,
15 | device_ids=None):
16 | super(BatchNormSync, self).__init__()
17 | self.num_features = num_features
18 | self.affine = affine
19 | self.eps = eps
20 | self.momentum = momentum
21 | if self.affine:
22 | self.weight = Parameter(torch.Tensor(num_features))
23 | self.bias = Parameter(torch.Tensor(num_features))
24 | else:
25 | self.register_parameter('weight', None)
26 | self.register_parameter('bias', None)
27 | self.register_buffer('running_mean', torch.zeros(num_features))
28 | self.register_buffer('running_var', torch.ones(num_features))
29 | self.mean = torch.zeros(num_features)
30 | self.std = torch.ones(num_features)
31 | self.reset_parameters()
32 | self.cum_queue = Queue()
33 | self.broadcast_queue = Queue()
34 | if device_ids is None:
35 | self.device_ids = list(range(torch.cuda.device_count()))
36 | else:
37 | self.device_ids = device_ids
38 |
39 | def reset_parameters(self):
40 | self.running_mean.zero_()
41 | self.running_var.fill_(1)
42 | self.mean.zero_()
43 | self.std.fill_(1)
44 | if self.affine:
45 | if BatchNormSync.checking_mode:
46 | self.weight.data.fill_(1)
47 | else:
48 | self.weight.data.uniform_()
49 | self.bias.data.zero_()
50 |
51 | def forward(self, input):
52 | training = int(self.training)
53 | assert input.size(1) == self.num_features
54 |
55 | bn_func = BatchNormPFunction(
56 | self.running_mean, self.running_var, # self.mean, self.std,
57 | training, self.cum_queue, self.broadcast_queue, self.device_ids,
58 | BatchNormSync.sync, self.eps, self.momentum, self.affine)
59 | return bn_func(input, self.weight, self.bias)
60 |
61 | def __repr__(self):
62 | return ('{name}({num_features}, eps={eps}, momentum={momentum},'
63 | ' affine={affine})'
64 | .format(name=self.__class__.__name__, **self.__dict__))
-------------------------------------------------------------------------------- /lib/src/batchnormp.c: --------------------------------------------------------------------------------
1 | #include <TH/TH.h>
2 | #include "batchnormp.h"
3 |
4 | #define THNN_CHECK_SHAPE(I1, I2) \
5 | if (I1 != NULL && I2 != NULL && !THFloatTensor_isSameSizeAs(I1, I2)) \
6 | { \
7 | THDescBuff s1 = THFloatTensor_sizeDesc(I1); \
8 | THDescBuff s2 = THFloatTensor_sizeDesc(I2); \
9 | THError(#I1 " and " #I2 " shapes do not match: " \
10 | #I1 " %s, " #I2 " %s", s1.str, s2.str); \
11 | }
12 |
13 | void BatchNormalizationP_forward(
14 | THFloatTensor *input, THFloatTensor *output,
15 | THFloatTensor *weight, THFloatTensor *bias,
16 | THFloatTensor *running_mean, THFloatTensor *running_var,
17 | THFloatTensor *save_mean, THFloatTensor *save_std,
18 | int train, double momentum, double eps)
19 | {
20 | THFloatTensor_resizeAs(output, input);
21 | int64_t nInput = THFloatTensor_size(input, 1);
22 | int64_t f;
23 | ptrdiff_t n = THFloatTensor_nElement(input) / nInput;
24 |
25 | #pragma omp parallel for
26 | for (f = 0; f < nInput; ++f) {
27 | THFloatTensor *in = THFloatTensor_newSelect(input, 1, f);
28 | THFloatTensor *out = THFloatTensor_newSelect(output, 1, f);
29 |
30 | float mean, invstd, std;
31 |
32 | if (train) {
33 | // compute mean per input
34 | // double sum = 0;
35 | // TH_TENSOR_APPLY(float, in, sum += *in_data;);
36 | //
37 | // mean = (float) sum / n;
38 | // THFloatTensor_set1d(save_mean, f, (float) mean);
39 |
40 | mean = THFloatTensor_get1d(save_mean, f);
41 | std = THFloatTensor_get1d(save_std, f);
42 | invstd = (float) (1 / (std + eps));
43 |
44 | // compute variance per input
45 | // sum = 0;
46 | // TH_TENSOR_APPLY(float, in,
47 | // sum += (*in_data - mean) * (*in_data - mean););
48 | //
49 | // if (sum == 0 && eps == 0.0) {
50 | // invstd = 0;
51 | // } else {
52 | // invstd = (float) (1 / sqrt(sum/n + eps));
53 | // }
54 | // THFloatTensor_set1d(save_std, f, (float) invstd);
55 |
56 | // update running averages
57 | THFloatTensor_set1d(running_mean, f,
58 | (float) (momentum * mean + (1 - momentum) * THFloatTensor_get1d(running_mean, f)));
59 |
60 | double unbiased_var = std * n / (n
- 1); 61 | THFloatTensor_set1d(running_var, f, 62 | (float) (momentum * unbiased_var + (1 - momentum) * THFloatTensor_get1d(running_var, f))); 63 | } else { 64 | mean = THFloatTensor_get1d(running_mean, f); 65 | invstd = 1 / sqrt(THFloatTensor_get1d(running_var, f) + eps); 66 | } 67 | 68 | // compute output 69 | float w = weight ? THFloatTensor_get1d(weight, f) : 1; 70 | float b = bias ? THFloatTensor_get1d(bias, f) : 0; 71 | 72 | TH_TENSOR_APPLY2(float, in, float, out, 73 | *out_data = (float) (((*in_data - mean) * invstd) * w + b);); 74 | 75 | THFloatTensor_free(out); 76 | THFloatTensor_free(in); 77 | } 78 | } 79 | 80 | void BatchNormalizationP_backward( 81 | THFloatTensor *input, THFloatTensor *gradOutput, THFloatTensor *gradInput, 82 | THFloatTensor *gradWeight, THFloatTensor *gradBias, THFloatTensor *weight, 83 | THFloatTensor *running_mean, THFloatTensor *running_var, 84 | THFloatTensor *save_mean, THFloatTensor *save_std, 85 | int train, double scale, double eps) 86 | { 87 | THNN_CHECK_SHAPE(input, gradOutput); 88 | int64_t nInput = THFloatTensor_size(input, 1); 89 | int64_t f; 90 | ptrdiff_t n = THFloatTensor_nElement(input) / nInput; 91 | 92 | #pragma omp parallel for 93 | for (f = 0; f < nInput; ++f) { 94 | THFloatTensor *in = THFloatTensor_newSelect(input, 1, f); 95 | THFloatTensor *gradOut = THFloatTensor_newSelect(gradOutput, 1, f); 96 | float w = weight ? THFloatTensor_get1d(weight, f) : 1; 97 | float mean, invstd; 98 | if (train) { 99 | mean = THFloatTensor_get1d(save_mean, f); 100 | invstd = 1 / (THFloatTensor_get1d(save_std, f) + eps); 101 | } else { 102 | mean = THFloatTensor_get1d(running_mean, f); 103 | invstd = 1 / sqrt(THFloatTensor_get1d(running_var, f) + eps); 104 | } 105 | 106 | // sum over all gradOutput in feature plane 107 | double sum = 0; 108 | TH_TENSOR_APPLY(float, gradOut, sum += *gradOut_data;); 109 | 110 | // dot product of the Q(X) and gradOuput 111 | double dotp = 0; 112 | TH_TENSOR_APPLY2(float, in, float, gradOut, 113 | dotp += (*in_data - mean) * (*gradOut_data);); 114 | 115 | if (gradInput) { 116 | THFloatTensor_resizeAs(gradInput, input); 117 | THFloatTensor *gradIn = THFloatTensor_newSelect(gradInput, 1, f); 118 | 119 | if (train) { 120 | // when in training mode 121 | // Q(X) = X - E[x] ; i.e. input centered to zero mean 122 | // Y = Q(X) / σ ; i.e. BN output before weight and bias 123 | // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w 124 | 125 | // projection of gradOutput on to output scaled by std 126 | float k = (float) dotp * invstd * invstd / n; 127 | TH_TENSOR_APPLY2(float, gradIn, float, in, 128 | *gradIn_data = (*in_data - mean) * k;); 129 | 130 | double gradMean = sum / n; 131 | TH_TENSOR_APPLY2(float, gradIn, float, gradOut, 132 | *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;); 133 | 134 | } else { 135 | // when in evaluation mode 136 | // Q(X) = X - running_mean ; i.e. input centered to zero mean 137 | // Y = Q(X) / running_std ; i.e. 
BN output before weight and bias 138 | // dL/dX = w / running_std 139 | TH_TENSOR_APPLY2(float, gradIn, float, gradOut, 140 | *gradIn_data = *gradOut_data * invstd * w;); 141 | } 142 | 143 | THFloatTensor_free(gradIn); 144 | } 145 | 146 | if (gradWeight) { 147 | float val = THFloatTensor_get1d(gradWeight, f); 148 | THFloatTensor_set1d(gradWeight, f, val + scale * dotp * invstd); 149 | } 150 | 151 | if (gradBias) { 152 | float val = THFloatTensor_get1d(gradBias, f); 153 | THFloatTensor_set1d(gradBias, f, val + scale * sum); 154 | } 155 | 156 | THFloatTensor_free(gradOut); 157 | THFloatTensor_free(in); 158 | } 159 | } -------------------------------------------------------------------------------- /lib/src/batchnormp.h: -------------------------------------------------------------------------------- 1 | // #include 2 | 3 | void BatchNormalizationP_forward( 4 | THFloatTensor *input, THFloatTensor *output, 5 | THFloatTensor *weight, THFloatTensor *bias, 6 | THFloatTensor *running_mean, THFloatTensor *running_var, 7 | THFloatTensor *save_mean, THFloatTensor *save_std, 8 | int train, double momentum, double eps); 9 | 10 | 11 | void BatchNormalizationP_backward( 12 | THFloatTensor *input, THFloatTensor *gradOutput, THFloatTensor *gradInput, 13 | THFloatTensor *gradWeight, THFloatTensor *gradBias, THFloatTensor *weight, 14 | THFloatTensor *running_mean, THFloatTensor *running_var, 15 | THFloatTensor *save_mean, THFloatTensor *save_std, 16 | int train, double scale, double eps); 17 | -------------------------------------------------------------------------------- /lib/src/batchnormp_cuda.c: -------------------------------------------------------------------------------- 1 | // #include "auto_gpu.h" 2 | #include 3 | 4 | #include "batchnormp_cuda_kernel.h" 5 | 6 | 7 | extern THCState *state; 8 | 9 | void BatchNormalizationP_forward_cuda( 10 | THCudaTensor *input, THCudaTensor *output, 11 | THCudaTensor *weight, THCudaTensor *bias, 12 | THCudaTensor *running_mean, THCudaTensor *running_var, 13 | THCudaTensor *save_mean, THCudaTensor *save_std, 14 | int train, double momentum, double eps) { 15 | THNN_CudaBatchNormalization_updateOutputhaha( 16 | state, input, output, weight, bias, running_mean, running_var, 17 | save_mean, save_std, train, momentum, eps); 18 | } 19 | 20 | void BatchNormalizationP_mean_cuda( 21 | THCudaTensor *input, THCudaTensor *save_mean) { 22 | THNN_CudaBatchNormalization_mean( 23 | state, input, save_mean); 24 | } 25 | 26 | 27 | void BatchNormalizationP_var_cuda( 28 | THCudaTensor *input, THCudaTensor *save_mean, THCudaTensor *save_var) { 29 | THNN_CudaBatchNormalization_var( 30 | state, input, save_mean, save_var); 31 | } 32 | 33 | 34 | void BatchNormalizationP_backward_cuda( 35 | THCudaTensor *input, THCudaTensor *gradOutput, 36 | THCudaTensor *gradOutputMean, THCudaTensor *dotP, 37 | THCudaTensor *gradInput, 38 | THCudaTensor *gradWeight, THCudaTensor *gradBias, THCudaTensor *weight, 39 | THCudaTensor *running_mean, THCudaTensor *running_var, 40 | THCudaTensor *save_mean, THCudaTensor *save_std, 41 | int train, double scale, double eps) { 42 | THNN_CudaBatchNormalization_backwardhaha( 43 | state, input, gradOutput, gradOutputMean, dotP, 44 | gradInput, gradWeight, gradBias, weight, 45 | running_mean, running_var, save_mean, save_std, train, scale, eps); 46 | } 47 | 48 | void BatchNormalizationP_mean_grad_cuda( 49 | THCudaTensor *input, THCudaTensor *gradOutput, 50 | THCudaTensor *runningMean, THCudaTensor *saveMean, 51 | THCudaTensor *gradOutputMean, THCudaTensor *dotP, int 
train) { 52 | THNN_CudaBatchNormalization_mean_grad( 53 | state, input, gradOutput, runningMean, saveMean, 54 | gradOutputMean, dotP, train); 55 | } -------------------------------------------------------------------------------- /lib/src/batchnormp_cuda.h: -------------------------------------------------------------------------------- 1 | void BatchNormalizationP_forward_cuda( 2 | THCudaTensor *input, THCudaTensor *output, 3 | THCudaTensor *weight, THCudaTensor *bias, 4 | THCudaTensor *running_mean, THCudaTensor *running_var, 5 | THCudaTensor *save_mean, THCudaTensor *save_std, 6 | int train, double momentum, double eps); 7 | 8 | 9 | void BatchNormalizationP_mean_cuda( 10 | THCudaTensor *input, THCudaTensor *save_mean); 11 | 12 | 13 | void BatchNormalizationP_var_cuda( 14 | THCudaTensor *input, THCudaTensor *save_mean, THCudaTensor *save_var); 15 | 16 | 17 | void BatchNormalizationP_backward_cuda( 18 | THCudaTensor *input, THCudaTensor *gradOutput, 19 | THCudaTensor *gradOutputMean, THCudaTensor *dotP, 20 | THCudaTensor *gradInput, 21 | THCudaTensor *gradWeight, THCudaTensor *gradBias, THCudaTensor *weight, 22 | THCudaTensor *running_mean, THCudaTensor *running_var, 23 | THCudaTensor *save_mean, THCudaTensor *save_std, 24 | int train, double scale, double eps); 25 | 26 | 27 | void BatchNormalizationP_mean_grad_cuda( 28 | THCudaTensor *input, THCudaTensor *gradOutput, 29 | THCudaTensor *runningMean, THCudaTensor *saveMean, 30 | THCudaTensor *gradOutputMean, THCudaTensor *dotP, int train); -------------------------------------------------------------------------------- /lib/src/batchnormp_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | #include "THCHalf.h" 4 | #include "THCHalfAutoNumerics.cuh" 5 | 6 | #include "THCDeviceTensor.cuh" 7 | #include "THCDeviceTensorUtils.cuh" 8 | #include "THCDeviceUtils.cuh" 9 | const int WARP_SIZE = 32; 10 | 11 | // The maximum number of threads in a block 12 | const int MAX_BLOCK_SIZE = 512; 13 | 14 | // Number of threads in a block given an input size up to MAX_BLOCK_SIZE 15 | static int getNumThreads(int nElem) { 16 | int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE }; 17 | for (int i = 0; i != 5; ++i) { 18 | if (nElem <= threadSizes[i]) { 19 | return threadSizes[i]; 20 | } 21 | } 22 | return MAX_BLOCK_SIZE; 23 | } 24 | 25 | // Returns the index of the most significant 1 bit in `val`. 
26 | __device__ __forceinline__ int getMSB(int val) { 27 | return 31 - __clz(val); 28 | } 29 | 30 | template 31 | struct Float2 { 32 | Acctype v1, v2; 33 | __device__ Float2() {} 34 | __device__ Float2(Dtype v1, Dtype v2) : v1(ScalarConvert::to(v1)), v2(ScalarConvert::to(v2)) {} 35 | __device__ Float2(Dtype v) : v1(ScalarConvert::to(v)), v2(ScalarConvert::to(v)) {} 36 | __device__ Float2(int v) : v1(ScalarConvert::to(v)), v2(ScalarConvert::to(v)) {} 37 | __device__ Float2& operator+=(const Float2& a) { 38 | v1 += a.v1; 39 | v2 += a.v2; 40 | return *this; 41 | } 42 | }; 43 | 44 | template 45 | struct SumOp { 46 | __device__ SumOp(const DeviceTensor3 t) : tensor(t) {} 47 | __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) { 48 | return ScalarConvert::to(tensor[batch][plane][n]); 49 | } 50 | const DeviceTensor3 tensor; 51 | }; 52 | 53 | template 54 | struct VarOp { 55 | __device__ VarOp(Acctype m, const DeviceTensor3 t) : mean(m), tensor(t) {} 56 | __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) { 57 | Dtype val = tensor[batch][plane][n]; 58 | return (val - mean) * (val - mean); 59 | } 60 | const Acctype mean; 61 | const DeviceTensor3 tensor; 62 | }; 63 | 64 | template 65 | struct GradOp { 66 | __device__ GradOp(Acctype m, const DeviceTensor3 i, const DeviceTensor3 g) 67 | : mean(m), input(i), gradOutput(g) {} 68 | __device__ __forceinline__ Float2 operator()(int batch, int plane, int n) { 69 | Dtype g = gradOutput[batch][plane][n]; 70 | Dtype c = ScalarConvert::to(input[batch][plane][n] - mean); 71 | return Float2(g, g * c); 72 | } 73 | const Acctype mean; 74 | const DeviceTensor3 input; 75 | const DeviceTensor3 gradOutput; 76 | }; 77 | 78 | // Sum across all threads within a warp 79 | template 80 | static __device__ __forceinline__ T warpSum(T val) { 81 | #if __CUDA_ARCH__ >= 300 82 | for (int i = 0; i < getMSB(WARP_SIZE); ++i) { 83 | val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE); 84 | } 85 | #else 86 | __shared__ T values[MAX_BLOCK_SIZE]; 87 | values[threadIdx.x] = val; 88 | __threadfence_block(); 89 | const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE; 90 | for (int i = 1; i < WARP_SIZE; i++) { 91 | val += values[base + ((i + threadIdx.x) % WARP_SIZE)]; 92 | } 93 | #endif 94 | return val; 95 | } 96 | 97 | template 98 | static __device__ __forceinline__ Float2 warpSum(Float2 value) { 99 | value.v1 = warpSum(value.v1); 100 | value.v2 = warpSum(value.v2); 101 | return value; 102 | } 103 | 104 | // Sum across (batch, x/y/z) applying Op() pointwise 105 | template 106 | __device__ T reduce(Op op, DeviceTensor3 tensor, int plane) { 107 | T sum = (T)0; 108 | for (int batch = 0; batch < tensor.getSize(0); ++batch) { 109 | for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) { 110 | sum += op(batch, plane, x); 111 | } 112 | } 113 | 114 | // sum over NumThreads within a warp 115 | sum = warpSum(sum); 116 | 117 | // 'transpose', and reduce within warp again 118 | __shared__ T shared[32]; 119 | __syncthreads(); 120 | if (threadIdx.x % WARP_SIZE == 0) { 121 | shared[threadIdx.x / WARP_SIZE] = sum; 122 | } 123 | if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) { 124 | // zero out the other entries in shared 125 | shared[threadIdx.x] = (T)0; 126 | } 127 | __syncthreads(); 128 | if (threadIdx.x / WARP_SIZE == 0) { 129 | sum = warpSum(shared[threadIdx.x]); 130 | if (threadIdx.x == 0) { 131 | shared[0] = sum; 132 | } 133 | } 134 | __syncthreads(); 135 | 136 | // Everyone picks it up, should be broadcast into the whole 
gradInput 137 | return shared[0]; 138 | } 139 | 140 | template 141 | __global__ void BatchNormalizationUpdateOutputInference_kernel( 142 | const DeviceTensor3 input, 143 | DeviceTensor3 output, 144 | DeviceTensor1 runningMean, 145 | DeviceTensor1 runningVar, 146 | const DeviceTensor1 weight, 147 | const DeviceTensor1 bias, 148 | Acctype epsilon) { 149 | 150 | int plane = blockIdx.x; 151 | 152 | Acctype invstd = Acctype(1) / sqrt(runningVar[plane].ldg() + epsilon); 153 | Acctype mean = ScalarConvert::to(runningMean[plane].ldg()); 154 | Acctype gamma = weight.numElements() > 0 ? ScalarConvert::to(weight[plane].ldg()) : Acctype(1); 155 | Acctype beta = bias.numElements() > 0 ? ScalarConvert::to(bias[plane].ldg()) : Acctype(0); 156 | 157 | // Write normalized and update the output 158 | for (int batch = 0; batch < input.getSize(0); batch++) { 159 | for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) { 160 | Dtype inp = input[batch][plane][x].ldg(); 161 | output[batch][plane][x] = ScalarConvert::to(gamma * (inp - mean) * invstd + beta); 162 | } 163 | } 164 | } 165 | 166 | template 167 | __global__ void BatchNormalizationMean_kernel( 168 | const DeviceTensor3 input, 169 | DeviceTensor1 out_mean) { 170 | int plane = blockIdx.x; 171 | int N = input.getSize(0) * input.getSize(2); 172 | 173 | Acctype norm = Acctype(1) / N; 174 | Acctype mean = reduce(SumOp(input), input, plane) * norm; 175 | if (threadIdx.x == 0) { 176 | out_mean[plane] = ScalarConvert::to(mean); 177 | } 178 | } 179 | 180 | 181 | template 182 | __global__ void BatchNormalizationVar_kernel( 183 | const DeviceTensor3 input, 184 | const DeviceTensor1 in_mean, 185 | DeviceTensor1 out_var) { 186 | int plane = blockIdx.x; 187 | int N = input.getSize(0) * input.getSize(2); 188 | 189 | Acctype norm = Acctype(1) / N; 190 | Acctype mean = ScalarConvert::to(in_mean[plane]); 191 | 192 | Acctype var = reduce(VarOp(mean, input), input, plane) * norm; 193 | if (threadIdx.x == 0) { 194 | out_var[plane] = ScalarConvert::to(var); 195 | } 196 | } 197 | 198 | template 199 | __global__ void BatchNormalizationUpdateOutput_kernelhaha( 200 | const DeviceTensor3 input, 201 | DeviceTensor3 output, 202 | const DeviceTensor1 weight, 203 | const DeviceTensor1 bias, 204 | const Acctype epsilon, 205 | const Acctype momentum, 206 | DeviceTensor1 runningMean, 207 | DeviceTensor1 runningVar, 208 | DeviceTensor1 saveMean, 209 | DeviceTensor1 saveVar) { 210 | 211 | 212 | int plane = blockIdx.x; 213 | int N = input.getSize(0) * input.getSize(2); 214 | 215 | 216 | // Compute the mean and variance across (batch, x/y/z) 217 | 218 | /* Acctype norm = Acctype(1) / N; 219 | Acctype mean = reduce(SumOp(input), input, plane) * norm; 220 | __syncthreads(); 221 | Acctype varN = reduce(VarOp(mean, input), input, plane); 222 | Acctype invStd = 0; 223 | if (varN != Acctype(0) || epsilon != Acctype(0)) { 224 | invStd = 1 / sqrt(varN * norm + epsilon); 225 | } */ 226 | 227 | Acctype mean = ScalarConvert::to(saveMean[plane]); 228 | Acctype var = ScalarConvert::to(saveVar[plane]); 229 | Acctype invStd = 1 / sqrt(var + epsilon); 230 | 231 | // Save the mean, variance, and moving averages 232 | if (threadIdx.x == 0) { 233 | // Momentum based writeback 234 | // Acctype unbiasedVar = varN / (N - 1); 235 | Acctype unbiasedVar = var * N / (N - 1); 236 | // saveMean[plane] = ScalarConvert::to(mean); 237 | // saveStd[plane] = ScalarConvert::to(invStd); 238 | runningMean[plane] = ScalarConvert::to((1 - momentum) * runningMean[plane] + momentum * mean); 239 | runningVar[plane] = 
ScalarConvert::to((1 - momentum) * runningVar[plane] + momentum * unbiasedVar); 240 | } 241 | 242 | // Write normalized and update the output 243 | Acctype gamma = weight.numElements() > 0 ? ScalarConvert::to(weight[plane]) : ScalarConvert::to(1); 244 | Acctype beta = bias.numElements() > 0 ? ScalarConvert::to(bias[plane]) : ScalarConvert::to(0); 245 | for (int batch = 0; batch < input.getSize(0); ++batch) { 246 | for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) { 247 | Dtype inp = input[batch][plane][x].ldg(); 248 | output[batch][plane][x] = ScalarConvert::to(gamma * (inp - mean) * invStd + beta); 249 | } 250 | } 251 | } 252 | 253 | 254 | template 255 | __global__ void BatchNormalizationMeanGrad_kernel( 256 | const DeviceTensor3 input, 257 | const DeviceTensor3 gradOutput, 258 | const DeviceTensor1 runningMean, 259 | const DeviceTensor1 saveMean, 260 | DeviceTensor1 gradOutputMean_all, 261 | DeviceTensor1 dotP_all, 262 | bool train) { 263 | int plane = blockIdx.x; 264 | int N = gradOutput.getSize(0) * gradOutput.getSize(2); 265 | 266 | Acctype mean; 267 | if (train) { 268 | mean = ScalarConvert::to(saveMean[plane]); 269 | } else { 270 | mean = ScalarConvert::to(runningMean[plane]); 271 | } 272 | 273 | Acctype norm = Acctype(1) / N; 274 | GradOp g(mean, input, gradOutput); 275 | Float2 res = reduce, GradOp, DeviceTensor3>(g, gradOutput, plane); 276 | Acctype gradOutputMean = res.v1 * norm; 277 | Acctype dotP = res.v2 * norm; 278 | 279 | if (threadIdx.x == 0) { 280 | gradOutputMean_all[plane] = ScalarConvert::to(gradOutputMean); 281 | dotP_all[plane] = ScalarConvert::to(dotP); 282 | } 283 | } 284 | 285 | template 286 | __global__ void BatchNormalizationBackward_kernel( 287 | const DeviceTensor3 input, 288 | const DeviceTensor3 gradOutput, 289 | const DeviceTensor1 gradOutputMean, 290 | const DeviceTensor1 dotP_all, 291 | DeviceTensor3 gradInput, 292 | DeviceTensor1 gradWeight, 293 | DeviceTensor1 gradBias, 294 | const DeviceTensor1 weight, 295 | const DeviceTensor1 runningMean, 296 | const DeviceTensor1 runningVar, 297 | const DeviceTensor1 saveMean, 298 | const DeviceTensor1 saveVar, 299 | bool train, 300 | Acctype scale, 301 | double eps) { 302 | 303 | int plane = blockIdx.x; 304 | int N = gradOutput.getSize(0) * gradOutput.getSize(2); 305 | 306 | Acctype mean, stdVal; 307 | if (train) { 308 | mean = ScalarConvert::to(saveMean[plane]); 309 | stdVal = 1 / sqrt(ScalarConvert::to(saveVar[plane]) + eps); 310 | } else { 311 | mean = ScalarConvert::to(runningMean[plane]); 312 | stdVal = 1 / sqrt(runningVar[plane] + eps); 313 | } 314 | 315 | Acctype weightVal = weight.numElements() > 0 ? ScalarConvert::to(weight[plane]) : Acctype(1); 316 | // Acctype norm = Acctype(1) / N; 317 | 318 | // Compute two values across (batch, x/y/z) in one pass: 319 | // 1. Sum(gradOutput) 320 | // 2. 
DotProduct(input - mean, gradOutput) 321 | // GradOp g(mean, input, gradOutput); 322 | // Float2 res = reduce, GradOp, DeviceTensor3>(g, gradOutput, plane); 323 | // Acctype gradOutputSum = res.v1; 324 | Acctype gradOutputSum = ScalarConvert::to(gradOutputMean[plane]) * N; 325 | // Acctype dotP = res.v2; 326 | Acctype dotP = ScalarConvert::to(dotP_all[plane]); 327 | 328 | // Acctype gradMean = gradOutputSum * norm; 329 | Acctype gradMean = ScalarConvert::to(gradOutputMean[plane]); 330 | // Acctype projScale = dotP * norm * stdVal * stdVal; 331 | Acctype projScale = dotP * stdVal * stdVal; 332 | Acctype gradScale = stdVal * weightVal; 333 | 334 | if (gradInput.numElements() > 0) { 335 | for (int batch = 0; batch < gradOutput.getSize(0); ++batch) { 336 | for (int x = threadIdx.x; x < gradOutput.getSize(2); x += blockDim.x) { 337 | Dtype gradOut = gradOutput[batch][plane][x]; 338 | if (train) { 339 | Dtype inp = input[batch][plane][x]; 340 | Acctype proj = (inp - mean) * projScale; 341 | gradInput[batch][plane][x] = ScalarConvert::to((gradOut - proj - gradMean) * gradScale); 342 | } else { 343 | gradInput[batch][plane][x] = ScalarConvert::to(gradOut * gradScale); 344 | } 345 | } 346 | } 347 | } 348 | 349 | if (gradWeight.numElements() > 0) { 350 | if (threadIdx.x == 0) { 351 | gradWeight[plane] += ScalarConvert::to(scale * dotP * stdVal); 352 | } 353 | } 354 | 355 | if (gradBias.numElements() > 0) { 356 | if (threadIdx.x == 0) { 357 | gradBias[plane] += ScalarConvert::to(scale * gradOutputSum); 358 | } 359 | } 360 | } 361 | 362 | #include "generic/batchnormp_cuda.cu" 363 | #include "THCGenerateFloatTypes.h" -------------------------------------------------------------------------------- /lib/src/batchnormp_cuda_kernel.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void THNN_CudaBatchNormalization_updateOutputhaha( 4 | THCState *state, THCudaTensor *input_, THCudaTensor *output_, 5 | THCudaTensor *weight_, THCudaTensor *bias_, THCudaTensor *runningMean_, 6 | THCudaTensor *runningVar_, THCudaTensor *saveMean_, THCudaTensor *saveStd_, 7 | int train, double momentum, double eps); 8 | 9 | 10 | void THNN_CudaBatchNormalization_backwardhaha( 11 | THCState *state, THCudaTensor *input_, THCudaTensor *gradOutput_, 12 | THCudaTensor *gradOutputMean_, THCudaTensor *dotP, 13 | THCudaTensor *gradInput_, THCudaTensor *gradWeight_, THCudaTensor *gradBias_, 14 | THCudaTensor *weight_, THCudaTensor *runningMean_, THCudaTensor *runningVar_, 15 | THCudaTensor *saveMean_, THCudaTensor *saveStd_, int train, double scale, 16 | double eps); -------------------------------------------------------------------------------- /lib/src/generic/batchnormp_cuda.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/batchnormp_cuda.cu" 3 | #else 4 | 5 | #define DeviceTensor3 THCDeviceTensor 6 | #define DeviceTensor1 THCDeviceTensor 7 | 8 | template 9 | static THCDeviceTensor devicetensor(THCState *state, THCTensor *t) { 10 | if (!t) { 11 | return THCDeviceTensor(); 12 | } 13 | 14 | int inDim = THCTensor_(nDimension)(state, t); 15 | if (inDim == Dim) { 16 | return toDeviceTensor(state, t); 17 | } 18 | 19 | // View in which the last dimensions are collapsed or expanded as needed 20 | THAssert(THCTensor_(isContiguous)(state, t)); 21 | int size[Dim]; 22 | for (int i = 0; i < Dim || i < inDim; ++i) { 23 | if (i < Dim && i < inDim) { 24 | size[i] = t->size[i]; 25 | } else if (i < 
Dim) { 26 | size[i] = 1; 27 | } else { 28 | size[Dim - 1] *= t->size[i]; 29 | } 30 | } 31 | return THCDeviceTensor(THCTensor_(data)(state, t), size); 32 | } 33 | 34 | extern "C" void THNN_(BatchNormalization_updateOutputhaha)( 35 | THCState *state, THCTensor *input_, THCTensor *output_, 36 | THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_, 37 | THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_, 38 | int train, double momentum, double eps); 39 | 40 | extern "C" void THNN_(BatchNormalization_mean)( 41 | THCState *state, THCTensor *input_, THCTensor *saveMean_); 42 | 43 | extern "C" void THNN_(BatchNormalization_var)( 44 | THCState *state, THCTensor *input_, THCTensor *saveMean_, 45 | THCTensor *saveVar_); 46 | 47 | 48 | void THNN_(BatchNormalization_mean)( 49 | THCState *state, THCTensor *input_, THCTensor *saveMean_) { 50 | DeviceTensor3 input = devicetensor<3>(state, input_); 51 | DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); 52 | 53 | cudaStream_t s = THCState_getCurrentStream(state); 54 | cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state); 55 | 56 | dim3 blocks(input.getSize(1)); 57 | dim3 threads(getNumThreads(input.getSize(2))); 58 | BatchNormalizationMean_kernel <<>>( 59 | input, saveMean); 60 | THCudaCheck(cudaGetLastError()); 61 | } 62 | 63 | void THNN_(BatchNormalization_var)( 64 | THCState *state, THCTensor *input_, THCTensor *saveMean_, THCTensor *saveVar_) { 65 | DeviceTensor3 input = devicetensor<3>(state, input_); 66 | DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); 67 | DeviceTensor1 saveVar = devicetensor<1>(state, saveVar_); 68 | 69 | cudaStream_t s = THCState_getCurrentStream(state); 70 | cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state); 71 | 72 | dim3 blocks(input.getSize(1)); 73 | dim3 threads(getNumThreads(input.getSize(2))); 74 | BatchNormalizationVar_kernel <<>>( 75 | input, saveMean, saveVar); 76 | THCudaCheck(cudaGetLastError()); 77 | } 78 | 79 | void THNN_(BatchNormalization_updateOutputhaha)( 80 | THCState *state, THCTensor *input_, THCTensor *output_, 81 | THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_, 82 | THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_, 83 | int train, double momentum, double eps) { 84 | 85 | THCTensor_(resizeAs)(state, output_, input_); 86 | DeviceTensor3 input = devicetensor<3>(state, input_); 87 | DeviceTensor3 output = devicetensor<3>(state, output_); 88 | DeviceTensor1 weight = devicetensor<1>(state, weight_); 89 | DeviceTensor1 bias = devicetensor<1>(state, bias_); 90 | DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); 91 | DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); 92 | DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); 93 | DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); 94 | 95 | cudaStream_t s = THCState_getCurrentStream(state); 96 | cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state); 97 | 98 | if (!train) { 99 | dim3 blocks(input.getSize(1)); 100 | dim3 threads(getNumThreads(input.getSize(2))); 101 | BatchNormalizationUpdateOutputInference_kernel <<>>( 102 | input, output, runningMean, runningVar, weight, bias, eps); 103 | } else { 104 | dim3 blocks(input.getSize(1)); 105 | dim3 threads(getNumThreads(input.getSize(2))); 106 | BatchNormalizationUpdateOutput_kernelhaha <<>>( 107 | input, output, weight, bias, eps, momentum, runningMean, runningVar, 108 | saveMean, saveStd); 109 | } 110 | THCudaCheck(cudaGetLastError()); 111 | } 112 | 113 | extern 
"C" void THNN_(BatchNormalization_backwardhaha)( 114 | THCState *state, THCTensor *input_, THCTensor *gradOutput_, 115 | THCTensor *gradOutputMean_, THCTensor *dotP, 116 | THCTensor *gradInput_, THCTensor *gradWeight_, THCTensor *gradBias_, 117 | THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_, 118 | THCTensor *saveMean_, THCTensor *saveStd_, int train, double scale, double eps); 119 | 120 | 121 | extern "C" void THNN_(BatchNormalization_mean_grad)( 122 | THCState *state, THCTensor *input_, THCTensor *gradOutput_, 123 | THCTensor *runningMean_, THCTensor *saveMean_, 124 | THCTensor *gradOutputMean_, THCTensor *dotP_, int train); 125 | 126 | 127 | void THNN_(BatchNormalization_mean_grad)( 128 | THCState *state, THCTensor *input_, THCTensor *gradOutput_, 129 | THCTensor *runningMean_, THCTensor *saveMean_, 130 | THCTensor *gradOutputMean_, THCTensor *dotP_, int train) { 131 | 132 | THCUNN_check_shape(state, input_, gradOutput_); 133 | DeviceTensor3 input = devicetensor<3>(state, input_); 134 | DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_); 135 | DeviceTensor1 gradOutputMean = devicetensor<1>(state, gradOutputMean_); 136 | DeviceTensor1 dotP = devicetensor<1>(state, dotP_); 137 | 138 | DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); 139 | DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); 140 | 141 | cudaStream_t s = THCState_getCurrentStream(state); 142 | 143 | dim3 blocks(gradOutput.getSize(1)); 144 | dim3 threads(getNumThreads(gradOutput.getSize(2))); 145 | BatchNormalizationMeanGrad_kernel <<>>( 146 | input, gradOutput, runningMean, saveMean, gradOutputMean, dotP, train); 147 | THCudaCheck(cudaGetLastError()); 148 | } 149 | 150 | 151 | void THNN_(BatchNormalization_backwardhaha)( 152 | THCState *state, THCTensor *input_, THCTensor *gradOutput_, 153 | THCTensor *gradOutputMean_, THCTensor *dotP_, 154 | THCTensor *gradInput_, THCTensor *gradWeight_, THCTensor *gradBias_, 155 | THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_, 156 | THCTensor *saveMean_, THCTensor *saveStd_, int train, double scale, double eps) { 157 | 158 | THCUNN_check_shape(state, input_, gradOutput_); 159 | DeviceTensor3 input = devicetensor<3>(state, input_); 160 | DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_); 161 | DeviceTensor1 gradOutputMean = devicetensor<1>(state, gradOutputMean_); 162 | DeviceTensor1 dotP = devicetensor<1>(state, dotP_); 163 | DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_); 164 | DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_); 165 | DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_); 166 | DeviceTensor1 weight = devicetensor<1>(state, weight_); 167 | DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); 168 | DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); 169 | DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); 170 | DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); 171 | 172 | cudaStream_t s = THCState_getCurrentStream(state); 173 | 174 | dim3 blocks(gradOutput.getSize(1)); 175 | dim3 threads(getNumThreads(gradOutput.getSize(2))); 176 | BatchNormalizationBackward_kernel <<>>( 177 | input, gradOutput, gradOutputMean, dotP, gradInput, gradWeight, gradBias, weight, runningMean, runningVar, 178 | saveMean, saveStd, train, scale, eps); 179 | THCudaCheck(cudaGetLastError()); 180 | } 181 | 182 | #undef DeviceTensor3 183 | #undef DeviceTensor1 184 | 185 | #endif 186 | 
--------------------------------------------------------------------------------
/lib/test.py:
--------------------------------------------------------------------------------
1 | import pdb
2 | import time
3 | import logging
4 | 
5 | import torch
6 | from torch.autograd import Variable
7 | from torch.autograd import gradcheck
8 | 
9 | from modules import batchnormsync
10 | 
11 | FORMAT = "[%(asctime)-15s %(filename)s:%(lineno)d %(funcName)s] %(message)s"
12 | logging.basicConfig(format=FORMAT)
13 | logger = logging.getLogger(__name__)
14 | logger.setLevel(logging.DEBUG)
15 | 
16 | batchnormsync.BatchNormSync.checking_mode = True
17 | batchnormsync.BatchNormSync.sync = True
18 | 
19 | cuda = True
20 | batch_size = 3
21 | input = torch.randn(3, 3, 2, 2).float()
22 | # input = torch.Tensor(range(60 * batch_size)).float().resize_(batch_size, 3, 2, 2) / 100
23 | bn = batchnormsync.BatchNormSync(3, eps=0, affine=True,
24 |                                  device_ids=None)
25 | bn2 = torch.nn.BatchNorm2d(3, eps=0, affine=False)
26 | # bn.train()
27 | 
28 | bn1 = batchnormsync.BatchNormSync(3, eps=0, affine=True, device_ids=[0])
29 | 
30 | bn1.train()
31 | 
32 | if cuda:
33 |     bn = torch.nn.DataParallel(bn)
34 |     bn2 = torch.nn.DataParallel(bn2)
35 | 
36 |     bn = bn.cuda()
37 |     bn1 = bn1.cuda()
38 |     bn2 = bn2.cuda()
39 |     input = input.cuda()
40 | 
41 | 
42 | inputs = (Variable(input, requires_grad=True),)
43 | # output = bn(inputs[0])
44 | 
45 | # output1 = bn1(inputs[0])
46 | # output2 = bn2(inputs[0])
47 | # print((output1 - output2).abs().max())
48 | # print((output - output2).abs().max())
49 | # test = gradcheck(bn, inputs, eps=1e-4, atol=1e-4, rtol=1e-8)
50 | for i in range(1000):
51 |     logger.info(i)
52 |     start_time = time.time()
53 |     test = gradcheck(bn, inputs, eps=1e-4, atol=1e-2, rtol=1e-3)
54 |     logger.info('%s %f', test, time.time() - start_time)
55 | 
--------------------------------------------------------------------------------
/scripts/pre-commit.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | OPTIONS="--show-source"
4 | 
5 | RETURN=0
6 | PYSTYLE=$(which pycodestyle)
7 | if [ $? -ne 0 ]; then
8 |     echo "[!] pycodestyle not installed. Unable to check source file format policy." >&2
9 |     exit 1
10 | fi
11 | 
12 | FILES=`git diff --cached --name-only --diff-filter=ACMR | grep -E "\.(py)$"`
13 | for FILE in $FILES; do
14 |     $PYSTYLE $OPTIONS $FILE >&2
15 |     if [ $? -ne 0 ]; then
16 |         echo "[!] $FILE does not respect pep8." >&2
17 |         RETURN=1
18 |     fi
19 | done
20 | 
21 | if [ $RETURN -eq 1 ]; then
22 |     exit 1
23 | fi
24 | 
25 | CPPSTYLE=$(which cpplint)
26 | CPPOPTIONS=""
27 | if [ -z "$CPPSTYLE" ]; then  # $? here would test the CPPOPTIONS assignment, not which
28 |     echo "[!] cpplint not installed. Unable to check source file format policy." >&2
29 |     exit 1
30 | fi
31 | 
32 | FILES=`git diff --cached --name-only --diff-filter=ACMR | grep -E "\.(c|h|cpp|hpp|cu)$"`
33 | for FILE in $FILES; do
34 |     $CPPSTYLE $CPPOPTIONS $FILE >&2
35 |     if [ $? -ne 0 ]; then
36 |         echo "[!] $FILE does not respect google code style." >&2
37 |         RETURN=1
38 |     fi
39 | done
40 | 
41 | exit $RETURN
42 | 
--------------------------------------------------------------------------------
/scripts/setup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | if ! [ -x "$(command -v pycodestyle)" ]; then
4 |     pip install pycodestyle
5 | fi
6 | 
7 | if !
[ -x "$(command -v cpplint)" ]; then 8 | pip install cpplint 9 | fi 10 | 11 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 12 | PC_PATH=$DIR/../.git/hooks/pre-commit 13 | if [ -f $PC_PATH ]; then 14 | rm $PC_PATH 15 | fi 16 | ln -s $DIR/pre-commit.sh $DIR/../.git/hooks/pre-commit 17 | chmod +x $DIR/pre-commit.sh 18 | -------------------------------------------------------------------------------- /segment.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import threading 6 | from os.path import exists, join, split, dirname 7 | 8 | import time 9 | 10 | import numpy as np 11 | import shutil 12 | 13 | import sys 14 | from PIL import Image 15 | import torch 16 | import torch.utils.data 17 | from torch import nn 18 | import torch.backends.cudnn as cudnn 19 | from torch.autograd import Variable 20 | 21 | import dla_up 22 | import data_transforms as transforms 23 | import dataset 24 | 25 | try: 26 | from modules import batchnormsync 27 | HAS_BN_SYNC = True 28 | except ImportError: 29 | HAS_BN_SYNC = False 30 | 31 | FORMAT = "[%(asctime)-15s %(filename)s:%(lineno)d %(funcName)s] %(message)s" 32 | logging.basicConfig(format=FORMAT) 33 | logger = logging.getLogger(__name__) 34 | logger.setLevel(logging.DEBUG) 35 | 36 | CITYSCAPE_PALLETE = np.asarray([ 37 | [128, 64, 128], 38 | [244, 35, 232], 39 | [70, 70, 70], 40 | [102, 102, 156], 41 | [190, 153, 153], 42 | [153, 153, 153], 43 | [250, 170, 30], 44 | [220, 220, 0], 45 | [107, 142, 35], 46 | [152, 251, 152], 47 | [70, 130, 180], 48 | [220, 20, 60], 49 | [255, 0, 0], 50 | [0, 0, 142], 51 | [0, 0, 70], 52 | [0, 60, 100], 53 | [0, 80, 100], 54 | [0, 0, 230], 55 | [119, 11, 32], 56 | [0, 0, 0]], dtype=np.uint8) 57 | 58 | 59 | class SegList(torch.utils.data.Dataset): 60 | def __init__(self, data_dir, phase, transforms, list_dir=None, 61 | out_name=False, out_size=False, binary=False): 62 | self.list_dir = data_dir if list_dir is None else list_dir 63 | self.data_dir = data_dir 64 | self.out_name = out_name 65 | self.phase = phase 66 | self.transforms = transforms 67 | self.image_list = None 68 | self.label_list = None 69 | self.bbox_list = None 70 | self.out_size = out_size 71 | self.binary = binary 72 | self.read_lists() 73 | 74 | def __getitem__(self, index): 75 | image = Image.open(join(self.data_dir, self.image_list[index])) 76 | data = [image] 77 | if self.label_list is not None: 78 | label_map = Image.open(join(self.data_dir, self.label_list[index])) 79 | if self.binary: 80 | label_map = Image.fromarray( 81 | (np.array(label_map) > 0).astype(np.uint8)) 82 | data.append(label_map) 83 | if self.bbox_list is not None: 84 | data.append(Image.open(join(self.data_dir, self.bbox_list[index]))) 85 | data = list(self.transforms(*data)) 86 | if self.out_name: 87 | if self.label_list is None: 88 | data.append(data[0][0, :, :]) 89 | data.append(self.image_list[index]) 90 | if self.out_size: 91 | data.append(torch.from_numpy(np.array(image.size, dtype=int))) 92 | return tuple(data) 93 | 94 | def __len__(self): 95 | return len(self.image_list) 96 | 97 | def read_lists(self): 98 | image_path = join(self.list_dir, self.phase + '_images.txt') 99 | label_path = join(self.list_dir, self.phase + '_labels.txt') 100 | bbox_path = join(self.list_dir, self.phase + '_bboxes.txt') 101 | assert exists(image_path) 102 | self.image_list = [line.strip() for line in open(image_path, 'r')] 103 | if exists(label_path): 104 | self.label_list = [line.strip() for 
line in open(label_path, 'r')] 105 | assert len(self.image_list) == len(self.label_list) 106 | if exists(bbox_path): 107 | self.bbox_list = [line.strip() for line in open(bbox_path, 'r')] 108 | assert len(self.image_list) == len(self.bbox_list) 109 | 110 | 111 | class SegListMS(torch.utils.data.Dataset): 112 | def __init__(self, data_dir, phase, transforms, scales, list_dir=None): 113 | self.list_dir = data_dir if list_dir is None else list_dir 114 | self.data_dir = data_dir 115 | self.phase = phase 116 | self.transforms = transforms 117 | self.image_list = None 118 | self.label_list = None 119 | self.bbox_list = None 120 | self.read_lists() 121 | self.scales = scales 122 | 123 | def __getitem__(self, index): 124 | data = [Image.open(join(self.data_dir, self.image_list[index]))] 125 | w, h = data[0].size 126 | if self.label_list is not None: 127 | data.append(Image.open(join(self.data_dir, 128 | self.label_list[index]))) 129 | # data = list(self.transforms(*data)) 130 | if len(data) > 1: 131 | out_data = list(self.transforms(*data)) 132 | else: 133 | out_data = [self.transforms(*data)] 134 | ms_images = [self.transforms(data[0].resize((int(w * s), int(h * s)), 135 | Image.BICUBIC)) 136 | for s in self.scales] 137 | out_data.append(self.image_list[index]) 138 | out_data.extend(ms_images) 139 | return tuple(out_data) 140 | 141 | def __len__(self): 142 | return len(self.image_list) 143 | 144 | def read_lists(self): 145 | image_path = join(self.list_dir, self.phase + '_images.txt') 146 | label_path = join(self.list_dir, self.phase + '_labels.txt') 147 | assert exists(image_path) 148 | self.image_list = [line.strip() for line in open(image_path, 'r')] 149 | if exists(label_path): 150 | self.label_list = [line.strip() for line in open(label_path, 'r')] 151 | assert len(self.image_list) == len(self.label_list) 152 | 153 | 154 | def validate(val_loader, model, criterion, eval_score=None, print_freq=10): 155 | batch_time = AverageMeter() 156 | losses = AverageMeter() 157 | score = AverageMeter() 158 | 159 | # switch to evaluate mode 160 | model.eval() 161 | 162 | end = time.time() 163 | for i, (input, target) in enumerate(val_loader): 164 | if type(criterion) in [torch.nn.modules.loss.L1Loss, 165 | torch.nn.modules.loss.MSELoss]: 166 | target = target.float() 167 | input = input.cuda() 168 | target = target.cuda(async=True) 169 | input_var = torch.autograd.Variable(input, volatile=True) 170 | target_var = torch.autograd.Variable(target, volatile=True) 171 | 172 | # compute output 173 | output = model(input_var)[0] 174 | loss = criterion(output, target_var) 175 | 176 | # measure accuracy and record loss 177 | # prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 178 | losses.update(loss.data[0], input.size(0)) 179 | if eval_score is not None: 180 | score.update(eval_score(output, target_var), input.size(0)) 181 | 182 | # measure elapsed time 183 | batch_time.update(time.time() - end) 184 | end = time.time() 185 | 186 | if i % print_freq == 0: 187 | print('Test: [{0}/{1}]\t' 188 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 189 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 190 | 'Score {score.val:.3f} ({score.avg:.3f})'.format( 191 | i, len(val_loader), batch_time=batch_time, loss=losses, 192 | score=score), flush=True) 193 | 194 | print(' * Score {top1.avg:.3f}'.format(top1=score)) 195 | 196 | return score.avg 197 | 198 | 199 | class AverageMeter(object): 200 | """Computes and stores the average and current value""" 201 | 202 | def __init__(self): 203 | self.reset() 204 | 205 | def 
reset(self): 206 | self.val = 0 207 | self.avg = 0 208 | self.sum = 0 209 | self.count = 0 210 | 211 | def update(self, val, n=1): 212 | self.val = val 213 | self.sum += val * n 214 | self.count += n 215 | self.avg = self.sum / self.count 216 | 217 | 218 | def accuracy(output, target): 219 | """Computes the precision@k for the specified values of k""" 220 | # batch_size = target.size(0) * target.size(1) * target.size(2) 221 | _, pred = output.max(1) 222 | pred = pred.view(1, -1) 223 | target = target.view(1, -1) 224 | correct = pred.eq(target) 225 | correct = correct[target != 255] 226 | correct = correct.view(-1) 227 | score = correct.float().sum(0).mul(100.0 / correct.size(0)) 228 | return score.data[0] 229 | 230 | 231 | def train(train_loader, model, criterion, optimizer, epoch, 232 | eval_score=None, print_freq=10): 233 | batch_time = AverageMeter() 234 | data_time = AverageMeter() 235 | losses = AverageMeter() 236 | scores = AverageMeter() 237 | 238 | # switch to train mode 239 | model.train() 240 | 241 | end = time.time() 242 | 243 | for i, (input, target) in enumerate(train_loader): 244 | # measure data loading time 245 | data_time.update(time.time() - end) 246 | 247 | # pdb.set_trace() 248 | 249 | if type(criterion) in [torch.nn.modules.loss.L1Loss, 250 | torch.nn.modules.loss.MSELoss]: 251 | target = target.float() 252 | 253 | input = input.cuda() 254 | target = target.cuda(async=True) 255 | input_var = torch.autograd.Variable(input) 256 | target_var = torch.autograd.Variable(target) 257 | 258 | # compute output 259 | output = model(input_var)[0] 260 | loss = criterion(output, target_var) 261 | 262 | # measure accuracy and record loss 263 | # prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 264 | losses.update(loss.data[0], input.size(0)) 265 | if eval_score is not None: 266 | scores.update(eval_score(output, target_var), input.size(0)) 267 | 268 | # compute gradient and do SGD step 269 | optimizer.zero_grad() 270 | loss.backward() 271 | optimizer.step() 272 | 273 | # measure elapsed time 274 | batch_time.update(time.time() - end) 275 | end = time.time() 276 | 277 | if i % print_freq == 0: 278 | print('Epoch: [{0}][{1}/{2}]\t' 279 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 280 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 281 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 282 | 'Score {top1.val:.3f} ({top1.avg:.3f})'.format( 283 | epoch, i, len(train_loader), batch_time=batch_time, 284 | data_time=data_time, loss=losses, top1=scores)) 285 | 286 | 287 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 288 | torch.save(state, filename) 289 | if is_best: 290 | shutil.copyfile(filename, 'model_best.pth.tar') 291 | 292 | 293 | def train_seg(args): 294 | batch_size = args.batch_size 295 | num_workers = args.workers 296 | crop_size = args.crop_size 297 | 298 | print(' '.join(sys.argv)) 299 | 300 | for k, v in args.__dict__.items(): 301 | print(k, ':', v) 302 | 303 | pretrained_base = args.pretrained_base 304 | single_model = dla_up.__dict__.get(args.arch)( 305 | args.classes, pretrained_base, down_ratio=args.down) 306 | model = torch.nn.DataParallel(single_model).cuda() 307 | if args.edge_weight > 0: 308 | weight = torch.from_numpy( 309 | np.array([1, args.edge_weight], dtype=np.float32)) 310 | criterion = nn.NLLLoss2d(ignore_index=255, weight=weight) 311 | else: 312 | criterion = nn.NLLLoss2d(ignore_index=255) 313 | 314 | criterion.cuda() 315 | 316 | data_dir = args.data_dir 317 | info = dataset.load_dataset_info(data_dir) 318 | normalize = 
transforms.Normalize(mean=info.mean, std=info.std) 319 | t = [] 320 | if args.random_rotate > 0: 321 | t.append(transforms.RandomRotate(args.random_rotate)) 322 | if args.random_scale > 0: 323 | t.append(transforms.RandomScale(args.random_scale)) 324 | t.append(transforms.RandomCrop(crop_size)) 325 | if args.random_color: 326 | t.append(transforms.RandomJitter(0.4, 0.4, 0.4)) 327 | t.extend([transforms.RandomHorizontalFlip(), 328 | transforms.ToTensor(), 329 | normalize]) 330 | train_loader = torch.utils.data.DataLoader( 331 | SegList(data_dir, 'train', transforms.Compose(t), 332 | binary=(args.classes == 2)), 333 | batch_size=batch_size, shuffle=True, num_workers=num_workers, 334 | pin_memory=True 335 | ) 336 | val_loader = torch.utils.data.DataLoader( 337 | SegList(data_dir, 'val', transforms.Compose([ 338 | transforms.RandomCrop(crop_size), 339 | # transforms.RandomHorizontalFlip(), 340 | transforms.ToTensor(), 341 | normalize, 342 | ]), binary=(args.classes == 2)), 343 | batch_size=batch_size, shuffle=False, num_workers=num_workers, 344 | pin_memory=True 345 | ) 346 | optimizer = torch.optim.SGD(single_model.optim_parameters(), 347 | args.lr, 348 | momentum=args.momentum, 349 | weight_decay=args.weight_decay) 350 | cudnn.benchmark = True 351 | best_prec1 = 0 352 | start_epoch = 0 353 | 354 | # optionally resume from a checkpoint 355 | if args.resume: 356 | if os.path.isfile(args.resume): 357 | print("=> loading checkpoint '{}'".format(args.resume)) 358 | checkpoint = torch.load(args.resume) 359 | start_epoch = checkpoint['epoch'] 360 | best_prec1 = checkpoint['best_prec1'] 361 | model.load_state_dict(checkpoint['state_dict']) 362 | print("=> loaded checkpoint '{}' (epoch {})" 363 | .format(args.resume, checkpoint['epoch'])) 364 | else: 365 | print("=> no checkpoint found at '{}'".format(args.resume)) 366 | 367 | if args.evaluate: 368 | validate(val_loader, model, criterion, eval_score=accuracy) 369 | return 370 | 371 | for epoch in range(start_epoch, args.epochs): 372 | lr = adjust_learning_rate(args, optimizer, epoch) 373 | print('Epoch: [{0}]\tlr {1:.06f}'.format(epoch, lr)) 374 | # train for one epoch 375 | train(train_loader, model, criterion, optimizer, epoch, 376 | eval_score=accuracy) 377 | 378 | # evaluate on validation set 379 | prec1 = validate(val_loader, model, criterion, eval_score=accuracy) 380 | 381 | is_best = prec1 > best_prec1 382 | best_prec1 = max(prec1, best_prec1) 383 | checkpoint_path = 'checkpoint_latest.pth.tar' 384 | save_checkpoint({ 385 | 'epoch': epoch + 1, 386 | 'arch': args.arch, 387 | 'state_dict': model.state_dict(), 388 | 'best_prec1': best_prec1, 389 | }, is_best, filename=checkpoint_path) 390 | if (epoch + 1) % args.save_freq == 0: 391 | history_path = 'checkpoint_{:03d}.pth.tar'.format(epoch + 1) 392 | shutil.copyfile(checkpoint_path, history_path) 393 | 394 | 395 | def adjust_learning_rate(args, optimizer, epoch): 396 | """Sets the learning rate to the initial LR decayed by 10 397 | every 30 epochs""" 398 | if args.lr_mode == 'step': 399 | lr = args.lr * (0.1 ** (epoch // args.step)) 400 | elif args.lr_mode == 'poly': 401 | lr = args.lr * (1 - epoch / args.epochs) ** 0.9 402 | else: 403 | raise ValueError('Unknown lr mode {}'.format(args.lr_mode)) 404 | 405 | for param_group in optimizer.param_groups: 406 | param_group['lr'] = lr 407 | return lr 408 | 409 | 410 | def fast_hist(pred, label, n): 411 | k = (label >= 0) & (label < n) 412 | return np.bincount( 413 | n * label[k].astype(int) + pred[k], minlength=n ** 2).reshape(n, n) 414 | 415 | 416 | 
def per_class_iu(hist): 417 | return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) 418 | 419 | 420 | def crop_image(image, size): 421 | left = (image.size[0] - size[0]) // 2 422 | upper = (image.size[1] - size[1]) // 2 423 | right = left + size[0] 424 | lower = upper + size[1] 425 | return image.crop((left, upper, right, lower)) 426 | 427 | 428 | def save_output_images(predictions, filenames, output_dir, sizes=None): 429 | """ 430 | Saves a given (B x C x H x W) into an image file. 431 | If given a mini-batch tensor, will save the tensor as a grid of images. 432 | """ 433 | # pdb.set_trace() 434 | for ind in range(len(filenames)): 435 | im = Image.fromarray(predictions[ind].astype(np.uint8)) 436 | if sizes is not None: 437 | im = crop_image(im, sizes[ind]) 438 | fn = os.path.join(output_dir, filenames[ind][:-4] + '.png') 439 | out_dir = split(fn)[0] 440 | if not exists(out_dir): 441 | os.makedirs(out_dir) 442 | im.save(fn) 443 | 444 | 445 | def save_prob_images(prob, filenames, output_dir, sizes=None): 446 | for ind in range(len(filenames)): 447 | im = Image.fromarray( 448 | (prob[ind][1].squeeze().data.cpu().numpy() * 255).astype(np.uint8)) 449 | if sizes is not None: 450 | im = crop_image(im, sizes[ind]) 451 | fn = os.path.join(output_dir, filenames[ind][:-4] + '.png') 452 | out_dir = split(fn)[0] 453 | if not exists(out_dir): 454 | os.makedirs(out_dir) 455 | im.save(fn) 456 | 457 | 458 | def save_colorful_images(predictions, filenames, output_dir, palettes): 459 | """ 460 | Saves a given (B x C x H x W) into an image file. 461 | If given a mini-batch tensor, will save the tensor as a grid of images. 462 | """ 463 | for ind in range(len(filenames)): 464 | im = Image.fromarray(palettes[predictions[ind].squeeze()]) 465 | fn = os.path.join(output_dir, filenames[ind][:-4] + '.png') 466 | out_dir = split(fn)[0] 467 | if not exists(out_dir): 468 | os.makedirs(out_dir) 469 | im.save(fn) 470 | 471 | 472 | def test(eval_data_loader, model, num_classes, 473 | output_dir='pred', has_gt=True, save_vis=False): 474 | model.eval() 475 | batch_time = AverageMeter() 476 | data_time = AverageMeter() 477 | end = time.time() 478 | hist = np.zeros((num_classes, num_classes)) 479 | for iter, (image, label, name, size) in enumerate(eval_data_loader): 480 | data_time.update(time.time() - end) 481 | image_var = Variable(image, requires_grad=False, volatile=True) 482 | final = model(image_var)[0] 483 | _, pred = torch.max(final, 1) 484 | pred = pred.cpu().data.numpy() 485 | batch_time.update(time.time() - end) 486 | prob = torch.exp(final) 487 | if save_vis: 488 | save_output_images(pred, name, output_dir, size) 489 | if prob.size(1) == 2: 490 | save_prob_images(prob, name, output_dir + '_prob', size) 491 | else: 492 | save_colorful_images(pred, name, output_dir + '_color', 493 | CITYSCAPE_PALLETE) 494 | if has_gt: 495 | label = label.numpy() 496 | hist += fast_hist(pred.flatten(), label.flatten(), num_classes) 497 | print('===> mAP {mAP:.3f}'.format( 498 | mAP=round(np.nanmean(per_class_iu(hist)) * 100, 2))) 499 | end = time.time() 500 | print('Eval: [{0}/{1}]\t' 501 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 502 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 503 | .format(iter, len(eval_data_loader), batch_time=batch_time, 504 | data_time=data_time)) 505 | ious = per_class_iu(hist) * 100 506 | print(' '.join('{:.03f}'.format(i) for i in ious)) 507 | if has_gt: # val 508 | return round(np.nanmean(ious), 2) 509 | 510 | 511 | def resize_4d_tensor(tensor, width, height): 512 | 
tensor_cpu = tensor.cpu().numpy() 513 | if tensor.size(2) == height and tensor.size(3) == width: 514 | return tensor_cpu 515 | out_size = (tensor.size(0), tensor.size(1), height, width) 516 | out = np.empty(out_size, dtype=np.float32) 517 | 518 | def resize_one(i, j): 519 | out[i, j] = np.array( 520 | Image.fromarray(tensor_cpu[i, j]).resize( 521 | (width, height), Image.BILINEAR)) 522 | 523 | def resize_channel(j): 524 | for i in range(tensor.size(0)): 525 | out[i, j] = np.array( 526 | Image.fromarray(tensor_cpu[i, j]).resize( 527 | (width, height), Image.BILINEAR)) 528 | 529 | workers = [threading.Thread(target=resize_channel, args=(j,)) 530 | for j in range(tensor.size(1))] 531 | for w in workers: 532 | w.start() 533 | for w in workers: 534 | w.join() 535 | return out 536 | 537 | 538 | def test_ms(eval_data_loader, model, num_classes, scales, 539 | output_dir='pred', has_gt=True, save_vis=False): 540 | model.eval() 541 | batch_time = AverageMeter() 542 | data_time = AverageMeter() 543 | end = time.time() 544 | hist = np.zeros((num_classes, num_classes)) 545 | num_scales = len(scales) 546 | for iter, input_data in enumerate(eval_data_loader): 547 | data_time.update(time.time() - end) 548 | if has_gt: 549 | name = input_data[2] 550 | label = input_data[1] 551 | else: 552 | name = input_data[1] 553 | h, w = input_data[0].size()[2:4] 554 | images = [input_data[0]] 555 | images.extend(input_data[-num_scales:]) 556 | outputs = [] 557 | for image in images: 558 | image_var = Variable(image, requires_grad=False, volatile=True) 559 | final = model(image_var)[0] 560 | outputs.append(final.data) 561 | final = sum([resize_4d_tensor(out, w, h) for out in outputs]) 562 | pred = final.argmax(axis=1) 563 | batch_time.update(time.time() - end) 564 | if save_vis: 565 | save_output_images(pred, name, output_dir) 566 | save_colorful_images(pred, name, output_dir + '_color', 567 | CITYSCAPE_PALLETE) 568 | if has_gt: 569 | label = label.numpy() 570 | hist += fast_hist(pred.flatten(), label.flatten(), num_classes) 571 | logger.info('===> mAP {mAP:.3f}'.format( 572 | mAP=round(np.nanmean(per_class_iu(hist)) * 100, 2))) 573 | end = time.time() 574 | logger.info('Eval: [{0}/{1}]\t' 575 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 576 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 577 | .format(iter, len(eval_data_loader), batch_time=batch_time, 578 | data_time=data_time)) 579 | if has_gt: # val 580 | ious = per_class_iu(hist) * 100 581 | logger.info(' '.join('{:.03f}'.format(i) for i in ious)) 582 | return round(np.nanmean(ious), 2) 583 | 584 | 585 | def test_seg(args): 586 | batch_size = args.batch_size 587 | num_workers = args.workers 588 | phase = args.phase 589 | 590 | for k, v in args.__dict__.items(): 591 | print(k, ':', v) 592 | 593 | single_model = dla_up.__dict__.get(args.arch)( 594 | args.classes, down_ratio=args.down) 595 | 596 | model = torch.nn.DataParallel(single_model).cuda() 597 | 598 | data_dir = args.data_dir 599 | info = dataset.load_dataset_info(data_dir) 600 | normalize = transforms.Normalize(mean=info.mean, std=info.std) 601 | # scales = [0.5, 0.75, 1.25, 1.5, 1.75] 602 | scales = [0.5, 0.75, 1.25, 1.5] 603 | t = [] 604 | if args.crop_size > 0: 605 | t.append(transforms.PadToSize(args.crop_size)) 606 | t.extend([transforms.ToTensor(), normalize]) 607 | if args.ms: 608 | data = SegListMS(data_dir, phase, transforms.Compose(t), scales) 609 | else: 610 | data = SegList(data_dir, phase, transforms.Compose(t), 611 | out_name=True, out_size=True, 612 | binary=args.classes == 2) 
613 |     test_loader = torch.utils.data.DataLoader(
614 |         data,
615 |         batch_size=batch_size, shuffle=False, num_workers=num_workers,
616 |         pin_memory=False
617 |     )
618 | 
619 |     cudnn.benchmark = True
620 | 
621 |     # optionally resume from a checkpoint
622 |     start_epoch = 0
623 |     if args.resume:
624 |         if os.path.isfile(args.resume):
625 |             print("=> loading checkpoint '{}'".format(args.resume))
626 |             checkpoint = torch.load(args.resume)
627 |             start_epoch = checkpoint['epoch']
628 |             best_prec1 = checkpoint['best_prec1']
629 |             model.load_state_dict(checkpoint['state_dict'])
630 |             print("=> loaded checkpoint '{}' (epoch {})"
631 |                   .format(args.resume, checkpoint['epoch']))
632 |         else:
633 |             print("=> no checkpoint found at '{}'".format(args.resume))
634 | 
635 |     out_dir = '{}_{:03d}_{}'.format(args.arch, start_epoch, phase)
636 |     if len(args.test_suffix) > 0:
637 |         out_dir += '_' + args.test_suffix
638 | 
639 |     if args.ms:
640 |         out_dir += '_ms'
641 | 
642 |     if args.ms:
643 |         mAP = test_ms(test_loader, model, args.classes, save_vis=True,
644 |                       has_gt=phase != 'test' or args.with_gt,
645 |                       output_dir=out_dir,
646 |                       scales=scales)
647 |     else:
648 |         mAP = test(test_loader, model, args.classes, save_vis=True,
649 |                    has_gt=phase != 'test' or args.with_gt, output_dir=out_dir)
650 |     print('mAP: ', mAP)
651 | 
652 | 
653 | def parse_args():
654 |     # Training settings
655 |     parser = argparse.ArgumentParser(
656 |         description='DLA Segmentation and Boundary Prediction')
657 |     parser.add_argument('cmd', choices=['train', 'test'])
658 |     parser.add_argument('-d', '--data-dir', default=None)
659 |     parser.add_argument('-c', '--classes', default=0, type=int)
660 |     parser.add_argument('-s', '--crop-size', default=0, type=int)
661 |     parser.add_argument('--step', type=int, default=200)
662 |     parser.add_argument('--arch')
663 |     parser.add_argument('--batch-size', type=int, default=64, metavar='N',
664 |                         help='input batch size for training (default: 64)')
665 |     parser.add_argument('--train-samples', default=16000, type=int)
666 |     parser.add_argument('--loss', default='l1', type=str)
667 |     parser.add_argument('--test-batch-size', type=int, default=1000,
668 |                         metavar='N',
669 |                         help='input batch size for testing (default: 1000)')
670 |     parser.add_argument('--epochs', type=int, default=10, metavar='N',
671 |                         help='number of epochs to train (default: 10)')
672 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
673 |                         help='learning rate (default: 0.01)')
674 |     parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
675 |                         help='SGD momentum (default: 0.9)')
676 |     parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
677 |                         metavar='W', help='weight decay (default: 1e-4)')
678 |     parser.add_argument('-e', '--evaluate', dest='evaluate',
679 |                         action='store_true',
680 |                         help='evaluate model on validation set')
681 |     parser.add_argument('--no-cuda', action='store_true', default=False,
682 |                         help='disables CUDA training')
683 |     parser.add_argument('--seed', type=int, default=1, metavar='S',
684 |                         help='random seed (default: 1)')
685 |     parser.add_argument('--log-interval', type=int, default=1, metavar='N',
686 |                         help='how many batches to wait before logging '
687 |                              'training status')
688 |     parser.add_argument('--resume', default='', type=str, metavar='PATH',
689 |                         help='path to latest checkpoint (default: none)')
690 |     parser.add_argument('--pretrained-base', default=None,
691 |                         help='use pre-trained model')
692 |     parser.add_argument('-j', '--workers', type=int, default=8)
693 | 
parser.add_argument('--down', default=2, type=int, choices=[2, 4, 8, 16], 694 | help='Downsampling ratio of IDA network output, which ' 695 | 'is then upsampled to the original resolution ' 696 | 'with bilinear interpolation.') 697 | parser.add_argument('--load-release', dest='load_rel', default=None) 698 | parser.add_argument('--phase', default='val') 699 | parser.add_argument('--lr-mode', default='step') 700 | parser.add_argument('--bn-sync', action='store_true', default=False) 701 | parser.add_argument('--random-scale', default=0, type=float) 702 | parser.add_argument('--random-rotate', default=0, type=int) 703 | parser.add_argument('--random-color', action='store_true', default=False) 704 | parser.add_argument('--save-freq', default=10, type=int) 705 | parser.add_argument('--ms', action='store_true', default=False) 706 | parser.add_argument('--edge-weight', type=int, default=-1) 707 | parser.add_argument('--test-suffix', default='') 708 | parser.add_argument('--with-gt', action='store_true') 709 | args = parser.parse_args() 710 | args.cuda = not args.no_cuda and torch.cuda.is_available() 711 | 712 | assert args.data_dir is not None 713 | assert args.classes > 0 714 | 715 | print(' '.join(sys.argv)) 716 | print(args) 717 | 718 | return args 719 | 720 | 721 | def main(): 722 | args = parse_args() 723 | if args.bn_sync: 724 | if HAS_BN_SYNC: 725 | dla_up.set_bn(batchnormsync.BatchNormSync) 726 | else: 727 | print('batch normalization synchronization across GPUs ' 728 | 'is not imported.') 729 | if args.cmd == 'train': 730 | train_seg(args) 731 | elif args.cmd == 'test': 732 | test_seg(args) 733 | 734 | 735 | if __name__ == '__main__': 736 | main() 737 | --------------------------------------------------------------------------------
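A note on how `segment.py` scores predictions: `fast_hist` packs every valid `(label, pred)` pixel pair into a single flat index `n * label + pred` and bin-counts the pairs into an `n x n` confusion matrix, and `per_class_iu` then reads each class IoU off the diagonal. The two functions below are copied verbatim from `segment.py`; the toy arrays are an editorial illustration, with 255 as the ignore index, as in the training code.

```
import numpy as np


def fast_hist(pred, label, n):
    # keep only valid pixels; labels outside [0, n) such as 255 are ignored
    k = (label >= 0) & (label < n)
    return np.bincount(
        n * label[k].astype(int) + pred[k], minlength=n ** 2).reshape(n, n)


def per_class_iu(hist):
    # IoU = TP / (TP + FP + FN): diagonal over (row sum + column sum - diagonal)
    return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))


label = np.array([0, 0, 1, 1, 2, 255])  # ground truth, 255 = ignore
pred = np.array([0, 1, 1, 1, 2, 0])     # predicted classes
hist = fast_hist(pred, label, n=3)      # rows index labels, columns predictions
print(per_class_iu(hist))               # [0.5, 0.6667, 1.0]
print(round(np.nanmean(per_class_iu(hist)) * 100, 2))  # 72.22, the score reported above
```

Despite the `mAP` label in the log messages, the number `segment.py` reports is the mean of these per-class IoUs.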