import argparse
import shutil
import time
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import models
import numpy as np
from PIL import Image
import os
import os.path
import sys
import resnet
if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle


# Architectures accepted by --arch: every lowercase, non-dunder callable
# exported by the project-local `models` module.
# NOTE(review): main() builds the network from `resnet.__dict__`, not from
# `models` -- confirm both modules expose the same architecture names.
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

# Command-line interface. Help texts use argparse's %(default)s placeholder
# where they previously hard-coded a default that disagreed with the real one.
parser = argparse.ArgumentParser(description='PyTorch PENCIL Training (CIFAR-10)')
parser.add_argument('--arch', '-a', metavar='ARCH', default='preact_resnet32',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: %(default)s)')
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: %(default)s)')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=128, type=int,
                    metavar='N', help='mini-batch size (default: %(default)s)')
parser.add_argument('--lr', '--learning-rate', default=0.06, type=float,
                    metavar='H-P', help='initial learning rate')
parser.add_argument('--lr2', '--learning-rate2', default=0.2, type=float,
                    metavar='H-P', help='initial learning rate of stage3')
parser.add_argument('--alpha', default=0.4, type=float,
                    metavar='H-P', help='the coefficient of Compatibility Loss')
parser.add_argument('--beta', default=0.1, type=float,
                    metavar='H-P', help='the coefficient of Entropy Loss')
parser.add_argument('--lambda1', default=600, type=int,
                    metavar='H-P', help='the value of lambda')
parser.add_argument('--stage1', default=70, type=int,
                    metavar='H-P', help='number of epochs until the end of stage1')
parser.add_argument('--stage2', default=200, type=int,
                    metavar='H-P', help='number of epochs until the end of stage2')
parser.add_argument('--epochs', default=320, type=int, metavar='H-P',
                    help='number of total epochs to run')
parser.add_argument('--datanum', default=45000, type=int,
                    metavar='H-P', help='number of train dataset samples')
parser.add_argument('--classnum', default=10, type=int,
                    metavar='H-P', help='number of train dataset classes')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('--print-freq', '-p', default=50, type=int,
                    metavar='N', help='print frequency (default: %(default)s)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', default=False, dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=1, type=int,
                    help='number of distributed processes')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='gloo', type=str,
                    help='distributed backend')
parser.add_argument('--gpu', dest='gpu', default='0', type=str,
                    help='select gpu')
# Output paths are built as `args.dir + "<name>"`, so a non-empty value must
# end with a path separator.
parser.add_argument('--dir', dest='dir', default='', type=str, metavar='PATH',
                    help='model dir')

# Best validation precision@1 seen so far; updated by main().
best_prec1 = 0
class CIFAR10(torch.utils.data.Dataset):
    """CIFAR-10 batches split into train / test / validation sets.

    The ``train`` argument selects the split:

    * ``0`` -- training set, concatenated from every file in ``train_list``;
      items are ``(image, target, index)`` so the caller can address
      per-sample state by dataset index.
    * ``1`` -- test set from ``test_list``; items are ``(image, target)``.
    * anything else -- validation set from ``val_list``; items are
      ``(image, target)``.

    Files are read from ``<root>/cifar-10-batches-py/``. The number of samples
    is taken from the files themselves rather than being hard-coded, so a
    shortened ``data_batch_5`` (with the rest moved to ``val_batch``) loads
    as-is.

    Args:
        root: directory containing ``cifar-10-batches-py`` (``~`` is expanded).
        train: split selector, see above.
        transform: optional callable applied to the PIL image.
        target_transform: optional callable applied to the label.
    """

    base_folder = 'cifar-10-batches-py'
    url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    filename = "cifar-10-python.tar.gz"
    tgz_md5 = 'c58f30108f718f92721af3b95e74349a'
    # [file name, md5] pairs; the checksums are not verified by this class.
    train_list = [
        ['data_batch_1', 'c99cafc152244af753f735de768cd75f'],
        ['data_batch_2', 'd4bba439e000b95fd0a9bffe97cbabec'],
        ['data_batch_3', '54ebc095f3ab1f0389bbae665268c751'],
        ['data_batch_4', '634d18415352ddfa80567beed471001a'],
        ['data_batch_5', '482c414d41f54cd18b22e5b47cb7c3cb'],
    ]

    # val_dataset is from data_batch_5

    val_list = [
        ['val_batch', '482c414d41f54cd18b22e5b47cb7c3cb'],
    ]

    test_list = [
        ['test_batch', '40351d587109b95175f43aff81a1287e'],
    ]

    def __init__(self, root, train=0,
                 transform=None, target_transform=None):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # split selector: 0 = train, 1 = test, else = val

        # now load the pickled numpy arrays
        if self.train == 0:
            chunks = []
            self.train_labels = []
            for fentry in self.train_list:
                data, labels = self._load_batch(fentry[0])
                chunks.append(data)
                self.train_labels += labels
            self.train_data = self._to_hwc(np.concatenate(chunks))
        elif self.train == 1:
            data, self.test_labels = self._load_batch(self.test_list[0][0])
            self.test_data = self._to_hwc(data)
        else:
            data, self.val_labels = self._load_batch(self.val_list[0][0])
            self.val_data = self._to_hwc(data)

    def _load_batch(self, name):
        """Return ``(data, labels)`` read from the batch file called *name*."""
        path = os.path.join(self.root, self.base_folder, name)
        # SECURITY: pickle executes arbitrary code while loading; only point
        # `root` at batch files you trust.
        with open(path, 'rb') as fo:
            if sys.version_info[0] == 2:
                entry = pickle.load(fo)
            else:
                entry = pickle.load(fo, encoding='latin1')
        labels = entry['labels'] if 'labels' in entry else entry['fine_labels']
        return entry['data'], labels

    @staticmethod
    def _to_hwc(flat):
        """Reshape flat rows to (N, 3, 32, 32) and reorder to (N, 32, 32, 3)."""
        return flat.reshape((-1, 3, 32, 32)).transpose((0, 2, 3, 1))

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target, index) for the training split, otherwise
            (image, target), where target is index of the target class.
        """
        if self.train == 0:
            img, target = self.train_data[index], self.train_labels[index]
        elif self.train == 1:
            img, target = self.test_data[index], self.test_labels[index]
        else:
            img, target = self.val_data[index], self.val_labels[index]

        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img)

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        if self.train == 0:
            return img, target, index
        return img, target

    def __len__(self):
        if self.train == 0:
            return len(self.train_data)
        elif self.train == 1:
            return len(self.test_data)
        return len(self.val_data)
def main():
    """Parse the command line, build model and data loaders, then train.

    Every output path is built as ``args.dir + "<name>"``, so a non-empty
    ``--dir`` must end with a path separator. Outputs are ``y.npy``,
    ``record/``, ``checkpoint.pth.tar`` and ``model_best.pth.tar``.

    If ``checkpoint.pth.tar`` already exists, training resumes from it.
    With ``--evaluate`` the model is only evaluated on the test loader.
    Otherwise each epoch sets the learning rate, trains, evaluates on the
    validation and test loaders, and writes a checkpoint; the best model is
    selected by validation precision@1.
    """
    global args, best_prec1
    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    y_file = args.dir + "y.npy"

    # Create output folders only when missing: a bare os.makedirs() raises on
    # an existing directory (which made the resume branch below unreachable on
    # a rerun) and on the empty default --dir.
    for folder in (args.dir, args.dir + 'record'):
        if folder and not os.path.isdir(folder):
            os.makedirs(folder)

    model = resnet.__dict__[args.arch]()
    model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    checkpoint_dir = args.dir + "checkpoint.pth.tar"
    modelbest_dir = args.dir + "model_best.pth.tar"

    # optionally resume from a checkpoint
    if os.path.isfile(checkpoint_dir):
        print("=> loading checkpoint '{}'".format(checkpoint_dir))
        checkpoint = torch.load(checkpoint_dir)
        args.start_epoch = checkpoint['epoch']
        best_prec1 = checkpoint['best_prec1']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})"
              .format(checkpoint_dir, checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(checkpoint_dir))

    cudnn.benchmark = True

    # Data loading code: augmentation for training, plain normalization for
    # evaluation.
    transform1 = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.492, 0.482, 0.446), std=(0.247, 0.244, 0.262)),
    ])
    transform2 = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.492, 0.482, 0.446), std=(0.247, 0.244, 0.262)),
    ])
    trainset = CIFAR10(root='./', train=0, transform=transform1)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                              shuffle=True, num_workers=args.workers, pin_memory=True)
    testset = CIFAR10(root='./', train=1, transform=transform2)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size,
                                             shuffle=False, num_workers=args.workers, pin_memory=True)

    valset = CIFAR10(root='./', train=2, transform=transform2)
    valloader = torch.utils.data.DataLoader(valset, batch_size=args.batch_size,
                                            shuffle=False, num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(testloader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):

        adjust_learning_rate(optimizer, epoch)

        # load y_tilde written by the previous epoch, if any
        if os.path.isfile(y_file):
            y = np.load(y_file)
        else:
            y = []

        # train for one epoch
        train(trainloader, model, criterion, optimizer, epoch, y)

        # evaluate on validation set (used for model selection) and on the
        # test set (reported only)
        prec1 = validate(valloader, model, criterion)
        validate(testloader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best, filename=checkpoint_dir, modelbest=modelbest_dir)
def train(train_loader, model, criterion, optimizer, epoch, y):
    """Run one training epoch and persist the per-sample label logits.

    The epoch number selects the stage (thresholds come from the global
    ``args``):

    * ``epoch < args.stage1``: plain classification loss on the given labels;
      the label logits ``y_tilde`` are initialised to ``10 * one_hot(label)``.
    * ``args.stage1 <= epoch < args.stage2``: loss is
      ``lc + alpha * lo + beta * le`` and ``y_tilde`` is updated with its own
      gradient, scaled by ``args.lambda1``.
    * ``epoch >= args.stage2``: only ``lc`` against the stored ``y_tilde`` is
      used; ``y_tilde`` is no longer updated or saved.

    Args:
        train_loader: yields ``(input, target, index)`` batches.
        model: network to train (called on the CPU batch as loaded).
        criterion: classification loss taking ``(logits, target)``.
        optimizer: optimizer stepping the model parameters.
        epoch: zero-based epoch number.
        y: array of shape ``(args.datanum, args.classnum)`` holding the
            current ``y_tilde``; only read when ``epoch >= args.stage1``.

    Side effects:
        While ``epoch < args.stage2``, writes ``args.dir + "y.npy"`` and
        ``args.dir + "record/y_<epoch>.npy"``.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    # Parameter-free activations, built once instead of once per batch.
    logsoftmax = nn.LogSoftmax(dim=1).cuda()
    softmax = nn.Softmax(dim=1).cuda()

    end = time.time()

    # new y is y_tilde after updating
    new_y = np.zeros([args.datanum, args.classnum])

    for i, (input, target, index) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        index = index.numpy()

        # `non_blocking` replaces the old `async` keyword argument, which is
        # a reserved word (SyntaxError) from Python 3.7 on.
        target1 = target.cuda(non_blocking=True)

        # compute output
        output = model(input)

        if epoch < args.stage1:
            # lc is classification loss
            lc = criterion(output, target1)
            # init y_tilde so that softmax(y_tilde) is (nearly) the noisy
            # one-hot label; 10.0 is the logit magnitude, not a class count.
            onehot = torch.zeros(target.size(0), args.classnum).scatter_(
                1, target.view(-1, 1), 10.0)
            new_y[index, :] = onehot.numpy()
            loss = lc
        else:
            yy = torch.FloatTensor(y[index, :]).cuda(non_blocking=True)
            yy.requires_grad_(True)  # leaf tensor: its grad drives the label update
            # obtain label distributions (y_hat)
            last_y_var = softmax(yy)
            lc = torch.mean(softmax(output) * (logsoftmax(output) - torch.log(last_y_var)))
            # lo is compatibility loss
            # NOTE(review): main() passes nn.CrossEntropyLoss, which applies
            # log-softmax itself, so last_y_var (already a softmax) is
            # normalised twice here -- confirm this is intended.
            lo = criterion(last_y_var, target1)
            # le is entropy loss
            le = -torch.mean(torch.mul(softmax(output), logsoftmax(output)))

            if epoch < args.stage2:
                loss = lc + args.alpha * lo + args.beta * le
            else:
                loss = lc

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target1, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if args.stage1 <= epoch < args.stage2:
            # update y_tilde by back-propagation
            yy.data.sub_(args.lambda1 * yy.grad.data)
            new_y[index, :] = yy.data.cpu().numpy()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))

    if epoch < args.stage2:
        # save y_tilde (latest copy plus a per-epoch record)
        y = new_y
        y_file = args.dir + "y.npy"
        np.save(y_file, y)
        y_record = args.dir + "record/y_%03d.npy" % epoch
        np.save(y_record, y)


def validate(val_loader, model, criterion):
    """Evaluate *model* on *val_loader* and return the average precision@1.

    Args:
        val_loader: yields ``(input, target)`` batches.
        model: network to evaluate; switched to eval mode.
        criterion: loss taking ``(logits, target)``; only reported.

    Returns:
        The running average of precision@1 (in percent) over the loader.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    # No gradients are needed for evaluation; skip building autograd graphs.
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1[0], input.size(0))
            top5.update(prec5[0], input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time, loss=losses,
                          top1=top1, top5=top5))

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg
def save_checkpoint(state, is_best, filename='', modelbest=''):
    """Serialize *state* to *filename*; copy it to *modelbest* when *is_best*."""
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, modelbest)


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the last value, running sum, sample count and average."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record *val* as the mean of *n* new samples and refresh the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch):
    """Set the learning rate of every parameter group for *epoch*.

    Uses the global ``args``: ``args.lr`` until ``args.stage2``; afterwards the
    remaining epochs are split in three and use ``args.lr2``, ``args.lr2 / 10``
    and ``args.lr2 / 100`` in turn.
    """
    if epoch < args.stage2:
        lr = args.lr
    elif epoch < (args.epochs - args.stage2) // 3 + args.stage2:
        lr = args.lr2
    elif epoch < 2 * (args.epochs - args.stage2) // 3 + args.stage2:
        # True division: `lr2 // 10` floors a float rate such as 0.2 to 0.0,
        # which silently stopped learning.
        lr = args.lr2 / 10
    else:
        lr = args.lr2 / 100
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k

    Args:
        output: score tensor indexed ``[sample, class]``.
        target: tensor of true class indices, one per sample.
        topk: iterable of k values.

    Returns:
        A list with one single-element tensor per k, holding the percentage
        of samples whose target is among the k highest-scoring classes.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # reshape, not view: the slice of the transposed tensor is not
        # contiguous, and view() rejects that on newer PyTorch releases.
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


if __name__ == '__main__':
    main()
12 | + So the validation set is taken from `data_batch_5`; the validation set and the part of `data_batch_5` left for training have 5000 samples each 13 | + Add symmetric noise on CIFAR-10: `python addnoise_SN.py` 14 | + Add asymmetric noise on CIFAR-10: `python addnoise_AN.py` 15 | + `PENCIL.py` is used for both training a model on a dataset with noisy labels and validating it 16 | 17 | ## Options 18 | + `b`: batch size 19 | + `lr`: initial learning rate of stage1 20 | + `lr2`: initial learning rate of stage3 21 | + `alpha`: the coefficient of Compatibility Loss 22 | + `beta`: the coefficient of Entropy Loss 23 | + `lambda1`: the value of lambda 24 | + `stage1`: number of epochs until the end of stage1 25 | + `stage2`: number of epochs until the end of stage2 26 | + `epochs`: number of total epochs to run 27 | + `datanum`: number of train dataset samples 28 | + `classnum`: number of train dataset classes 29 | 30 | ## The framework of PENCIL 31 | 
32 | 33 |
34 | ## The proportion of correct labels on CIFAR-10 35 |
36 | 37 | 38 |
39 | 40 | ## The results on real-world dataset Clothing1M 41 | |#|method|Test Accuracy (%)| 42 | |---|:--:|:---:| 43 | |1|Cross Entropy Loss|68.94| 44 | |2|Forward [1]|69.84| 45 | |3|Tanaka *et al*. [2]|72.16| 46 | |4|PENCIL|**73.49**| 47 | ## Citing this repository 48 | If you find this code useful in your research, please consider citing us: 49 | 50 | ``` 51 | @inproceedings{PENCIL_CVPR_2019, 52 | author = {Kun, Yi and Jianxin, Wu}, 53 | title = {{Probabilistic End-to-end Noise Correction for Learning with Noisy Labels}}, 54 | booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 55 | year = {2019} 56 | } 57 | ``` 58 | ## Reference 59 | [1] Giorgio Patrini, Alessandro Rozza, Aditya Krishna Menon, Richard Nock, and Lizhen Qu. [Making deep neural networks robust to label noise: A loss correction approach](http://arxiv.org/abs/1609.03683). In CVPR, pages 1944–1952, 2017. 60 |
"""Add asymmetric (class-dependent) label noise to the CIFAR-10 training batches.

WARNING: each batch file is rewritten in place, so running the script twice
compounds the noise.
"""
import pickle
import numpy as np

# r is noise rate
r = 0.1

# (batch file, number of leading samples that may be corrupted). Only the
# first 5000 labels of data_batch_5 are touched.
BATCHES = (
    ('./cifar-10-batches-py/data_batch_1', 10000),
    ('./cifar-10-batches-py/data_batch_2', 10000),
    ('./cifar-10-batches-py/data_batch_3', 10000),
    ('./cifar-10-batches-py/data_batch_4', 10000),
    ('./cifar-10-batches-py/data_batch_5', 5000),
)

# Source label -> noisy label; labels not listed here are never changed.
FLIP = {9: 1, 2: 0, 4: 7, 3: 5, 5: 3}


def unpickle(file):
    """Load and return the pickled batch dictionary stored at *file*."""
    # SECURITY: pickle executes arbitrary code while loading; only use this on
    # batch files you trust.
    with open(file, 'rb') as fo:
        batch = pickle.load(fo, encoding='latin1')
    return batch


def add_asymmetric_noise(labels, rate, count):
    """Corrupt the first *count* entries of *labels* in place.

    One random number is drawn per sample; when it is below *rate* and the
    label is a key of ``FLIP``, the label is replaced by its mapped class.

    Args:
        labels: mutable sequence of integer class labels.
        rate: probability of attempting a flip for each sample.
        count: number of leading samples to process.
    """
    for i in range(count):
        if np.random.random() < rate:
            labels[i] = FLIP.get(labels[i], labels[i])


def main():
    """Apply asymmetric noise to every batch in ``BATCHES`` and save it back."""
    for path, count in BATCHES:
        a = unpickle(path)
        add_asymmetric_noise(a['labels'], r, count)
        with open(path, 'wb') as fo:
            pickle.dump(a, fo)


if __name__ == '__main__':
    main()
"""Add symmetric (uniform) label noise to the CIFAR-10 training batches.

WARNING: each batch file is rewritten in place, so running the script twice
compounds the noise.
"""
import pickle
import numpy as np

# r is noise rate
r = 0.1

# (batch file, number of leading samples that may be corrupted). Only the
# first 5000 labels of data_batch_5 are touched.
BATCHES = (
    ('./cifar-10-batches-py/data_batch_1', 10000),
    ('./cifar-10-batches-py/data_batch_2', 10000),
    ('./cifar-10-batches-py/data_batch_3', 10000),
    ('./cifar-10-batches-py/data_batch_4', 10000),
    ('./cifar-10-batches-py/data_batch_5', 5000),
)


def unpickle(file):
    """Load and return the pickled batch dictionary stored at *file*."""
    # SECURITY: pickle executes arbitrary code while loading; only use this on
    # batch files you trust.
    with open(file, 'rb') as fo:
        batch = pickle.load(fo, encoding='latin1')
    return batch


def add_symmetric_noise(labels, rate, count):
    """Resample the first *count* entries of *labels* in place.

    One random number is drawn per sample; when it is below *rate* the label
    is replaced by a class drawn uniformly from 0..9. The draw may return the
    original class, so the share of labels that actually change is lower than
    the share that is resampled.

    Args:
        labels: mutable sequence of integer class labels.
        rate: probability of resampling each label.
        count: number of leading samples to process.

    Returns:
        The number of labels that were resampled.
    """
    replaced = 0
    for i in range(count):
        if np.random.random() < rate:
            labels[i] = np.random.randint(0, 10)
            replaced += 1
    return replaced


def main():
    """Apply symmetric noise to every batch, save it back, print the total."""
    count = 0
    for path, limit in BATCHES:
        a = unpickle(path)
        count += add_symmetric_noise(a['labels'], r, limit)
        with open(path, 'wb') as fo:
            pickle.dump(a, fo)
    print(count)


if __name__ == '__main__':
    main()