├── .gitignore ├── meta_run.sh ├── test_scripts └── mnist │ ├── coreset_multirun.sh │ ├── random_multirun.sh │ ├── entropy_multirun.sh │ ├── margin_multirun.sh │ ├── uncertain_multirun.sh │ ├── run_random.sh │ ├── run_coreset.sh │ ├── run_prob_margin.sh │ ├── run_prob_entropy.sh │ ├── run_prob_uncertain.sh │ └── run_bald.sh ├── run_inference.sh ├── run_active_learn.sh ├── README.md ├── run_debug.sh ├── utils └── simplify_mnistlike.py ├── dsets └── mnist.py ├── init_pool_tools.py ├── mymodels └── mnist_net.py ├── train_test.py ├── inference.py ├── coreset.py ├── simple_train.py └── active_learn.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | output/ 3 | __pycache__/ 4 | -------------------------------------------------------------------------------- /meta_run.sh: -------------------------------------------------------------------------------- 1 | ./run_active_learn.sh 2 | ./run_active_learn.sh 3 | ./run_active_learn.sh 4 | -------------------------------------------------------------------------------- /test_scripts/mnist/coreset_multirun.sh: -------------------------------------------------------------------------------- 1 | ./run_coreset.sh 2 | ./run_coreset.sh 3 | ./run_coreset.sh 4 | ./run_coreset.sh 5 | ./run_coreset.sh -------------------------------------------------------------------------------- /test_scripts/mnist/random_multirun.sh: -------------------------------------------------------------------------------- 1 | ./run_random.sh 2 | ./run_random.sh 3 | ./run_random.sh 4 | ./run_random.sh 5 | ./run_random.sh 6 | -------------------------------------------------------------------------------- /test_scripts/mnist/entropy_multirun.sh: -------------------------------------------------------------------------------- 1 | ./run_prob_entropy.sh 2 | ./run_prob_entropy.sh 3 | ./run_prob_entropy.sh 4 | ./run_prob_entropy.sh 5 | ./run_prob_entropy.sh -------------------------------------------------------------------------------- /test_scripts/mnist/margin_multirun.sh: -------------------------------------------------------------------------------- 1 | ./run_prob_margin.sh 2 | ./run_prob_margin.sh 3 | ./run_prob_margin.sh 4 | ./run_prob_margin.sh 5 | ./run_prob_margin.sh 6 | 7 | 8 | -------------------------------------------------------------------------------- /test_scripts/mnist/uncertain_multirun.sh: -------------------------------------------------------------------------------- 1 | ./run_prob_uncertain.sh 2 | ./run_prob_uncertain.sh 3 | ./run_prob_uncertain.sh 4 | ./run_prob_uncertain.sh 5 | ./run_prob_uncertain.sh -------------------------------------------------------------------------------- /run_inference.sh: -------------------------------------------------------------------------------- 1 | DROOT=data/mnist_easy 2 | DNAME=mnist 3 | MODEL_FILE=output/mnist/202012111519/init.pth 4 | CUDA_VISIBLE_DEVICES=0 python inference.py \ 5 | --dataset-root $DROOT \ 6 | --dataset-name $DNAME \ 7 | --model-file $MODEL_FILE 8 | -------------------------------------------------------------------------------- /run_active_learn.sh: -------------------------------------------------------------------------------- 1 | EPOCHS=50 2 | LR=0.001 3 | GAMMA=0.1 4 | INIT_SIZE=10 5 | AL_BSIZE=100 6 | SAMPLE_METHOD=prob_uncertain 7 | DROOT=data/mnist_easy 8 | DNAME=mnist 9 | OUT_DIR=output/ 10 | MAX_EPISODES=10 11 | CUDA_VISIBLE_DEVICES=0 python active_learn.py \ 12 | --epochs $EPOCHS --lr $LR \ 13 | --gamma $GAMMA --init-size $INIT_SIZE \ 14 | 
--al-batch-size $AL_BSIZE \ 15 | --sampling-method $SAMPLE_METHOD \ 16 | --dataset-root $DROOT \ 17 | --dataset-name $DNAME \ 18 | --output-dir $OUT_DIR \ 19 | --max-eps $MAX_EPISODES 20 | -------------------------------------------------------------------------------- /test_scripts/mnist/run_random.sh: -------------------------------------------------------------------------------- 1 | EPOCHS=50 2 | LR=0.001 3 | GAMMA=0.1 4 | INIT_SIZE=50 5 | AL_BSIZE=50 6 | SAMPLE_METHOD=random 7 | DROOT=../../data/mnist_easy 8 | DNAME=mnist 9 | OUT_DIR=../../output/ 10 | MAX_EPISODES=20 11 | CUDA_VISIBLE_DEVICES=0 python ../../active_learn.py \ 12 | --epochs $EPOCHS --lr $LR \ 13 | --gamma $GAMMA --init-size $INIT_SIZE \ 14 | --al-batch-size $AL_BSIZE \ 15 | --sampling-method $SAMPLE_METHOD \ 16 | --dataset-root $DROOT \ 17 | --dataset-name $DNAME \ 18 | --output-dir $OUT_DIR \ 19 | --max-eps $MAX_EPISODES 20 | -------------------------------------------------------------------------------- /test_scripts/mnist/run_coreset.sh: -------------------------------------------------------------------------------- 1 | EPOCHS=50 2 | LR=0.001 3 | GAMMA=0.1 4 | INIT_SIZE=50 5 | AL_BSIZE=50 6 | SAMPLE_METHOD=coreset 7 | DROOT=../../data/mnist_easy 8 | DNAME=mnist 9 | OUT_DIR=../../output/ 10 | MAX_EPISODES=20 11 | CUDA_VISIBLE_DEVICES=0 python ../../active_learn.py \ 12 | --epochs $EPOCHS --lr $LR \ 13 | --gamma $GAMMA --init-size $INIT_SIZE \ 14 | --al-batch-size $AL_BSIZE \ 15 | --sampling-method $SAMPLE_METHOD \ 16 | --dataset-root $DROOT \ 17 | --dataset-name $DNAME \ 18 | --output-dir $OUT_DIR \ 19 | --max-eps $MAX_EPISODES 20 | -------------------------------------------------------------------------------- /test_scripts/mnist/run_prob_margin.sh: -------------------------------------------------------------------------------- 1 | EPOCHS=50 2 | LR=0.001 3 | GAMMA=0.1 4 | INIT_SIZE=50 5 | AL_BSIZE=50 6 | SAMPLE_METHOD=prob_margin 7 | DROOT=../../data/mnist_easy 8 | DNAME=mnist 9 | OUT_DIR=../../output/ 10 | MAX_EPISODES=20 11 | CUDA_VISIBLE_DEVICES=0 python ../../active_learn.py \ 12 | --epochs $EPOCHS --lr $LR \ 13 | --gamma $GAMMA --init-size $INIT_SIZE \ 14 | --al-batch-size $AL_BSIZE \ 15 | --sampling-method $SAMPLE_METHOD \ 16 | --dataset-root $DROOT \ 17 | --dataset-name $DNAME \ 18 | --output-dir $OUT_DIR \ 19 | --max-eps $MAX_EPISODES 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Coreset AL 2 | A greedy implementation of coreset based active learning for image classification (https://arxiv.org/abs/1708.00489) 3 | 4 | How to run: 5 | 6 | ``` 7 | python active_learn.py \ 8 | --epochs $EPOCHS --lr $LR \ 9 | --gamma $GAMMA --init-size $INIT_SIZE \ 10 | --al-batch-size $AL_BSIZE \ 11 | --sampling-method $SAMPLE_METHOD \ 12 | --dataset-root $DROOT \ 13 | --dataset-name $DNAME \ 14 | --output-dir $OUT_DIR \ 15 | --max-eps $MAX_EPISODES 16 | ``` 17 | 18 | Check ```run_active_learn.sh``` for more details. 
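
The sampling strategy is the greedy k-center (farthest-first) heuristic from the paper, applied to penultimate-layer features: each step labels the unlabeled point that lies farthest from every point selected so far. Below is a minimal sketch of that idea; the function and variable names are illustrative only (the actual implementation is ```Coreset_Greedy``` in ```coreset.py```):

```python
import numpy as np
from sklearn.metrics import pairwise_distances

def greedy_k_center(features, labeled_idx, budget):
    """Pick `budget` new indices by farthest-first traversal over `features`."""
    # distance from every point to its nearest already-labeled center
    min_dist = pairwise_distances(features, features[labeled_idx]).min(axis=1)
    picked = []
    for _ in range(budget):
        idx = int(np.argmax(min_dist))   # farthest point becomes the next center
        picked.append(idx)
        d_new = pairwise_distances(features, features[idx:idx + 1]).ravel()
        min_dist = np.minimum(min_dist, d_new)   # running minimum, as coreset.py keeps
    return picked
```

```Coreset_Greedy.sample()``` follows the same pattern, maintaining the running minimum-distance array via ```update_dist()```. Random sampling and uncertainty-based baselines (least confidence, margin, entropy, and MC-dropout BALD) are implemented alongside it in ```active_learn.py```.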
19 | -------------------------------------------------------------------------------- /test_scripts/mnist/run_prob_entropy.sh: -------------------------------------------------------------------------------- 1 | EPOCHS=50 2 | LR=0.001 3 | GAMMA=0.1 4 | INIT_SIZE=50 5 | AL_BSIZE=50 6 | SAMPLE_METHOD=prob_entropy 7 | DROOT=../../data/mnist_easy 8 | DNAME=mnist 9 | OUT_DIR=../../output/ 10 | MAX_EPISODES=20 11 | CUDA_VISIBLE_DEVICES=0 python ../../active_learn.py \ 12 | --epochs $EPOCHS --lr $LR \ 13 | --gamma $GAMMA --init-size $INIT_SIZE \ 14 | --al-batch-size $AL_BSIZE \ 15 | --sampling-method $SAMPLE_METHOD \ 16 | --dataset-root $DROOT \ 17 | --dataset-name $DNAME \ 18 | --output-dir $OUT_DIR \ 19 | --max-eps $MAX_EPISODES 20 | -------------------------------------------------------------------------------- /test_scripts/mnist/run_prob_uncertain.sh: -------------------------------------------------------------------------------- 1 | EPOCHS=50 2 | LR=0.001 3 | GAMMA=0.1 4 | INIT_SIZE=50 5 | AL_BSIZE=50 6 | SAMPLE_METHOD=prob_uncertain 7 | DROOT=../../data/mnist_easy 8 | DNAME=mnist 9 | OUT_DIR=../../output/ 10 | MAX_EPISODES=20 11 | CUDA_VISIBLE_DEVICES=0 python ../../active_learn.py \ 12 | --epochs $EPOCHS --lr $LR \ 13 | --gamma $GAMMA --init-size $INIT_SIZE \ 14 | --al-batch-size $AL_BSIZE \ 15 | --sampling-method $SAMPLE_METHOD \ 16 | --dataset-root $DROOT \ 17 | --dataset-name $DNAME \ 18 | --output-dir $OUT_DIR \ 19 | --max-eps $MAX_EPISODES 20 | -------------------------------------------------------------------------------- /test_scripts/mnist/run_bald.sh: -------------------------------------------------------------------------------- 1 | EPOCHS=50 2 | LR=0.001 3 | GAMMA=0.1 4 | INIT_SIZE=50 5 | AL_BSIZE=50 6 | SAMPLE_METHOD=dbal_bald 7 | DROOT=../../data/mnist_easy 8 | DNAME=mnist 9 | OUT_DIR=../../output/ 10 | MAX_EPISODES=20 11 | DROPOUT_ITR=50 12 | CUDA_VISIBLE_DEVICES=0 python ../../active_learn.py \ 13 | --epochs $EPOCHS --lr $LR \ 14 | --gamma $GAMMA --init-size $INIT_SIZE \ 15 | --al-batch-size $AL_BSIZE \ 16 | --sampling-method $SAMPLE_METHOD \ 17 | --dataset-root $DROOT \ 18 | --dataset-name $DNAME \ 19 | --output-dir $OUT_DIR \ 20 | --max-eps $MAX_EPISODES \ 21 | --dropout-iterations $DROPOUT_ITR 22 | -------------------------------------------------------------------------------- /run_debug.sh: -------------------------------------------------------------------------------- 1 | # same as run_active_learn.sh 2 | # but with less epochs training 3 | # mostly for quick debugging. 
4 | EPOCHS=1 5 | LR=0.001 6 | GAMMA=0.1 7 | INIT_SIZE=10 8 | AL_BSIZE=100 9 | SAMPLE_METHOD=dbal_bald 10 | DROOT=data/mnist_easy 11 | DNAME=mnist 12 | OUT_DIR=output/ 13 | MAX_EPISODES=10 14 | DROPOUT_ITR=5 15 | CUDA_VISIBLE_DEVICES=0 python active_learn.py \ 16 | --epochs $EPOCHS --lr $LR \ 17 | --gamma $GAMMA --init-size $INIT_SIZE \ 18 | --al-batch-size $AL_BSIZE \ 19 | --sampling-method $SAMPLE_METHOD \ 20 | --dataset-root $DROOT \ 21 | --dataset-name $DNAME \ 22 | --output-dir $OUT_DIR \ 23 | --max-eps $MAX_EPISODES \ 24 | --dropout-iterations $DROPOUT_ITR 25 | -------------------------------------------------------------------------------- /utils/simplify_mnistlike.py: -------------------------------------------------------------------------------- 1 | import os, shutil 2 | import csv 3 | import pdb 4 | 5 | root_dir = '../data/fmnist_png' 6 | dest_dir = '../data/fmnist_easy' 7 | 8 | train_dir = os.path.join(root_dir,'training') 9 | test_dir = os.path.join(root_dir, 'testing') 10 | 11 | # make directories 12 | if not os.path.exists(dest_dir): 13 | os.mkdir(dest_dir) 14 | os.mkdir(dest_dir + '/train') 15 | os.mkdir(dest_dir + '/test') 16 | 17 | 18 | 19 | 20 | 21 | for source in [train_dir, test_dir]: 22 | if source == train_dir: 23 | csv_name = 'train.csv' 24 | img_dest = os.path.join(dest_dir,'train') 25 | else: 26 | csv_name = 'test.csv' 27 | img_dest = os.path.join(dest_dir,'test') 28 | 29 | anns_map = [] 30 | 31 | for clas in sorted(os.listdir(source)): 32 | clas_dir = os.path.join(source, clas) 33 | 34 | for img in os.listdir(clas_dir): 35 | anns_map.append((img,clas)) 36 | shutil.copy(os.path.join(clas_dir,img), img_dest) 37 | print(img) 38 | 39 | 40 | 41 | csv_file = os.path.join(dest_dir,csv_name) 42 | with open(csv_file, 'w') as f: 43 | csv_out = csv.writer(f) 44 | csv_out.writerows(anns_map) 45 | -------------------------------------------------------------------------------- /dsets/mnist.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import torch 4 | import pandas as pd 5 | from skimage import io, transform 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from torch.utils.data import Dataset, DataLoader 9 | from torchvision import transforms, utils 10 | 11 | import pdb 12 | 13 | class MNIST(Dataset): 14 | 15 | def __init__(self, root_dir, subset, csv_file, transform=None): 16 | self.root_dir = root_dir 17 | self.img_dir = os.path.join(root_dir,'images') 18 | 19 | if '/' not in csv_file: 20 | self.dataframe = pd.read_csv(os.path.join(root_dir,csv_file), header=None) 21 | else: 22 | self.dataframe = pd.read_csv(csv_file, header=None) 23 | self.transform = transform 24 | 25 | self.subset = subset # train or test 26 | 27 | def __len__(self): 28 | return len(self.dataframe) 29 | 30 | def __getitem__(self, idx): 31 | 32 | if torch.is_tensor(idx): 33 | idx = idx.tolist() 34 | 35 | img_name = os.path.join(self.root_dir, self.subset, self.dataframe.iloc[idx,0]) 36 | img_name_small = self.dataframe.iloc[idx, 0] 37 | image = io.imread(img_name) 38 | 39 | label = self.dataframe.iloc[idx,1] 40 | if self.transform: 41 | image = self.transform(image) 42 | 43 | sample = {'image': image, 'label': label, 'img_name': img_name_small} 44 | 45 | return sample 46 | -------------------------------------------------------------------------------- /init_pool_tools.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
print_function, division 2 | import os 3 | import torch 4 | import pandas as pd 5 | from skimage import io, transform 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from torch.utils.data import Dataset, DataLoader 9 | from torchvision import transforms, utils 10 | import torchvision.models as models 11 | import pdb 12 | from datetime import datetime 13 | import argparse 14 | import pprint 15 | 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | import torch.optim as optim 19 | from torch.optim.lr_scheduler import StepLR 20 | 21 | # local stuff 22 | from dsets.mnist import MNIST 23 | from mymodels.mnist_net import Net 24 | from train_test import train, test 25 | 26 | def obtain_init_pool(args): 27 | ''' 28 | Go to the dataset root. Get train.csv 29 | shuffle train.csv and get the first "init_size" samples. 30 | create three new csv files -> init_pool.csv, labeled.csv and unlabeled.csv 31 | ''' 32 | init_pool_size = args.init_size 33 | 34 | train_file = os.path.join(args.dataset_root, 'train.csv') 35 | init_file = os.path.join(args.dataset_root, 'init_pool.csv') 36 | labeled_file = os.path.join(args.dataset_root, 'labeled.csv') 37 | unlabeled_file = os.path.join(args.dataset_root, 'unlabeled.csv') 38 | 39 | train_rows = np.genfromtxt(train_file, delimiter=',', dtype=str) 40 | 41 | np.random.shuffle(train_rows) 42 | 43 | labeled_rows = train_rows[:init_pool_size] 44 | unlabeled_rows = train_rows[init_pool_size:] 45 | 46 | np.savetxt(labeled_file, labeled_rows,'%s,%s',delimiter=',') 47 | np.savetxt(init_file, labeled_rows,'%s,%s',delimiter=',') 48 | np.savetxt(unlabeled_file, unlabeled_rows,'%s,%s',delimiter=',') 49 | 50 | return labeled_file, unlabeled_file 51 | -------------------------------------------------------------------------------- /mymodels/mnist_net.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torchvision import datasets, transforms 8 | from torch.optim.lr_scheduler import StepLR 9 | 10 | 11 | class Net(nn.Module): 12 | def __init__(self): 13 | super(Net, self).__init__() 14 | self.conv1 = nn.Conv2d(1, 32, 3, 1) 15 | self.conv2 = nn.Conv2d(32, 64, 3, 1) 16 | self.dropout1 = nn.Dropout2d(0.25) 17 | self.dropout2 = nn.Dropout2d(0.5) 18 | self.fc1 = nn.Linear(9216, 128) 19 | self.fc2 = nn.Linear(128, 10) 20 | 21 | def get_features(self, x): 22 | x = self.conv1(x) 23 | x = F.relu(x) 24 | x = self.conv2(x) 25 | x = F.max_pool2d(x, 2) 26 | x = self.dropout1(x) 27 | x = torch.flatten(x, 1) 28 | x = self.fc1(x) 29 | return x 30 | 31 | def stochastic_pred(self, x): 32 | # add dropouts everywhere 33 | x = self.conv1(x) 34 | x = F.relu(x) 35 | x = self.dropout1(x) 36 | x = self.conv2(x) 37 | x = F.max_pool2d(x, 2) 38 | x = self.dropout1(x) 39 | x = torch.flatten(x, 1) 40 | x = self.fc1(x) 41 | x = F.relu(x) 42 | x = self.dropout2(x) 43 | x = self.fc2(x) 44 | output = F.log_softmax(x, dim=1) 45 | return output 46 | 47 | def forward(self, x): 48 | x = self.conv1(x) 49 | x = F.relu(x) 50 | x = self.conv2(x) 51 | x = F.max_pool2d(x, 2) 52 | x = self.dropout1(x) 53 | x = torch.flatten(x, 1) 54 | x = self.fc1(x) 55 | x = F.relu(x) 56 | x = self.dropout2(x) 57 | x = self.fc2(x) 58 | output = F.log_softmax(x, dim=1) 59 | return output 60 | -------------------------------------------------------------------------------- /train_test.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import torch 4 | import pandas as pd 5 | from skimage import io, transform 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from torch.utils.data import Dataset, DataLoader 9 | from torchvision import transforms, utils 10 | import torchvision.models as models 11 | import pdb 12 | 13 | from dsets.mnist import MNIST 14 | from mymodels.mnist_net import Net 15 | 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | import torch.optim as optim 19 | from torch.optim.lr_scheduler import StepLR 20 | 21 | 22 | def train(args, model, device, train_loader, optimizer, epoch): 23 | model.train() 24 | for batch_idx, sample in enumerate(train_loader): 25 | data = sample['image'] 26 | target = sample['label'] 27 | 28 | data, target = data.to(device), target.to(device) 29 | # pdb.set_trace() 30 | optimizer.zero_grad() 31 | output = model(data) 32 | loss = F.nll_loss(output, target) 33 | loss.backward() 34 | optimizer.step() 35 | if batch_idx % args.log_interval == 0: 36 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 37 | epoch, batch_idx * len(data), len(train_loader.dataset), 38 | 100. * batch_idx / len(train_loader), loss.item())) 39 | return model 40 | 41 | 42 | def test(args, model, device, test_loader): 43 | model.eval() 44 | test_loss = 0 45 | correct = 0 46 | with torch.no_grad(): 47 | for sample in test_loader: 48 | data = sample['image'] 49 | target = sample['label'] 50 | 51 | data, target = data.to(device), target.to(device) 52 | output = model(data) 53 | test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss 54 | pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability 55 | correct += pred.eq(target.view_as(pred)).sum().item() 56 | 57 | test_loss /= len(test_loader.dataset) 58 | 59 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 60 | test_loss, correct, len(test_loader.dataset), 61 | 100. * correct / len(test_loader.dataset))) 62 | 63 | return 100. 
* correct/len(test_loader.dataset) 64 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import torch 4 | import pandas as pd 5 | from skimage import io, transform 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from torch.utils.data import Dataset, DataLoader 9 | from torchvision import transforms, utils 10 | import torchvision.models as models 11 | import pdb 12 | from datetime import datetime 13 | import argparse 14 | import pprint 15 | 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | import torch.optim as optim 19 | from torch.optim.lr_scheduler import StepLR 20 | 21 | # local stuff 22 | from dsets.mnist import MNIST 23 | from mymodels.mnist_net import Net 24 | from train_test import train, test 25 | 26 | 27 | def argparser(): 28 | parser = argparse.ArgumentParser(description='Active Learning - Image Classification') 29 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 30 | help='input batch size for testing (default: 1000)') 31 | parser.add_argument('--no-cuda', action='store_true', default=False, 32 | help='disables CUDA training') 33 | parser.add_argument('--seed', type=int, default=1, metavar='S', 34 | help='random seed (default: 1)') 35 | parser.add_argument('--dataset-root', default='data/mnist_easy', type=str, 36 | help='root directory of the dataset') 37 | parser.add_argument('--dataset-name', default='mnist', type=str, 38 | help='dataset name') 39 | parser.add_argument('--model-file', default='', type=str, 40 | help='location of the model file') 41 | return parser 42 | 43 | if __name__ == "__main__": 44 | args = argparser().parse_args() 45 | pprint.pprint(args) 46 | use_cuda = not args.no_cuda and torch.cuda.is_available() 47 | torch.manual_seed(args.seed) 48 | device = torch.device("cuda" if use_cuda else "cpu") 49 | kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} 50 | 51 | data_transforms = transforms.Compose([ 52 | transforms.ToTensor(), 53 | transforms.Normalize((0.1307,), (0.3081,)) 54 | ]) 55 | dataset_test = MNIST(args.dataset_root, subset='test', csv_file='test.csv', transform=data_transforms) 56 | test_loader = DataLoader(dataset_test, batch_size=args.test_batch_size, shuffle=False, **kwargs) 57 | 58 | model = Net().to(device) 59 | model.load_state_dict(torch.load(args.model_file)) 60 | 61 | test(args, model, device, test_loader) 62 | -------------------------------------------------------------------------------- /coreset.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import torch 4 | import pandas as pd 5 | from skimage import io, transform 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from torch.utils.data import Dataset, DataLoader 9 | from torchvision import transforms, utils 10 | import torchvision.models as models 11 | import pdb 12 | from datetime import datetime 13 | import argparse 14 | import pprint 15 | import time 16 | 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | import torch.optim as optim 20 | from torch.optim.lr_scheduler import StepLR 21 | 22 | from sklearn.metrics import pairwise_distances 23 | 24 | class Coreset_Greedy: 25 | def __init__(self, all_pts): 26 | self.all_pts = np.array(all_pts) 27 | self.dset_size = len(all_pts) 
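        # min_distances[i] will hold the distance from point i to its nearest
        # already-selected center; it is (re)built lazily by update_dist() below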
28 | self.min_distances = None 29 | self.already_selected = [] 30 | 31 | # reshape 32 | feature_len = self.all_pts[0].shape[1] 33 | self.all_pts = self.all_pts.reshape(-1,feature_len) 34 | 35 | # self.first_time = True 36 | 37 | def update_dist(self, centers, only_new=True, reset_dist=False): 38 | if reset_dist: 39 | self.min_distances = None 40 | if only_new: 41 | centers = [p for p in centers if p not in self.already_selected] 42 | 43 | if centers is not None: 44 | x = self.all_pts[centers] # pick only centers 45 | dist = pairwise_distances(self.all_pts, x, metric='euclidean') 46 | 47 | if self.min_distances is None: 48 | self.min_distances = np.min(dist, axis=1).reshape(-1,1) 49 | else: 50 | self.min_distances = np.minimum(self.min_distances, dist) 51 | 52 | def sample(self, already_selected, sample_size): 53 | 54 | # initially updating the distances 55 | self.update_dist(already_selected, only_new=False, reset_dist=True) 56 | self.already_selected = already_selected 57 | 58 | # epdb.set_trace() 59 | 60 | new_batch = [] 61 | # pdb.set_trace() 62 | for _ in range(sample_size): 63 | if self.already_selected == []: 64 | ind = np.random.choice(np.arange(self.dset_size)) 65 | else: 66 | ind = np.argmax(self.min_distances) 67 | 68 | assert ind not in already_selected 69 | self.update_dist([ind],only_new=True, reset_dist=False) 70 | new_batch.append(ind) 71 | 72 | max_distance = max(self.min_distances) 73 | print("Max distance from cluster : %0.2f" % max_distance) 74 | 75 | return new_batch, max_distance 76 | -------------------------------------------------------------------------------- /simple_train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import torch 4 | import pandas as pd 5 | from skimage import io, transform 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from torch.utils.data import Dataset, DataLoader 9 | from torchvision import transforms, utils 10 | import torchvision.models as models 11 | import pdb 12 | 13 | from dsets.mnist import MNIST 14 | from mymodels.mnist_net import Net 15 | 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | import torch.optim as optim 19 | from torch.optim.lr_scheduler import StepLR 20 | 21 | mnist_root = 'data/mnist_easy' 22 | epochs = 15 23 | 24 | def train(model, device, train_loader, optimizer, epoch): 25 | model.train() 26 | for batch_idx, sample in enumerate(train_loader): 27 | data = sample['image'] 28 | target = sample['label'] 29 | 30 | data, target = data.to(device), target.to(device) 31 | # pdb.set_trace() 32 | optimizer.zero_grad() 33 | output = model(data) 34 | loss = F.nll_loss(output, target) 35 | loss.backward() 36 | optimizer.step() 37 | if batch_idx % 100 == 0: 38 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 39 | epoch, batch_idx * len(data), len(train_loader.dataset), 40 | 100. 
* batch_idx / len(train_loader), loss.item())) 41 | 42 | 43 | def test(model, device, test_loader): 44 | model.eval() 45 | test_loss = 0 46 | correct = 0 47 | with torch.no_grad(): 48 | for sample in test_loader: 49 | data = sample['image'] 50 | target = sample['label'] 51 | 52 | data, target = data.to(device), target.to(device) 53 | output = model(data) 54 | test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss 55 | pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability 56 | correct += pred.eq(target.view_as(pred)).sum().item() 57 | 58 | test_loss /= len(test_loader.dataset) 59 | 60 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 61 | test_loss, correct, len(test_loader.dataset), 62 | 100. * correct / len(test_loader.dataset))) 63 | 64 | 65 | if __name__ == '__main__': 66 | 67 | use_cuda = torch.cuda.is_available() 68 | device = torch.device("cuda" if use_cuda else "cpu") 69 | kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} 70 | 71 | data_transforms = transforms.Compose([ 72 | transforms.ToTensor(), 73 | transforms.Normalize((0.1307,), (0.3081,)) 74 | ]) 75 | mnist_train = MNIST(mnist_root, subset='train', csv_file='train.csv', transform=data_transforms) 76 | mnist_test = MNIST(mnist_root, subset='test', csv_file='test.csv', transform=data_transforms) 77 | 78 | train_loader = DataLoader(mnist_train, batch_size=64, shuffle=True, **kwargs) 79 | test_loader = DataLoader(mnist_test, batch_size=1000, shuffle=False, **kwargs) 80 | 81 | model = Net().to(device) 82 | 83 | optimizer = optim.Adadelta(model.parameters(),lr=1.0) 84 | scheduler = StepLR(optimizer, step_size=1, gamma=0.1) 85 | 86 | for epoch in range(1, epochs + 1): 87 | train( model, device, train_loader, optimizer, epoch) 88 | test(model, device, test_loader) 89 | scheduler.step() 90 | -------------------------------------------------------------------------------- /active_learn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import torch 4 | import pandas as pd 5 | from skimage import io, transform 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from torch.utils.data import Dataset, DataLoader 9 | from torchvision import transforms, utils 10 | import torchvision.models as models 11 | import pdb 12 | from datetime import datetime 13 | import argparse 14 | import pprint 15 | import time 16 | import csv 17 | 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | import torch.optim as optim 21 | from torch.optim.lr_scheduler import StepLR 22 | 23 | # local stuff 24 | from dsets.mnist import MNIST 25 | from mymodels.mnist_net import Net 26 | from train_test import train, test 27 | from init_pool_tools import obtain_init_pool 28 | from coreset import Coreset_Greedy 29 | 30 | 31 | def argparser(): 32 | parser = argparse.ArgumentParser(description='Active Learning - Image Classification') 33 | parser.add_argument('--batch-size', type=int, default=32, metavar='N', 34 | help='input batch size for training (default: 64)') 35 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 36 | help='input batch size for testing (default: 1000)') 37 | 38 | parser.add_argument('--epochs', type=int, default=10, metavar='N', 39 | help='number of epochs to train (default: 10)') 40 | 41 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 42 | help='learning rate (default: 
1.0)')
43 |     parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
44 |         help='Learning rate step gamma (default: 0.7)')
45 |     parser.add_argument('--no-cuda', action='store_true', default=False,
46 |         help='disables CUDA training')
47 |     parser.add_argument('--seed', type=int, default=1, metavar='S',
48 |         help='random seed (default: 1)')
49 |     parser.add_argument('--log-interval', type=int, default=10, metavar='N',
50 |         help='how many batches to wait before logging training status')
51 | 
52 |     parser.add_argument('--al-batch-size', default=500, type=int,
53 |         help='number of samples to add in each iteration')
54 |     parser.add_argument('--init-size', default=1000, type=int,
55 |         help='init pool size')
56 |     parser.add_argument('--sampling-method', default='random', type=str,
57 |         help='one of random, coreset, prob_uncertain, prob_margin, prob_entropy, dbal_bald')
58 |     parser.add_argument('--dataset-root', default='data/mnist_easy', type=str,
59 |         help='root directory of the dataset')
60 |     parser.add_argument('--dataset-name', default='mnist', type=str,
61 |         help='dataset name')
62 |     parser.add_argument('--output-dir', default='output', type=str,
63 |         help='output directory')
64 |     parser.add_argument('--max-eps', type=int, default=10, metavar='N',
65 |         help='max episodes of active learning')
66 |     parser.add_argument('--dropout-iterations', type=int, default=5, metavar='N',
67 |         help='dropout iterations for bald method')
68 |     parser.add_argument('--nclasses', type=int, default=10, metavar='N',
69 |         help='number of classes in the dataset')
70 |     return parser
71 | 
72 | def remove_rows(perm, samp):
73 |     # drop the newly sampled rows (samp) from the unlabeled pool (perm)
74 |     len_perm = len(perm)
75 |     len_samp = len(samp)
76 | 
77 |     perm = perm.tolist()
78 |     samp = samp.tolist()
79 | 
80 |     result = [item for item in perm if item not in samp]
81 | 
82 |     assert len(result) == len_perm - len_samp
83 |     return np.array(result)
84 | 
85 | def get_features(model, loader):  # penultimate-layer (fc1) features, one (1, 128) array per sample
86 |     features = []
87 |     model.eval()
88 | 
89 |     count = 0
90 |     with torch.no_grad():
91 |         for sample in loader:
92 |             data = sample['image']
93 |             target = sample['label']
94 |             img_name = sample['img_name'][0]
95 | 
96 |             data, target = data.to(device), target.to(device)
97 |             output = model.get_features(data)
98 |             # pdb.set_trace()
99 | 
100 |             count += 1
101 |             # if count > 10000:
102 |             # break
103 | 
104 |             features.append(output.cpu().numpy())
105 |             # features.append((img_name, output.cpu().numpy()))
106 |     return features
107 | 
108 | def get_probs(model, loader, stochastic=False):  # per-sample softmax probabilities; stochastic=True keeps dropout active (MC dropout)
109 |     probs = []
110 |     if stochastic:
111 |         model.train()
112 |     else:
113 |         model.eval()
114 | 
115 |     count = 0
116 |     with torch.no_grad():
117 |         for sample in loader:
118 |             data = sample['image']
119 |             target = sample['label']
120 |             img_name = sample['img_name'][0]
121 | 
122 |             data, target = data.to(device), target.to(device)
123 | 
124 |             if stochastic:
125 |                 output = model.stochastic_pred(data)
126 |             else:
127 |                 output = model(data)
128 |             # convert log softmax into softmax outputs
129 |             prob = output.cpu().numpy()
130 |             prob = np.exp(prob[0])  # loaders here use batch_size=1
131 | 
132 |             probs.append(prob)
133 | 
134 |             count += 1
135 | 
136 |     return np.array(probs)
137 | 
138 | def active_sample(args, unlabeled_rows, sample_size, method='random', model=None):
139 |     if method == 'random':
140 |         np.random.shuffle(unlabeled_rows)
141 |         sample_rows = unlabeled_rows[:sample_size]
142 |         return sample_rows
143 | 
144 |     if method == 'prob_uncertain' or method == 'prob_margin' or method == 'prob_entropy':
145 |         # unlabeled loader
146 |         data_transforms = transforms.Compose([
147 |             transforms.ToTensor(),
148 | 
            transforms.Normalize((0.1307,), (0.3081,))
149 |         ])
150 | 
151 |         unlab_dset = MNIST(args.dataset_root, subset='train',csv_file='unlabeled.csv',transform=data_transforms)
152 |         unlab_loader = DataLoader(unlab_dset, batch_size=1, shuffle=False, **kwargs)
153 | 
154 |         probabilities = get_probs(model, unlab_loader)
155 | 
156 |         if method == 'prob_uncertain':
157 |             max_probs = np.max(probabilities, axis=1)
158 | 
159 |             # partial sort: the first sample_size entries are the smallest maximum probabilities
160 |             argsorted_maxprobs = np.argpartition(max_probs, sample_size)
161 |             # i.e. the least confident predictions
162 |             sample_indices = argsorted_maxprobs[:sample_size]
163 | 
164 |         elif method == 'prob_margin':
165 |             # top two probabilities for each sample, in descending order
166 |             top2_sorted = -np.sort(-probabilities, axis=1)[:, :2]
167 |             margins = [x[0]-x[1] for x in top2_sorted]
168 |             margins = np.array(margins)
169 | 
170 |             # pick the samples with the smallest margin (the most ambiguous predictions)
171 |             argsorted_margins = np.argpartition(margins, sample_size)
172 |             sample_indices = argsorted_margins[:sample_size]
173 | 
174 | 
175 |         elif method == 'prob_entropy':
176 |             entropy_arr = (-probabilities*np.log2(probabilities + 1e-12)).sum(axis=1)
177 | 
178 |             # find the ones with the highest entropy
179 |             argsorted_ent = np.argpartition(-entropy_arr, sample_size)
180 |             sample_indices = argsorted_ent[:sample_size]
181 | 
182 |         sample_rows = unlabeled_rows[sample_indices]
183 |         return sample_rows
184 | 
185 |     if method == 'coreset':
186 |         # create unlabeled loader
187 |         data_transforms = transforms.Compose([
188 |             transforms.ToTensor(),
189 |             transforms.Normalize((0.1307,), (0.3081,))
190 |         ])
191 | 
192 |         unlab_dset = MNIST(args.dataset_root, subset='train',csv_file='unlabeled.csv',transform=data_transforms)
193 |         unlab_loader = DataLoader(unlab_dset, batch_size=1, shuffle=False, **kwargs)
194 | 
195 |         # labeled dataloader
196 |         lab_dset = MNIST(args.dataset_root, subset='train',csv_file='labeled.csv',transform=data_transforms)
197 |         lab_loader = DataLoader(lab_dset, batch_size=1, shuffle=False, **kwargs)
198 | 
199 |         # get labeled features
200 |         labeled_features = get_features(model, lab_loader)  # list of (1, d) feature arrays
201 |         # get unlabeled features
202 |         unlabeled_features = get_features(model, unlab_loader)  # list of (1, d) feature arrays
203 | 
204 |         all_features = labeled_features + unlabeled_features
205 |         labeled_indices = np.arange(0,len(labeled_features))
206 | 
207 |         coreset = Coreset_Greedy(all_features)
208 |         new_batch, max_distance = coreset.sample(labeled_indices, sample_size)
209 | 
210 |         # unlabeled rows start after labeled rows in all_features
211 |         # so offset the indices
212 |         new_batch = [i - len(labeled_features) for i in new_batch]
213 | 
214 |         sample_rows = unlabeled_rows[new_batch]
215 | 
216 |         return sample_rows
217 | 
218 |     if method == 'dbal_bald':
219 |         # according to the BALD implementation by Riashat Islam
220 |         # first randomly sample 2000 points
221 |         dropout_pool_size = 2000
222 |         unl_rows = np.copy(unlabeled_rows)
223 | 
224 |         if len(unl_rows) >= dropout_pool_size:
225 |             np.random.shuffle(unl_rows)
226 |             dropout_pool = unl_rows[:dropout_pool_size]
227 |             temp_unlabeled_csv = 'unlabeled_temp.csv'
228 |             np.savetxt(os.path.join(args.dataset_root, temp_unlabeled_csv), dropout_pool,'%s,%s',delimiter=',')
229 |             csv_file = temp_unlabeled_csv
230 |         else:
231 |             dropout_pool = unl_rows
232 |             csv_file = 'unlabeled.csv'
233 | 
234 | 
235 | 
236 |         # create unlabeled loader
237 |         data_transforms = transforms.Compose([
238 |             transforms.ToTensor(),
239 |             transforms.Normalize((0.1307,), (0.3081,))
240 |         ])
241 | 
242 |         unlab_dset = MNIST(args.dataset_root,
subset='train',csv_file=csv_file,transform=data_transforms) 243 | unlab_loader = DataLoader(unlab_dset, batch_size=1, shuffle=False, **kwargs) 244 | 245 | scores_sum = np.zeros(shape=(len(dropout_pool), args.nclasses)) 246 | entropy_sum = np.zeros(shape=(len(dropout_pool))) 247 | 248 | for _ in range(args.dropout_iterations): 249 | probabilities = get_probs(model, unlab_loader, stochastic=True) 250 | 251 | 252 | 253 | entropy = - np.multiply(probabilities, np.log(probabilities)) 254 | entropy = np.sum(entropy, axis=1) 255 | 256 | entropy_sum += entropy 257 | scores_sum += probabilities 258 | 259 | 260 | avg_scores = np.divide(scores_sum, args.dropout_iterations) 261 | entropy_avg_sc = - np.multiply(avg_scores, np.log(avg_scores)) 262 | entropy_avg_sc = np.sum(entropy_avg_sc, axis=1) 263 | 264 | avg_entropy = np.divide(entropy_sum, args.dropout_iterations) 265 | 266 | bald_score = entropy_avg_sc - avg_entropy 267 | 268 | # partial sort 269 | argsorted_bald = np.argpartition(-bald_score, sample_size) 270 | # get the indices 271 | sample_indices = argsorted_bald[:sample_size] 272 | sample_rows = dropout_pool[sample_indices] 273 | 274 | return sample_rows 275 | 276 | 277 | 278 | 279 | 280 | def log(dest_dir, episode_id, sample_method, sample_time, accuracy, labeled_rows): 281 | log_file = os.path.join(dest_dir, 'log.csv') 282 | if not os.path.exists(log_file): 283 | log_rows = [['Episode Id','Sample Method','Sampling Time (s)','Labeled Pool','Accuracy']] 284 | else: 285 | log_rows = np.genfromtxt(log_file, delimiter=',', dtype=str).tolist() 286 | 287 | log_rows.append([episode_id,sample_method, sample_time, len(labeled_rows), accuracy]) 288 | np.savetxt(log_file,log_rows,'%s,%s,%s,%s,%s',delimiter=',') 289 | 290 | def log_picked_samples(dest_dir, samples, ep_id=0): 291 | dest_file = os.path.join(dest_dir, 'picked.txt') 292 | 293 | with open(dest_file, 'a') as f: 294 | writer = csv.writer(f) 295 | writer.writerow(["Episode ID", str(ep_id)]) 296 | for s in samples: 297 | writer.writerow(s.tolist()) 298 | 299 | 300 | 301 | if __name__ == "__main__": 302 | args = argparser().parse_args() 303 | pprint.pprint(args) 304 | 305 | use_cuda = not args.no_cuda and torch.cuda.is_available() 306 | torch.manual_seed(args.seed) 307 | device = torch.device("cuda" if use_cuda else "cpu") 308 | kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} 309 | 310 | # Obtaining init pool 311 | labeled_csv, unlabeled_csv = obtain_init_pool(args) 312 | print("Initial labeled pool created.") 313 | 314 | # initial setup 315 | data_transforms = transforms.Compose([ 316 | transforms.ToTensor(), 317 | transforms.Normalize((0.1307,), (0.3081,)) 318 | ]) 319 | 320 | dataset_train = MNIST(args.dataset_root, subset='train',csv_file='labeled.csv',transform=data_transforms) 321 | dataset_test = MNIST(args.dataset_root, subset='test', csv_file='test.csv', transform=data_transforms) 322 | 323 | # initial training 324 | train_loader = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, **kwargs) 325 | test_loader = DataLoader(dataset_test, batch_size=args.test_batch_size, shuffle=False, **kwargs) 326 | 327 | model = Net().to(device) # initialize the model. 
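    # the seed-pool model trained below is also the model that scores the unlabeled pool in the first acquisition episode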
328 | optimizer = optim.Adam(model.parameters(), lr=args.lr) # setup the optimizer 329 | scheduler = StepLR(optimizer, step_size = 1, gamma=args.gamma) 330 | 331 | for epoch in range(1, args.epochs + 1): 332 | model = train(args, model, device, train_loader, optimizer, epoch) 333 | 334 | scheduler.step() 335 | 336 | accuracy = test(args, model, device, test_loader) 337 | # save model 338 | dest_dir = os.path.join(args.output_dir, args.dataset_name) 339 | 340 | if not os.path.exists(args.output_dir): 341 | os.mkdir(args.output_dir) 342 | if not os.path.exists(dest_dir): 343 | os.mkdir(dest_dir) 344 | 345 | now = datetime.now() 346 | dest_dir_name = str(now.year) + str(now.month) + str(now.day) + str(now.hour) + str(now.minute) + str(now.second) 347 | dest_dir_name = os.path.join(dest_dir, dest_dir_name) 348 | if not os.path.exists(dest_dir_name): 349 | os.mkdir(dest_dir_name) 350 | save_path = os.path.join(dest_dir_name,'init.pth') 351 | torch.save(model.state_dict(), save_path) 352 | print("initial pool model saved in: ",save_path) 353 | 354 | 355 | 356 | # copy labeled csv and unlabeled csv to dest_dir 357 | # pdb.set_trace() 358 | 359 | # save config 360 | with open(dest_dir_name + '/config.json', 'w') as f: 361 | import json 362 | json.dump(vars(args),f) 363 | # save logs 364 | 365 | # pdb.set_trace() 366 | log(dest_dir_name, 0, args.sampling_method, 0, accuracy, [0]*args.init_size) 367 | log_picked_samples(dest_dir_name, np.genfromtxt(labeled_csv, delimiter=',', dtype=str)) 368 | 369 | 370 | # start the active learning loop. 371 | episode_id = 1 372 | while True: 373 | 374 | if episode_id > args.max_eps: 375 | break 376 | 377 | 378 | # read the unlabeled file 379 | unlabeled_rows = np.genfromtxt(unlabeled_csv, delimiter=',', dtype=str) 380 | labeled_rows = np.genfromtxt(labeled_csv, delimiter=',', dtype=str) 381 | 382 | print("Episode #",episode_id) 383 | 384 | 385 | # sanity checks 386 | if len(unlabeled_rows) == 0: 387 | break 388 | 389 | # set the sample size 390 | sample_size = args.al_batch_size 391 | if len(unlabeled_rows) < sample_size: 392 | sample_size = len(unlabeled_rows) 393 | 394 | # sample 395 | sample_start = time.time() 396 | sample_rows = active_sample(args, unlabeled_rows, sample_size, method=args.sampling_method, model=model) 397 | sample_end = time.time() 398 | 399 | # log picked samples 400 | log_picked_samples(dest_dir_name, sample_rows, episode_id) 401 | 402 | 403 | 404 | sample_time = sample_end - sample_start 405 | 406 | # update the labeled pool 407 | labeled_rows = np.concatenate((labeled_rows,sample_rows),axis=0) 408 | np.savetxt(labeled_csv, labeled_rows,'%s,%s',delimiter=',') 409 | 410 | 411 | # update the unlabeled pool 412 | unlabeled_rows = remove_rows(unlabeled_rows, sample_rows) 413 | np.savetxt(unlabeled_csv, unlabeled_rows, '%s,%s', delimiter=',') 414 | 415 | print("Unlabeled pool size: ",len(unlabeled_rows)) 416 | print("Labeled pool size: ",len(labeled_rows)) 417 | 418 | 419 | #train the model 420 | dataset_train = MNIST(args.dataset_root, subset='train',csv_file='labeled.csv',transform=data_transforms) 421 | train_loader = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, **kwargs) 422 | 423 | model = Net().to(device) # initialize the model. 
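        # note: each episode retrains a fresh Net from scratch on the enlarged labeled pool; the previous episode's weights are only used above to score and sample the unlabeled data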
424 | optimizer = optim.Adam(model.parameters(), lr=args.lr) # setup the optimizer 425 | # scheduler = StepLR(optimizer, step_size = 1, gamma=args.gamma) 426 | 427 | for epoch in range(1, args.epochs + 1): 428 | model = train(args, model, device, train_loader, optimizer, epoch) 429 | accuracy = test(args, model, device, test_loader) 430 | # scheduler.step() 431 | 432 | # save model 433 | save_path = os.path.join(dest_dir_name, 'ep_'+str(episode_id)+'.pth') 434 | torch.save(model.state_dict(), save_path) 435 | 436 | log(dest_dir_name, episode_id, args.sampling_method, sample_time, accuracy, labeled_rows) 437 | 438 | episode_id += 1 439 | --------------------------------------------------------------------------------