├── .gitignore
├── Deep_feature_extractors
│   ├── extractor.py
│   ├── googlenet.py
│   ├── resnet18.py
│   └── vgg19.py
├── Genetic_Algorithm
│   ├── GA.py
│   └── GA_functions.py
├── LICENSE
├── README.md
├── main.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/Deep_feature_extractors/extractor.py:
--------------------------------------------------------------------------------
from Deep_feature_extractors import googlenet, resnet18, vgg19
import numpy as np

def feature_extractor(folder_path, ext, out_classes):

    # Dispatching to the chosen backbone; each returns a training and a validation
    # dataframe of deep features with the labels appended as the final column
    if ext == 'googlenet':
        df5, df6 = googlenet.model(folder_path, out_classes)
    elif ext == 'vgg':
        df5, df6 = vgg19.model(folder_path, out_classes)
    else:
        df5, df6 = resnet18.model(folder_path, out_classes)

    array_train = np.asarray(df5)
    array_val = np.asarray(df6)
    return array_train, array_val
--------------------------------------------------------------------------------
/Deep_feature_extractors/googlenet.py:
--------------------------------------------------------------------------------
import torch
import torchvision
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torchvision.transforms as transf
import numpy as np
import pandas as pd
import torch.nn as nn
import time


def model(folder_path, out_classes):

    # Configuring the device

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Hyperparameters

    epoch_no = 5
    l_rate = 0.0001
    batch_size_tr = 50
    batch_size_val = 30

    # Transforming the data

    train_transform = transf.Compose([
        transf.Resize((224, 224)),
        transf.ToTensor()
    ])

    val_transform = transf.Compose([
        transf.Resize((224, 224)),
        transf.ToTensor()
    ])

    # Loading the dataset

    train_ds = ImageFolder(folder_path + '/train', transform=train_transform)
    val_ds = ImageFolder(folder_path + '/val', transform=val_transform)
    train_loader = DataLoader(train_ds, batch_size=batch_size_tr, shuffle=True, num_workers=2, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size_val, shuffle=True, num_workers=2, drop_last=True)

    # Specifying the reference model

    old_model = torchvision.models.googlenet(pretrained=True)

    # Changing the model for deep feature extraction

    class whole_cnn(nn.Module):
        def __init__(self):
            super(whole_cnn, self).__init__()

            # Removing the final fully-connected layer of GoogLeNet; the deep features are extracted just before it

            self.remv_linear = torch.nn.Sequential(*(list(old_model.children())[:-1]))

            # Re-attaching a fresh linear layer on top of the 1024-d deep features to track the change in accuracy
            self.flatten = nn.Flatten()
            self.add_linear = torch.nn.Sequential(nn.Linear(1024, out_classes, bias=True))

        def forward(self, x):
            output = self.remv_linear(x)
            output = self.flatten(output)
            x_deep = output
            output_new = self.add_linear(output)
            return x_deep, output_new

    # Specifying the model, the loss criterion and the optimizer

    model = whole_cnn()
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=l_rate)

    # Training and validating with our CNN model

    def train_model(model, criterion, optim, epoch_no):
        best_deep_featr_train = []
        best_labels_train = []
        best_deep_featr_val = []
        best_labels_val = []
        since = time.time()
        best_acc = 0.0
        for epoch in range(epoch_no):
            # One batch worth of zero-filled placeholder rows; they are sliced off again before the dataframes are built
            train_features = np.zeros((50, 1024))
            train_labels = []
            running_loss = 0.0
            running_acc = 0.0
            model.train()
            for images, labels in train_loader:
                images = images.to(device)
                labels = labels.to(device)
                with torch.set_grad_enabled(True):
                    deep_featr, outputs = model(images)
                    # Collecting the deep features and labels of every training batch
                    train_features = np.append(train_features, deep_featr.detach().cpu().numpy(), axis=0)
                    train_labels = np.append(train_labels, labels.cpu().detach().numpy(), axis=0)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optim.step()
                    optim.zero_grad()

                # Accumulating the running statistics

                running_loss += loss.item()*batch_size_tr
                running_acc += torch.sum(preds == labels)

            # Validating once per epoch and printing all statistics

            running_val_loss, running_val_acc, val_features, val_labels = model_val(model, criterion, optim)
            epoch_train_loss = running_loss/len(train_ds)
            epoch_train_acc = running_acc.double()/len(train_ds)
            print("Epoch: {}".format(epoch+1))
            print('-'*10)
            print('Train Loss: {:.4f} Train Acc: {:.4f}'.format(epoch_train_loss, epoch_train_acc))
            epoch_val_loss = running_val_loss/len(val_ds)
            epoch_val_acc = running_val_acc.double()/len(val_ds)
            print('Val Loss: {:.4f} Val Acc: {:.4f}'.format(epoch_val_loss, epoch_val_acc))
            print()
            # Keeping the features extracted in the epoch with the best validation accuracy
            if epoch_val_acc > best_acc:
                best_acc = epoch_val_acc
                best_deep_featr_train = train_features
                best_labels_train = train_labels
                best_deep_featr_val = val_features
                best_labels_val = val_labels

        # Printing the time elapsed and the best validation accuracy

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
        print("Best model has validation accuracy: {}".format(best_acc))
        return best_deep_featr_train, best_labels_train, best_deep_featr_val, best_labels_val


    def model_val(model, criterion, optim):
        model.eval()
        running_val_loss = 0.0
        running_val_acc = 0.0
        # One batch worth of zero-filled placeholder rows, sliced off later
        val_features = np.zeros((30, 1024))
        val_labels = []
        with torch.no_grad():  # gradients are not needed during validation
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)
                deep_featr, outputs = model(images)
                val_features = np.append(val_features, deep_featr.detach().cpu().numpy(), axis=0)
                val_labels = np.append(val_labels, labels.cpu().detach().numpy(), axis=0)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item()*batch_size_val
                running_val_acc += torch.sum(preds == labels)
        return running_val_loss, running_val_acc, val_features, val_labels

    # Calling the function to train our CNN model

    best_deep_featr_train, best_labels_train, best_deep_featr_val, best_labels_val = train_model(model, criterion, optim, epoch_no)

    # Creating the dataframes of extracted deep features (dropping the zero placeholder rows)

    df1 = pd.DataFrame(best_deep_featr_train[50:, :])
    df2 = pd.DataFrame(best_labels_train)
    df3 = pd.DataFrame(best_deep_featr_val[30:, :])
    df4 = pd.DataFrame(best_labels_val)
    print(df1.shape, df2.shape, df3.shape, df4.shape)
    # Appending the labels as the final column of each feature matrix
    df5 = pd.concat([df1, df2], axis=1)
    df6 = pd.concat([df3, df4], axis=1)

    return df5, df6
--------------------------------------------------------------------------------
/Deep_feature_extractors/resnet18.py:
--------------------------------------------------------------------------------
import torch
import torchvision
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torchvision.transforms as transf
import numpy as np
import pandas as pd
import torch.nn as nn
import time


def model(folder_path, out_classes):

    # Configuring the device

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Hyperparameters

    epoch_no = 5
    l_rate = 0.0001
    batch_size_tr = 50
    batch_size_val = 30

    # Transforming the data

    train_transform = transf.Compose([
        transf.Resize((224, 224)),
        transf.ToTensor()
    ])

    val_transform = transf.Compose([
        transf.Resize((224, 224)),
        transf.ToTensor()
    ])

    # Loading the dataset

    train_ds = ImageFolder(folder_path + '/train', transform=train_transform)
    val_ds = ImageFolder(folder_path + '/val', transform=val_transform)
    train_loader = DataLoader(train_ds, batch_size=batch_size_tr, shuffle=True, num_workers=2, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size_val, shuffle=True, num_workers=2, drop_last=True)

    # Specifying the reference model

    old_model = torchvision.models.resnet18(pretrained=True)

    # Changing the model for deep feature extraction

    class whole_cnn(nn.Module):
        def __init__(self):
            super(whole_cnn, self).__init__()

            # Removing the final fully-connected layer of ResNet-18; the deep features are extracted just before it

            self.remv_linear = torch.nn.Sequential(*(list(old_model.children())[:-1]))

            # Re-attaching a fresh linear layer on top of the 512-d deep features to track the change in accuracy

            self.flatten = nn.Flatten()
            self.add_linear = torch.nn.Sequential(nn.Linear(512, out_classes, bias=True))

        def forward(self, x):
            output = self.remv_linear(x)
            output = self.flatten(output)
            x_deep = output
            output_new = self.add_linear(output)
            return x_deep, output_new

    # Specifying the model, the loss criterion and the optimizer

    model = whole_cnn()
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=l_rate)

    # Training and validating with our CNN model

    def train_model(model, criterion, optim, epoch_no):
        best_deep_featr_train = []
        best_labels_train = []
        best_deep_featr_val = []
        best_labels_val = []
        since = time.time()
        best_acc = 0.0
        for epoch in range(epoch_no):
            # One batch worth of zero-filled placeholder rows; they are sliced off again before the dataframes are built
            train_features = np.zeros((50, 512))
            train_labels = []
            running_loss = 0.0
            running_acc = 0.0
            model.train()
            for images, labels in train_loader:
                images = images.to(device)
                labels = labels.to(device)
                with torch.set_grad_enabled(True):
                    deep_featr, outputs = model(images)
                    # Collecting the deep features and labels of every training batch
                    train_features = np.append(train_features, deep_featr.detach().cpu().numpy(), axis=0)
                    train_labels = np.append(train_labels, labels.cpu().detach().numpy(), axis=0)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optim.step()
                    optim.zero_grad()

                # Accumulating the running statistics

                running_loss += loss.item()*batch_size_tr
                running_acc += torch.sum(preds == labels)

            # Validating once per epoch and printing all statistics

            running_val_loss, running_val_acc, val_features, val_labels = model_val(model, criterion, optim)
            epoch_train_loss = running_loss/len(train_ds)
            epoch_train_acc = running_acc.double()/len(train_ds)
            print("Epoch: {}".format(epoch+1))
            print('-'*10)
            print('Train Loss: {:.4f} Train Acc: {:.4f}'.format(epoch_train_loss, epoch_train_acc))
            epoch_val_loss = running_val_loss/len(val_ds)
            epoch_val_acc = running_val_acc.double()/len(val_ds)
            print('Val Loss: {:.4f} Val Acc: {:.4f}'.format(epoch_val_loss, epoch_val_acc))
            print()
            # Keeping the features extracted in the epoch with the best validation accuracy
            if epoch_val_acc > best_acc:
                best_acc = epoch_val_acc
                best_deep_featr_train = train_features
                best_labels_train = train_labels
                best_deep_featr_val = val_features
                best_labels_val = val_labels

        # Printing the time elapsed and the best validation accuracy

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
        print("Best model has validation accuracy: {}".format(best_acc))
        return best_deep_featr_train, best_labels_train, best_deep_featr_val, best_labels_val


    def model_val(model, criterion, optim):
        model.eval()
        running_val_loss = 0.0
        running_val_acc = 0.0
        # One batch worth of zero-filled placeholder rows, sliced off later
        val_features = np.zeros((30, 512))
        val_labels = []
        with torch.no_grad():  # gradients are not needed during validation
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)
                deep_featr, outputs = model(images)
                val_features = np.append(val_features, deep_featr.detach().cpu().numpy(), axis=0)
                val_labels = np.append(val_labels, labels.cpu().detach().numpy(), axis=0)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item()*batch_size_val
                running_val_acc += torch.sum(preds == labels)
        return running_val_loss, running_val_acc, val_features, val_labels

    # Calling the function to train our CNN model

    best_deep_featr_train, best_labels_train, best_deep_featr_val, best_labels_val = train_model(model, criterion, optim, epoch_no)

    # Creating the dataframes of extracted deep features (dropping the zero placeholder rows)

    df1 = pd.DataFrame(best_deep_featr_train[50:, :])
    df2 = pd.DataFrame(best_labels_train)
    df3 = pd.DataFrame(best_deep_featr_val[30:, :])
    df4 = pd.DataFrame(best_labels_val)
    print(df1.shape, df2.shape, df3.shape, df4.shape)
    # Appending the labels as the final column of each feature matrix
    df5 = pd.concat([df1, df2], axis=1)
    df6 = pd.concat([df3, df4], axis=1)

    return df5, df6
--------------------------------------------------------------------------------
/Deep_feature_extractors/vgg19.py:
--------------------------------------------------------------------------------
import torch
import torchvision
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torchvision.transforms as transf
import numpy as np
import pandas as pd
import torch.nn as nn
import time


def model(folder_path, out_classes):

    # Configuring the device

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Hyperparameters

    epoch_no = 5
    l_rate = 0.0001
    batch_size_tr = 50
    batch_size_val = 30

    # Transforming the data

    train_transform = transf.Compose([
        transf.Resize((224, 224)),
        transf.ToTensor()
    ])

    val_transform = transf.Compose([
        transf.Resize((224, 224)),
        transf.ToTensor()
    ])

    # Loading the dataset

    train_ds = ImageFolder(folder_path + '/train', transform=train_transform)
    val_ds = ImageFolder(folder_path + '/val', transform=val_transform)
    train_loader = DataLoader(train_ds, batch_size=batch_size_tr, shuffle=True, num_workers=2, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size_val, shuffle=True, num_workers=2, drop_last=True)

    # Specifying the reference model

    old_model = torchvision.models.vgg19(pretrained=True)

    # Changing the model for deep feature extraction

    class whole_cnn(nn.Module):
        def __init__(self):
            super(whole_cnn, self).__init__()

            # Removing the classifier block of VGG-19; the deep features are extracted just before it

            self.remv_classifier = torch.nn.Sequential(*(list(old_model.children())[:-1]))

            # Re-attaching a fresh classifier on top of the 25088-d deep features to track the change in accuracy

            self.flatten = nn.Flatten()
            self.add_classifier = torch.nn.Sequential(nn.Linear(25088, 4096, bias=True),
                                                      nn.ReLU(),
                                                      nn.Dropout(0.5),
                                                      nn.Linear(4096, 4096, bias=True),
                                                      nn.ReLU(),
                                                      nn.Dropout(0.5),
                                                      nn.Linear(4096, out_classes))

        def forward(self, x):
            output = self.remv_classifier(x)
            output = self.flatten(output)
            x_deep = output
            output_new = self.add_classifier(output)
            return x_deep, output_new

    # Specifying the model, the loss criterion and the optimizer

    model = whole_cnn()
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=l_rate)

    # Training and validating with our CNN model

    def train_model(model, criterion, optim, epoch_no):
        best_deep_featr_train = []
        best_labels_train = []
        best_deep_featr_val = []
        best_labels_val = []
        since = time.time()
        best_acc = 0.0
        for epoch in range(epoch_no):
            # One batch worth of zero-filled placeholder rows; they are sliced off again before the dataframes are built
            train_features = np.zeros((50, 25088))
            train_labels = []
            running_loss = 0.0
            running_acc = 0.0
            model.train()
            for images, labels in train_loader:
                images = images.to(device)
                labels = labels.to(device)
                with torch.set_grad_enabled(True):
                    deep_featr, outputs = model(images)
                    # Collecting the deep features and labels of every training batch
                    train_features = np.append(train_features, deep_featr.detach().cpu().numpy(), axis=0)
                    train_labels = np.append(train_labels, labels.cpu().detach().numpy(), axis=0)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optim.step()
                    optim.zero_grad()

                # Accumulating the running statistics

                running_loss += loss.item()*batch_size_tr
                running_acc += torch.sum(preds == labels)

            # Validating once per epoch and printing all statistics

            running_val_loss, running_val_acc, val_features, val_labels = model_val(model, criterion, optim)
            epoch_train_loss = running_loss/len(train_ds)
            epoch_train_acc = running_acc.double()/len(train_ds)
            print("Epoch: {}".format(epoch+1))
            print('-'*10)
            print('Train Loss: {:.4f} Train Acc: {:.4f}'.format(epoch_train_loss, epoch_train_acc))
            epoch_val_loss = running_val_loss/len(val_ds)
            epoch_val_acc = running_val_acc.double()/len(val_ds)
            print('Val Loss: {:.4f} Val Acc: {:.4f}'.format(epoch_val_loss, epoch_val_acc))
            print()
            # Keeping the features extracted in the epoch with the best validation accuracy
            if epoch_val_acc > best_acc:
                best_acc = epoch_val_acc
                best_deep_featr_train = train_features
                best_labels_train = train_labels
                best_deep_featr_val = val_features
                best_labels_val = val_labels

        # Printing the time elapsed and the best validation accuracy

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
        print("Best model has validation accuracy: {}".format(best_acc))
        return best_deep_featr_train, best_labels_train, best_deep_featr_val, best_labels_val


    def model_val(model, criterion, optim):
        model.eval()
        running_val_loss = 0.0
        running_val_acc = 0.0
        # One batch worth of zero-filled placeholder rows, sliced off later
        val_features = np.zeros((30, 25088))
        val_labels = []
        with torch.no_grad():  # gradients are not needed during validation
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)
                deep_featr, outputs = model(images)
                val_features = np.append(val_features, deep_featr.detach().cpu().numpy(), axis=0)
                val_labels = np.append(val_labels, labels.cpu().detach().numpy(), axis=0)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item()*batch_size_val
                running_val_acc += torch.sum(preds == labels)
        return running_val_loss, running_val_acc, val_features, val_labels

    # Calling the function to train our CNN model

    best_deep_featr_train, best_labels_train, best_deep_featr_val, best_labels_val = train_model(model, criterion, optim, epoch_no)

    # Creating the dataframes of extracted deep features (dropping the zero placeholder rows)

    df1 = pd.DataFrame(best_deep_featr_train[50:, :])
    df2 = pd.DataFrame(best_labels_train)
    df3 = pd.DataFrame(best_deep_featr_val[30:, :])
    df4 = pd.DataFrame(best_labels_val)
    print(df1.shape, df2.shape, df3.shape, df4.shape)
    # Appending the labels as the final column of each feature matrix
    df5 = pd.concat([df1, df2], axis=1)
    df6 = pd.concat([df3, df4], axis=1)

    return df5, df6
--------------------------------------------------------------------------------
/Genetic_Algorithm/GA.py:
--------------------------------------------------------------------------------
from Genetic_Algorithm import GA_functions
import numpy as np
import matplotlib.pyplot as plt


def algorithm(array_train, array_val, classif):

    # Loading the necessary variables for applying GA

    pop_size = 50                            # Population size
    num_parents_mating = int(pop_size*0.5)   # Number of parents inside the mating pool
    num_mutations = 3                        # Number of genes to mutate
    num_generations = 10                     # Number of generations
    classifier = classif

    num_feature_elements_train = array_train.shape[1]-1   # Excluding the final label column


    # Applying the Genetic Algorithm (GA)

    # Defining the population shape.
    pop_shape = (pop_size, num_feature_elements_train)

    # Creating the initial population (each gene is 0 or 1: feature excluded or included).
    new_population = np.random.randint(low=0, high=2, size=pop_shape)
    print(new_population.shape)

    best_outputs = []
    best_solution = None
    best_fitness = -1.0
    for generation in range(num_generations):
        print("Generation : ", (generation+1))
        # Measuring the fitness of each chromosome in the population.
        fitness = GA_functions.cal_pop_fitness(new_population, array_train, array_val, classifier)

        best_outputs.append(np.max(fitness))
        # The best result in the current iteration.
        print("Best result : ", best_outputs[-1])

        # Keeping a copy of the best chromosome seen so far (the population is overwritten each generation).
        if best_outputs[-1] > best_fitness:
            best_fitness = best_outputs[-1]
            best_solution = new_population[np.argmax(fitness), :].copy()

        # Selecting the best parents in the population for mating.
        parents = GA_functions.select_mating_pool(new_population, fitness, num_parents_mating)

        # Generating the next generation using crossover.
        crossed_offsprings = GA_functions.crossover(parents, offspring_size=(pop_shape[0]-parents.shape[0], num_feature_elements_train))

        # Adding some variation to the offspring using mutation.
        mutated_offsprings = GA_functions.mutation(crossed_offsprings, num_mutations)

        # Creating the new population from the parents and the offspring.
        new_population[:parents.shape[0], :] = parents
        new_population[parents.shape[0]:, :] = mutated_offsprings


    # Getting the best solution after finishing all generations.

    best_acc = best_fitness*100.0
    best_solution_indices = np.where(best_solution == 1)[0]
    best_solution_num_elements = best_solution_indices.shape[0]

    # Printing the required statistics

    print("The accuracy of the best candidate solution is {:.4f}".format(best_acc))
    print("Selected feature indices by GA : ", best_solution_indices)
    print("Number of selected features by GA : ", best_solution_num_elements)

    # Plotting the 'Accuracy' vs 'Generation' curve

    plt.plot(range(num_generations), best_outputs, 'b')
    plt.xlabel('Generations')
    plt.ylabel('Accuracy')
    plt.show()
--------------------------------------------------------------------------------
/Genetic_Algorithm/GA_functions.py:
--------------------------------------------------------------------------------
import numpy as np

# Defining the necessary functions for GA


# Feature reduction to eliminate the redundant features

def reduce_features(solution, array_train, array_val):
    # Appending a 1 so that the final (label) column is always kept
    solution = np.append(solution, [1])
    selected_elements_indices = np.where(solution == 1)[0]  # Selecting the elements whose genes have a value of 1
    reduced_train_features = array_train[:, selected_elements_indices]
    reduced_val_features = array_val[:, selected_elements_indices]
    return reduced_train_features, reduced_val_features


# Calculating and returning the classification accuracy

def classification_accuracy(labels, val_predictions):
    correct = np.where(labels == val_predictions)
    accuracy = correct[0].shape[0]/val_predictions.shape[0]
    return accuracy


# Calculating the population fitness using the SVM, KNN or MLP classifier
def cal_pop_fitness(pop, array_train, array_val, classifier):
    accuracies = np.zeros(pop.shape[0])
    idx = 0  # Counter variable for filling the accuracies array

    for curr_solution in pop:  # curr_solution is one chromosome; pop is the whole population
        reduced_train_features, reduced_val_features = reduce_features(curr_solution, array_train, array_val)
        X = reduced_train_features[:, :-1]  # Taking all the feature columns
        y = reduced_train_features[:, -1]   # Taking the label column
        reduced_validation_features = reduced_val_features[:, :-1]
        validation_labels = reduced_val_features[:, -1]

        # Choosing the classifier

        if classifier == 'SVM':
            ## SVM CLASSIFIER ##
            from sklearn.svm import SVC
            SVM_classifier = SVC(kernel='rbf')
            SVM_classifier.fit(X, y)
            val_predictions = SVM_classifier.predict(reduced_validation_features)

        elif classifier == 'MLP':
            ## MLP CLASSIFIER ##
            from sklearn.neural_network import MLPClassifier
            MLP_classifier = MLPClassifier()
            MLP_classifier.fit(X, y)
            val_predictions = MLP_classifier.predict(reduced_validation_features)

        else:
            ## KNN CLASSIFIER ##
            from sklearn.neighbors import KNeighborsClassifier
            KNN_classifier = KNeighborsClassifier(n_neighbors=2)
            KNN_classifier.fit(X, y)
            val_predictions = KNN_classifier.predict(reduced_validation_features)

        accuracies[idx] = classification_accuracy(validation_labels, val_predictions)
        idx = idx + 1
    return accuracies


# Selecting the best individuals in the current generation as parents for producing the offspring of the next generation

def select_mating_pool(pop, fitness, num_parents):
    parents = np.empty((num_parents, pop.shape[1]))
    for parent_num in range(num_parents):
        max_fitness_index = np.where(fitness == np.max(fitness))[0]
        max_fitness_index = max_fitness_index[0]
        parents[parent_num, :] = pop[max_fitness_index, :]
        # Marking the chosen individual so that it is not selected again
        fitness[max_fitness_index] = -99999999999
    return parents


# Applying one-point crossover

def crossover(parents, offspring_size):
    crossed_offsprings = np.empty(offspring_size)
    # The point at which crossover takes place between two parents. Usually, it is at the center.
    crossover_point = offspring_size[1]//2

    for k in range(offspring_size[0]):
        # Index of the first parent to mate.
        parent1_index = k%parents.shape[0]

        # Index of the second parent to mate.
        parent2_index = (k+1)%parents.shape[0]

        # The new offspring takes the first half of its genes from the first parent.
        crossed_offsprings[k, 0:crossover_point] = parents[parent1_index, 0:crossover_point]

        # The new offspring takes the second half of its genes from the second parent.
        crossed_offsprings[k, crossover_point:] = parents[parent2_index, crossover_point:]
    return crossed_offsprings


# Performing mutation of randomly selected genes by flipping their values

def mutation(crossed_offsprings, num_mutations):
    # Choosing the gene positions to mutate (the same positions are used for every offspring in this call)
    mutation_index = np.random.randint(low=0, high=crossed_offsprings.shape[1], size=num_mutations)
    for index in range(crossed_offsprings.shape[0]):
        # Flipping the selected genes (0 -> 1, 1 -> 0).
        crossed_offsprings[index, mutation_index] = 1 - crossed_offsprings[index, mutation_index]
    return crossed_offsprings
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Agnish Bhattacharya

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Deep-feature-extraction-from-CNNs-followed-by-optimization-with-GA

## Project Description
This is a Python-based project that classifies breast tumor tissue as benign or malignant through deep analysis of histopathological image samples from the popular `BreakHis` dataset. One of three popular `pre-trained` and `fine-tuned` Convolutional Neural Networks (`GoogLeNet`, `ResNet-18` or `VGG-19`) is applied at a time, after which the deep features are extracted from the CNN's `pre-final` layer and optimized (redundant features removed) using the Genetic Algorithm (`GA`) for improved accuracy. The images are finally classified using any one of the `SVM`, `KNN` or `MLP` classifiers.

## Dataset description
The Breast Cancer Histopathological Image Classification dataset (BreakHis) is composed of 9,109 microscopic images of breast tumor tissue collected from 82 patients using different magnifying factors (40X, 100X, 200X, and 400X). To date, it contains 2,480 benign and 5,429 malignant samples (700×460 pixels, 3-channel RGB, 8-bit depth in each channel, PNG format). This database has been built in collaboration with the P&D Laboratory – Pathological Anatomy and Cytopathology, Parana, Brazil (http://www.prevencaoediagnose.com.br).
The dataset is available at:
https://www.kaggle.com/ambarish/breakhis

## Classes of Division
In this project, we have used histopathological image samples of human breast tissue, which have been classified into two categories, namely:
- `Benign tissue`
- `Malignant tissue`

## Convolutional Neural Network models used
Three CNN models have been applied to the dataset, namely:
- `GoogLeNet`
- `ResNet-18`
- `Visual Geometry Group (VGG-19)`

## Classifier models used inside the Genetic Algorithm (GA)
Three classifier models have been used, namely:
- `Support Vector Machines (SVM) (RBF kernel)`
- `K-Nearest Neighbors (KNN) (K=2 used)`
- `Multi-Layer Perceptron (MLP)`

## 'Accuracy' vs 'Generation' plots
The following plots show how the accuracy on our validation dataset varies with the number of generations of the Genetic Algorithm (GA) for each of the three Convolutional Neural Networks. The `KNN` classifier has been used in every case.
No. of Epochs: `5`, No. of Generations: `10`
- GoogLeNet using KNN classifier
![image](https://user-images.githubusercontent.com/84792746/154838869-35486f58-74c8-46b3-9b15-75333ddd2eef.png)
- ResNet-18 using KNN classifier
![image](https://user-images.githubusercontent.com/84792746/154838818-d415193f-1736-4272-a9c3-90ce013bfc37.png)
- VGG-19 using KNN classifier
![image](https://user-images.githubusercontent.com/84792746/154838834-71b65cea-cc2b-4527-b6ad-0d01e2c532c5.png)

## Flow diagram of the Genetic Algorithm


![image](https://user-images.githubusercontent.com/84792746/154852688-200dd978-ec4a-47c1-b073-0d3249e3d89c.png)


## Dependencies
Since the entire project is based on the `Python` programming language, it is necessary to have Python installed in the system. It is recommended to use Python with version `>=3.6`.
The Python packages used in this project are `matplotlib`, `numpy`, `pandas`, `scikit-learn`, `torch` and `torchvision`. All of these dependencies can be installed with a single command:
- `pip install -r requirements.txt`

## Code implementation
- ### Data paths :

        Current directory
        └── data
            ├── train
            │   ├── class_1
            │   ├── class_2
            │   ├── ...
            │   └── class_n
            └── val
                ├── class_1
                ├── class_2
                ├── ...
                └── class_n

- Here the folders `train` and `val` contain the folders `benign` and `malignant`, which hold the original histopathological images of the respective type of human breast tumor tissue in `.jpg`/`.png` format. A helper sketch for building this layout is given below.
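
- ### Preparing the data folders (optional) :

  A minimal sketch, not part of the original codebase, for building the layout above from a flat folder of class-labelled images. The `raw_images` source folder name, its `raw_images/<class_name>/<image>` layout and the 80/20 train/val split are assumptions to adapt as needed.

        import os
        import random
        import shutil

        src = 'raw_images'  # assumed source layout: raw_images/<class_name>/<image files>
        random.seed(0)      # reproducible split
        for class_name in os.listdir(src):
            images = os.listdir(os.path.join(src, class_name))
            random.shuffle(images)
            split = int(0.8*len(images))  # assumed 80/20 train/val split
            for subset, subset_images in [('train', images[:split]), ('val', images[split:])]:
                dest = os.path.join('data', subset, class_name)
                os.makedirs(dest, exist_ok=True)
                for image in subset_images:
                    shutil.copy(os.path.join(src, class_name, image), dest)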

- ### Training and Evaluation :

        usage: main.py [-h] [-data DATA_FOLDER] [-classes NUM_CLASSES]
                       [-ext EXTRACTOR_TYPE] [-classif CLASSIFIER_TYPE]

        Application of Genetic Algorithm

        optional arguments:
          -h, --help            show this help message and exit
          -data DATA_FOLDER, --data_folder DATA_FOLDER
                                Path to data
          -classes NUM_CLASSES, --num_classes NUM_CLASSES
                                Number of data classes
          -ext EXTRACTOR_TYPE, --extractor_type EXTRACTOR_TYPE
                                Choice of deep feature extractor
          -classif CLASSIFIER_TYPE, --classifier_type CLASSIFIER_TYPE
                                Choice of classifier for GA

- ### Run the following for training and validation :

  `python main.py -data data -classes n -ext resnet -classif KNN`

- ### Specific tokens :

        GoogLeNet: 'googlenet'
        ResNet-18: 'resnet'
        VGG-19: 'vgg'
        SVM Classifier: 'SVM'
        KNN Classifier: 'KNN'
        MLP Classifier: 'MLP'
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from Deep_feature_extractors import extractor
from Genetic_Algorithm import GA
import argparse


parser = argparse.ArgumentParser(description='Application of Genetic Algorithm')
# Command-line options
parser.add_argument('-data', '--data_folder', type=str,
                    default='data',
                    help='Path to data')
parser.add_argument('-classes', '--num_classes', type=int,
                    default=2,
                    help='Number of data classes')
parser.add_argument('-ext', '--extractor_type', type=str,
                    default='resnet',
                    help='Choice of deep feature extractor')
parser.add_argument('-classif', '--classifier_type', type=str,
                    default='KNN',
                    help='Choice of classifier for GA')


args = parser.parse_args()
folder_path = args.data_folder
out_classes = args.num_classes
ext = args.extractor_type
classif = args.classifier_type


print("Extracting deep features...")
print('\n'*2)
array_train, array_val = extractor.feature_extractor(folder_path, ext, out_classes)
print("Deep features extracted.")
print('\n'*2)
print('Applying Genetic Algorithm...')
print('\n'*2)
GA.algorithm(array_train, array_val, classif)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
-f https://download.pytorch.org/whl/cu102/torch_stable.html
torch==1.10.0+cu102
torchvision==0.11.1+cu102
torchaudio===0.10.0+cu102
matplotlib==3.2.1
numpy==1.18.3
pandas==1.0.3
scikit_learn==0.24.2
--------------------------------------------------------------------------------
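
As a quick sanity check of the GA stage without downloading BreakHis, the algorithm can be fed random feature arrays. This snippet is an added illustration, not a file from the repository; the array layout (features with the labels appended as the final column) mirrors what `extractor.feature_extractor` returns.

    import numpy as np
    from Genetic_Algorithm import GA

    rng = np.random.default_rng(0)
    n_features = 64  # stand-in for the 512/1024/25088-d deep features

    # Random "deep features" with binary labels in the final column
    array_train = np.hstack([rng.normal(size=(60, n_features)), rng.integers(0, 2, size=(60, 1))])
    array_val = np.hstack([rng.normal(size=(30, n_features)), rng.integers(0, 2, size=(30, 1))])

    GA.algorithm(array_train, array_val, 'KNN')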