├── README.md ├── bounding_box_dataset_builder.py ├── classifier.py ├── classifier.pyc ├── preprocessor.py └── successful_threshold_seperator.py /README.md: -------------------------------------------------------------------------------- 1 | # A basic classifier for the ISIC challenge. 2 | 3 | Accomponies [the following blog post:)](https://hackernoon.com/machine-learning-for-isic-skin-cancer-classification-challenge-part-1-ccddea4ec44a) 4 | 5 | * Does not contain the dataset used in the blog post. You can find the data [on the ISIC website](https://isic-archive.com/) 6 | * Not exactly beautiful code here 7 | * There is some code here for experiments I did later (such as thresholding the images and seeing how that affected accuracy). The thresholding code is not here. 8 | -------------------------------------------------------------------------------- /bounding_box_dataset_builder.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import subprocess 3 | import os 4 | import cv2 5 | 6 | EXT = '.jpg' 7 | DATA_DIR_THRES = 'thresholded_final/out_final/' 8 | DATA_DIR = 'final/' 9 | OUT_DIR = 'bound_box_final_2/' 10 | #DATA_DIR = 'final/' 11 | 12 | def main(): 13 | gt = pd.read_csv("final.csv") 14 | # iterate over ground truth 15 | for idx, row in gt.iterrows(): 16 | # use threshold fusion to build directory structure for transfer learning 17 | bounded_box_img = get_bounding_box(row['image_id'], threshold=False) 18 | cv2.imwrite(OUT_DIR + row['image_id'] + '.jpg',bounded_box_img) 19 | if idx % 5 == 0: 20 | print "{}% Complete".format(idx / float(len(gt)) * 100) 21 | 22 | def get_bounding_box(image_id, threshold=True): 23 | img_path = DATA_DIR + image_id + EXT 24 | if threshold: 25 | threshold_fusion_cmd = "./fourier_0.8/threshold_fusion " + img_path 26 | thresholded_img_name = OUT_DIR + image_id + "_bin.bmp" 27 | subprocess.call(threshold_fusion_cmd, shell = True) 28 | else: 29 | thresholded_img_name = DATA_DIR_THRES + image_id + "_bin.bmp" 30 | threshold_img = cv2.imread(thresholded_img_name, 0) 31 | original_img = cv2.imread(img_path) 32 | contours = cv2.findContours(threshold_img, 33 | cv2.RETR_TREE, 34 | cv2.CHAIN_APPROX_SIMPLE)[0][0] 35 | x,y,w,h = cv2.boundingRect(contours) 36 | 37 | cv2.rectangle(original_img,(x,y), (x+w,y+h),3) 38 | #cv2.imshow('img',original_img[y:y+h,x:x+w]) 39 | #cv2.waitKey(0) 40 | 41 | return original_img[y:y+h, x:x+w] 42 | 43 | if __name__ == "__main__": 44 | #box = get_bounding_box('isic_0000009', threshold=False) 45 | main() 46 | -------------------------------------------------------------------------------- /classifier.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import math 4 | from preprocessor import extract_features_and_class 5 | from sklearn import ensemble 6 | from sklearn.model_selection import KFold 7 | from sklearn.metrics import roc_curve, auc 8 | import matplotlib.pyplot as plt 9 | from scipy import interp 10 | import itertools 11 | from sklearn.metrics import confusion_matrix, precision_score, recall_score 12 | 13 | NUM_FOLDS = 7 14 | DATA_SET = 'isic_balanced' 15 | PERCENTAGE_BENIGN = 0.5 16 | PERCENTAGE_MALIGNANT = 1 - PERCENTAGE_BENIGN 17 | IS_REDUCED = True 18 | 19 | def get_data(dataset_file, reduced=False, shuffle=True): 20 | dataset_file += '_reduced_20.npy' if reduced else '.npy' 21 | print "Loading {}...".format(dataset_file) 22 | data_with_class = np.load(dataset_file) 23 | print data_with_class.shape 24 | return extract_features_and_class(get_split(data_with_class)) 25 | 26 | def train_classifier(data, classes): 27 | clf = ensemble.RandomForestClassifier(n_estimators=100) # 75% 28 | clf.fit(data, classes) 29 | return clf 30 | 31 | def get_benign_training_prob(label_data): 32 | return int(math.floor(100 * \ 33 | len(np.where(label_data == 0)[0]) / float(len(label_data)))) 34 | 35 | # Gets a shuffled split of benign to malignant images based on PERCENTAGE_BENIGN 36 | def get_split(data): 37 | malignant = data[np.where(data[:,-1] == 1)] 38 | num_benign = int(math.ceil(PERCENTAGE_BENIGN * malignant.shape[0] / PERCENTAGE_MALIGNANT)) 39 | benign = data[np.where(data[:,-1] == 0)] 40 | np.random.shuffle(benign) 41 | per_benign = benign[0:num_benign,:] 42 | split_data = np.vstack((per_benign, malignant)) 43 | np.random.shuffle(split_data) 44 | return split_data 45 | 46 | def main(): 47 | # Load dataset as numpy arr 48 | X, Y = get_data(DATA_SET, reduced=IS_REDUCED) 49 | 50 | kf = KFold(n_splits = NUM_FOLDS, shuffle=True) 51 | 52 | print "Running K-folds for the classifier... K = {}".format(NUM_FOLDS) 53 | scores, precs, recalls = [], [], [] 54 | for k, (train, test) in enumerate(kf.split(X, Y)): 55 | clf = train_classifier(X[train], Y[train]) 56 | score = clf.score(X[test],Y[test]) 57 | predictions =clf.predict(X[test]) 58 | precision = precision_score(Y[test], predictions) 59 | recall = recall_score(Y[test], predictions) 60 | print "Score: {}% Precision: {}% Recall: {}%".\ 61 | format(percentify(score), percentify(precision), percentify(recall)) 62 | scores.append(score) 63 | precs.append(precision) 64 | recalls.append(recall) 65 | 66 | print "Mean of trials: accuracy {}% precision: {}% recall: {}%".format(percentify(np.mean(scores)), percentify(np.mean(precs)), percentify(np.mean(recalls))) 67 | print "Median of trials: {}%".format(percentify(np.mean(scores))) 68 | 69 | def main_with_cnf(): 70 | # Load dataset as numpy arr 71 | X, Y = get_data(DATA_SET, reduced=IS_REDUCED) 72 | 73 | kf = KFold(n_splits = NUM_FOLDS, shuffle=True) 74 | 75 | print "Running K-folds for the classifier... K = {}".format(NUM_FOLDS) 76 | scores, precs, recalls = [], [], [] 77 | cnf = None 78 | for k, (train, test) in enumerate(kf.split(X, Y)): 79 | clf = train_classifier(X[train], Y[train]) 80 | score = clf.score(X[test],Y[test]) 81 | preds = clf.predict(X[test]) 82 | cnf = confusion_matrix(Y[test], preds) 83 | predictions =clf.predict(X[test]) 84 | precision = precision_score(Y[test], predictions) 85 | recall = recall_score(Y[test], predictions) 86 | print "Score: {}% Precision: {}% Recall: {}%".\ 87 | format(percentify(score), percentify(precision), percentify(recall)) 88 | scores.append(score) 89 | precs.append(precision) 90 | recalls.append(recall) 91 | plt.figure() 92 | plot_confusion_matrix(cnf, classes=['benign', 'malignant'], normalize=True, title='Confusion matrix with normalization') 93 | plt.show() 94 | 95 | print "Mean of trials: accuracy {}% precision: {}% recall: {}%".format(percentify(np.mean(scores)), percentify(np.mean(precs)), percentify(np.mean(recalls))) 96 | print "Median of trials: {}%".format(percentify(np.mean(scores))) 97 | 98 | def percentify(decimal): 99 | return int(math.floor(100 * decimal)) 100 | 101 | def main_with_auc(): 102 | # Load dataset as numpy arr 103 | X, Y = get_data(DATA_SET, reduced=IS_REDUCED) 104 | 105 | kf = KFold(n_splits = NUM_FOLDS, shuffle=True) 106 | 107 | print "Running K-folds for the classifier... K = {}".format(NUM_FOLDS) 108 | scores,tprs,aucs = [],[],[] 109 | mean_fpr = np.linspace(0, 1, 100) 110 | for k, (train, test) in enumerate(kf.split(X, Y)): 111 | clf = train_classifier(X[train], Y[train]) 112 | score = clf.score(X[test],Y[test]) 113 | probs = clf.predict_proba(X[test]) 114 | fpr, tpr, thresholds = roc_curve(Y[test], probs[:, 1]) 115 | tprs.append(interp(mean_fpr, fpr, tpr)) 116 | tprs[-1][0] = 0.0 117 | roc_auc = auc(fpr, tpr) 118 | aucs.append(roc_auc) 119 | plt.plot(fpr, tpr, lw=1, alpha=0.3, 120 | label='ROC fold %d (AUC = %0.2f)' % (k, roc_auc)) 121 | #score = clf.score(X[test],Y[test]) 122 | #print "Score: {}%".format(int(math.floor(100 * score))) 123 | 124 | scores.append(score) 125 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', 126 | label='Random', alpha=.8) 127 | mean_tpr = np.mean(tprs, axis=0) 128 | mean_tpr[-1] = 1.0 129 | mean_auc = auc(mean_fpr, mean_tpr) 130 | std_auc = np.std(aucs) 131 | plt.plot(mean_fpr, mean_tpr, color='b', 132 | label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), 133 | lw=2, alpha=.8) 134 | std_tpr = np.std(tprs, axis=0) 135 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 136 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 137 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, 138 | label=r'$\pm$ 1 std. dev.') 139 | 140 | plt.xlim([-0.05, 1.05]) 141 | plt.ylim([-0.05, 1.05]) 142 | plt.xlabel('False-positive rate (unnecessary biopsy)') 143 | plt.ylabel('Sensitivity') 144 | plt.title('Receiver Operator Characteristic for {} Folds'.format(NUM_FOLDS)) 145 | plt.legend(loc="lower right") 146 | plt.show() 147 | 148 | print "Mean of trials: {}".format(np.mean(scores)) 149 | print "Median of trials: {}".format(np.mean(scores)) 150 | 151 | def plot_confusion_matrix(cm, classes, 152 | normalize=False, 153 | title='Confusion matrix', 154 | cmap=plt.cm.Blues): 155 | if normalize: 156 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 157 | print("Normalized confusion matrix") 158 | else: 159 | print('Confusion matrix, without normalization') 160 | 161 | print(cm) 162 | 163 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 164 | plt.title(title) 165 | plt.colorbar() 166 | tick_marks = np.arange(len(classes)) 167 | plt.xticks(tick_marks, classes, rotation=45) 168 | plt.yticks(tick_marks, classes) 169 | 170 | fmt = '.2f' if normalize else 'd' 171 | thresh = cm.max() / 2. 172 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 173 | plt.text(j, i, format(cm[i, j], fmt), 174 | horizontalalignment="center", 175 | color="white" if cm[i, j] > thresh else "black") 176 | 177 | plt.tight_layout() 178 | plt.ylabel('True label') 179 | plt.xlabel('Predicted label') 180 | 181 | if __name__ == "__main__": 182 | #main() 183 | #main_with_auc() 184 | main_with_cnf() 185 | -------------------------------------------------------------------------------- /classifier.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/evankozliner/simple-classifier/ec780f434b555d5e2acdfefeac4ecff518828bd9/classifier.pyc -------------------------------------------------------------------------------- /preprocessor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import PIL 4 | import time 5 | import math 6 | from PIL import Image,ImageFile 7 | from sklearn.decomposition import IncrementalPCA 8 | 9 | # Median dimensions for the lesions after segmentation 10 | # NOTE these are *not* the median dimensions of the ISIC dataset, they are the 11 | # median dimensions of the ISIC combined with another dataset 12 | HEIGHT = 510 13 | WIDTH = 766 14 | 15 | CHANNELS = 3 16 | DATA_DIR = 'bound_box_final_2/' 17 | N_COMPS = 10 18 | BATCH_SIZE=100 19 | OUTPUT_NAME = 'final_isic_thres' 20 | 21 | def timer(f): 22 | def wrapper(*args, **kwargs): 23 | t1 = time.time() 24 | f(*args, **kwargs) 25 | t2 = time.time() 26 | print "Function took {} seconds. \n".format(str(t2-t1)) 27 | return wrapper 28 | 29 | def build_dataset(dataset, ignore_set): 30 | print "Building dataset: {}".format(dataset) 31 | data = pd.read_csv(dataset + '.csv') 32 | 33 | # Images not reliably complete 34 | ImageFile.LOAD_TRUNCATED_IMAGES = True 35 | 36 | # Flattened image vector is of shape HEIGHT*WIDTH*CHANNELS 37 | # Add 1 dimension to vector for class 38 | data_matrix = np.zeros((data.shape[0], HEIGHT * WIDTH * CHANNELS + 1)) 39 | 40 | for idx,row in data.iterrows(): 41 | if row[0] + ".jpg" in ignore_set or not "isic" in row[0]: 42 | print "ignoring " + row[0] + ".jpg" 43 | continue 44 | img = Image.open(DATA_DIR + row[0] + '.jpg') 45 | resized_img = img.resize((HEIGHT, WIDTH), PIL.Image.ANTIALIAS) 46 | image_vector = np.array(resized_img).flatten() 47 | image_vector_with_class = np.concatenate(\ 48 | (image_vector, [row['melanoma']]), axis=0) 49 | data_matrix[idx] = image_vector_with_class 50 | 51 | if idx % 50 == 0: 52 | print "{} images done...".format(idx) 53 | 54 | return data_matrix 55 | 56 | def get_next_batch(data, i, BATCH_SIZE): 57 | if (i + 1) * BATCH_SIZE > data.shape[0]: 58 | return data[i*BATCH_SIZE:data.shape[0],:] 59 | return data[i * BATCH_SIZE:(i + 1) * BATCH_SIZE,:] 60 | 61 | def reduce_dimensionality(dataset): 62 | print "Reducing dimensionality for dataset: {}".format(dataset) 63 | print "Loading {}".format(dataset) 64 | data_with_class = np.load(dataset) 65 | print "Finished Loading {}".format(dataset) 66 | data_no_class, y = extract_features_and_class(data_with_class) 67 | 68 | # Will Seg-fault with regular PCA due to dataset size 69 | # Somewhat arbitrary batch size here. 70 | pca = IncrementalPCA(n_components=N_COMPS) 71 | num_batches = int(math.ceil(y.shape[0] / float(BATCH_SIZE))) 72 | 73 | print "Beggining to fit dataset" 74 | for i in xrange(num_batches): 75 | batch = get_next_batch(data_no_class, i, BATCH_SIZE) 76 | pca.partial_fit(batch) 77 | if i % 10 == 0: 78 | print "{}% complete.".format( float(i) / num_batches * 100) 79 | 80 | print "Beggining to fit transform dataset" 81 | reduced_data = None 82 | for i in xrange(num_batches): 83 | batch = get_next_batch(data_no_class, i, BATCH_SIZE) 84 | transformed_chunk = pca.transform(batch) 85 | if reduced_data == None: 86 | reduced_data = transformed_chunk 87 | else: 88 | reduced_data = np.vstack((reduced_data, transformed_chunk)) 89 | if i % 10 == 0: 90 | print "{}% complete.".format(float(i) / num_batches * 100) 91 | 92 | print "PCA complete for {} components. Explained variance: {}".\ 93 | format(N_COMPS, np.sum(pca.explained_variance_ratio_)) 94 | print reduced_data.shape 95 | print y.shape 96 | reduced_data_with_class = np.hstack((reduced_data,y)) 97 | return reduced_data_with_class 98 | 99 | def extract_features_and_class(data_with_class): 100 | y = data_with_class[:,-1] 101 | # Reshape into column vector instead of row 102 | y_col = y.reshape(y.size,1) 103 | n_columns = data_with_class.shape[1] - 1 104 | data_no_class = data_with_class[:,0:n_columns] 105 | return data_no_class, y_col 106 | 107 | def get_ignored_images(): 108 | with open("failed_thres_img.txt") as f: 109 | a = map(lambda x: x.split("\n")[0], f.readlines()) 110 | return set(a) 111 | 112 | @timer 113 | def build_and_write_dataset(dataset): 114 | dataset_file = dataset + ".npy" 115 | 116 | ignore_set = get_ignored_images() 117 | 118 | dataset_matrix = build_dataset(dataset, ignore_set) 119 | print "Saving {}".format(dataset_file) 120 | np.save(dataset_file, dataset_matrix) 121 | 122 | @timer 123 | def build_and_write_reduced_dataset(dataset): 124 | reduced_dataset_file = dataset + "_reduced.npy" 125 | reduced_matrix = reduce_dimensionality(dataset + '.npy') 126 | print "Saving {}".format(reduced_dataset_file) 127 | np.save(reduced_dataset_file, reduced_matrix) 128 | 129 | def main(dataset): 130 | build_and_write_dataset(dataset) 131 | build_and_write_reduced_dataset(dataset) 132 | 133 | if __name__ == "__main__": 134 | if BATCH_SIZE < N_COMPS: 135 | # See https://github.com/scikit-learn/scikit-learn/issues/6452 136 | raise ValueError("Number of components must be < \ 137 | batch size.") 138 | main(OUTPUT_NAME) 139 | 140 | -------------------------------------------------------------------------------- /successful_threshold_seperator.py: -------------------------------------------------------------------------------- 1 | """ Images that failed thresholding tend to be blank, this just filters them into a new dataset. TODO add failures due to hair here. """ 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from PIL import Image 6 | from shutil import copyfile 7 | 8 | THRES_DIR = "thresholded_final/out_final/" 9 | DATA_DIR = "final/" 10 | OUT_DIR = "success_threshold_final/" 11 | PERCENTAGE_WHITE = 1 12 | 13 | def main(): 14 | df = pd.read_csv("final.csv") 15 | failed_img_notes = open("failed_thres_img.txt", 'w+') 16 | for idx,row in df.iterrows(): 17 | print row[0] 18 | thres_img = np.array(Image.open(THRES_DIR + row[0] + '_bin.bmp')) 19 | if (image_is_not_blank(thres_img)): 20 | copyfile(DATA_DIR + row[0] + ".jpg", OUT_DIR + row[0] + ".jpg") 21 | else: 22 | failed_img_notes.write(row[0] + ".jpg\n") 23 | 24 | failed_img_notes.close() 25 | 26 | def image_is_not_blank(thres_img): 27 | flattened = thres_img.flatten() 28 | return np.sum(flattened) / float(flattened.size) * 100 > PERCENTAGE_WHITE 29 | 30 | if __name__ == "__main__": 31 | main() 32 | --------------------------------------------------------------------------------