├── README.md
├── bounding_box_dataset_builder.py
├── classifier.py
├── classifier.pyc
├── preprocessor.py
└── successful_threshold_seperator.py


/README.md:
--------------------------------------------------------------------------------
1 | # A basic classifier for the ISIC challenge. 
2 | 
3 | Accomponies [the following blog post:)](https://hackernoon.com/machine-learning-for-isic-skin-cancer-classification-challenge-part-1-ccddea4ec44a)
4 | 
5 | * Does not contain the dataset used in the blog post. You can find the data [on the ISIC website](https://isic-archive.com/)
6 | * Not exactly beautiful code here
7 | * There is some code here for experiments I did later (such as thresholding the images and seeing how that affected accuracy). The thresholding code is not here. 
8 | 


--------------------------------------------------------------------------------
/bounding_box_dataset_builder.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import subprocess
 3 | import os
 4 | import cv2
 5 | 
 6 | EXT = '.jpg'
 7 | DATA_DIR_THRES = 'thresholded_final/out_final/'
 8 | DATA_DIR = 'final/'
 9 | OUT_DIR = 'bound_box_final_2/'
10 | #DATA_DIR = 'final/'
11 | 
12 | def main():
13 |     gt = pd.read_csv("final.csv")
14 |     # iterate over ground truth
15 |     for idx, row in gt.iterrows():
16 |         # use threshold fusion to build directory structure for transfer learning
17 |         bounded_box_img = get_bounding_box(row['image_id'], threshold=False)
18 |         cv2.imwrite(OUT_DIR + row['image_id'] + '.jpg',bounded_box_img)
19 |         if idx % 5 == 0:
20 |             print "{}% Complete".format(idx / float(len(gt)) * 100)
21 | 
22 | def get_bounding_box(image_id, threshold=True):
23 |     img_path = DATA_DIR + image_id + EXT
24 |     if threshold:
25 |         threshold_fusion_cmd = "./fourier_0.8/threshold_fusion " + img_path
26 |         thresholded_img_name = OUT_DIR + image_id + "_bin.bmp"
27 |         subprocess.call(threshold_fusion_cmd, shell = True)
28 |     else:
29 |         thresholded_img_name = DATA_DIR_THRES + image_id + "_bin.bmp"
30 |     threshold_img = cv2.imread(thresholded_img_name, 0)
31 |     original_img = cv2.imread(img_path)
32 |     contours = cv2.findContours(threshold_img, 
33 |             cv2.RETR_TREE, 
34 |             cv2.CHAIN_APPROX_SIMPLE)[0][0]
35 |     x,y,w,h = cv2.boundingRect(contours)
36 | 
37 |     cv2.rectangle(original_img,(x,y), (x+w,y+h),3)
38 |     #cv2.imshow('img',original_img[y:y+h,x:x+w])
39 |     #cv2.waitKey(0)
40 |     
41 |     return original_img[y:y+h, x:x+w]
42 | 
43 | if __name__ == "__main__":
44 |     #box = get_bounding_box('isic_0000009', threshold=False)
45 |     main()
46 | 


--------------------------------------------------------------------------------
/classifier.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import math
  4 | from preprocessor import extract_features_and_class
  5 | from sklearn import ensemble
  6 | from sklearn.model_selection import KFold
  7 | from sklearn.metrics import roc_curve, auc
  8 | import matplotlib.pyplot as plt
  9 | from scipy import interp
 10 | import itertools
 11 | from sklearn.metrics import confusion_matrix, precision_score, recall_score
 12 | 
 13 | NUM_FOLDS = 7
 14 | DATA_SET = 'isic_balanced'
 15 | PERCENTAGE_BENIGN = 0.5
 16 | PERCENTAGE_MALIGNANT = 1 - PERCENTAGE_BENIGN
 17 | IS_REDUCED = True
 18 | 
 19 | def get_data(dataset_file, reduced=False, shuffle=True):
 20 |     dataset_file += '_reduced_20.npy' if reduced else '.npy'
 21 |     print "Loading {}...".format(dataset_file)
 22 |     data_with_class = np.load(dataset_file)
 23 |     print data_with_class.shape
 24 |     return extract_features_and_class(get_split(data_with_class))
 25 | 
 26 | def train_classifier(data, classes):
 27 |     clf = ensemble.RandomForestClassifier(n_estimators=100) # 75%
 28 |     clf.fit(data, classes)
 29 |     return clf
 30 | 
 31 | def get_benign_training_prob(label_data):
 32 |     return int(math.floor(100 * \
 33 |             len(np.where(label_data == 0)[0]) / float(len(label_data))))
 34 | 
 35 | # Gets a shuffled split of benign to malignant images based on PERCENTAGE_BENIGN
 36 | def get_split(data):
 37 |     malignant = data[np.where(data[:,-1] == 1)]
 38 |     num_benign = int(math.ceil(PERCENTAGE_BENIGN * malignant.shape[0] / PERCENTAGE_MALIGNANT))
 39 |     benign = data[np.where(data[:,-1] == 0)]
 40 |     np.random.shuffle(benign)
 41 |     per_benign = benign[0:num_benign,:]
 42 |     split_data = np.vstack((per_benign, malignant))
 43 |     np.random.shuffle(split_data)
 44 |     return split_data
 45 | 
 46 | def main():
 47 |     # Load dataset as numpy arr
 48 |     X, Y = get_data(DATA_SET, reduced=IS_REDUCED)
 49 | 
 50 |     kf = KFold(n_splits = NUM_FOLDS, shuffle=True)
 51 | 
 52 |     print "Running K-folds for the classifier... K = {}".format(NUM_FOLDS)
 53 |     scores, precs, recalls = [], [], []
 54 |     for k, (train, test) in enumerate(kf.split(X, Y)):
 55 |         clf = train_classifier(X[train], Y[train])
 56 |         score = clf.score(X[test],Y[test])
 57 |         predictions =clf.predict(X[test]) 
 58 |         precision = precision_score(Y[test], predictions)
 59 |         recall = recall_score(Y[test], predictions)
 60 |         print "Score: {}%    Precision: {}%    Recall: {}%".\
 61 |                 format(percentify(score), percentify(precision), percentify(recall))
 62 |         scores.append(score)
 63 |         precs.append(precision)
 64 |         recalls.append(recall)
 65 |         
 66 |     print "Mean of trials: accuracy {}% precision: {}% recall: {}%".format(percentify(np.mean(scores)), percentify(np.mean(precs)), percentify(np.mean(recalls)))
 67 |     print "Median of trials: {}%".format(percentify(np.mean(scores)))
 68 | 
 69 | def main_with_cnf():
 70 |     # Load dataset as numpy arr
 71 |     X, Y = get_data(DATA_SET, reduced=IS_REDUCED)
 72 | 
 73 |     kf = KFold(n_splits = NUM_FOLDS, shuffle=True)
 74 | 
 75 |     print "Running K-folds for the classifier... K = {}".format(NUM_FOLDS)
 76 |     scores, precs, recalls = [], [], []
 77 |     cnf = None
 78 |     for k, (train, test) in enumerate(kf.split(X, Y)):
 79 |         clf = train_classifier(X[train], Y[train])
 80 |         score = clf.score(X[test],Y[test])
 81 |         preds = clf.predict(X[test])
 82 |         cnf = confusion_matrix(Y[test], preds)
 83 |         predictions =clf.predict(X[test]) 
 84 |         precision = precision_score(Y[test], predictions)
 85 |         recall = recall_score(Y[test], predictions)
 86 |         print "Score: {}%    Precision: {}%    Recall: {}%".\
 87 |                 format(percentify(score), percentify(precision), percentify(recall))
 88 |         scores.append(score)
 89 |         precs.append(precision)
 90 |         recalls.append(recall)
 91 |         plt.figure()
 92 |     plot_confusion_matrix(cnf, classes=['benign', 'malignant'], normalize=True, title='Confusion matrix with normalization')
 93 |     plt.show()
 94 | 
 95 |     print "Mean of trials: accuracy {}% precision: {}% recall: {}%".format(percentify(np.mean(scores)), percentify(np.mean(precs)), percentify(np.mean(recalls)))
 96 |     print "Median of trials: {}%".format(percentify(np.mean(scores)))
 97 | 
 98 | def percentify(decimal):
 99 |     return int(math.floor(100 * decimal))
100 | 
101 | def main_with_auc():
102 |     # Load dataset as numpy arr
103 |     X, Y = get_data(DATA_SET, reduced=IS_REDUCED)
104 | 
105 |     kf = KFold(n_splits = NUM_FOLDS, shuffle=True)
106 | 
107 |     print "Running K-folds for the classifier... K = {}".format(NUM_FOLDS)
108 |     scores,tprs,aucs = [],[],[]
109 |     mean_fpr = np.linspace(0, 1, 100)
110 |     for k, (train, test) in enumerate(kf.split(X, Y)):
111 |         clf = train_classifier(X[train], Y[train])
112 |         score = clf.score(X[test],Y[test])
113 |         probs = clf.predict_proba(X[test])
114 |         fpr, tpr, thresholds = roc_curve(Y[test], probs[:, 1])
115 |         tprs.append(interp(mean_fpr, fpr, tpr))
116 |         tprs[-1][0] = 0.0
117 |         roc_auc = auc(fpr, tpr)
118 |         aucs.append(roc_auc)
119 |         plt.plot(fpr, tpr, lw=1, alpha=0.3,
120 |              label='ROC fold %d (AUC = %0.2f)' % (k, roc_auc))
121 |         #score = clf.score(X[test],Y[test])
122 |         #print "Score: {}%".format(int(math.floor(100 * score)))
123 | 
124 |         scores.append(score)
125 |     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
126 |          label='Random', alpha=.8)
127 |     mean_tpr = np.mean(tprs, axis=0)
128 |     mean_tpr[-1] = 1.0
129 |     mean_auc = auc(mean_fpr, mean_tpr)
130 |     std_auc = np.std(aucs)
131 |     plt.plot(mean_fpr, mean_tpr, color='b',
132 |          label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
133 |          lw=2, alpha=.8)
134 |     std_tpr = np.std(tprs, axis=0)
135 |     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
136 |     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
137 |     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
138 |                  label=r'$\pm$ 1 std. dev.')
139 | 
140 |     plt.xlim([-0.05, 1.05])
141 |     plt.ylim([-0.05, 1.05])
142 |     plt.xlabel('False-positive rate (unnecessary biopsy)')
143 |     plt.ylabel('Sensitivity')
144 |     plt.title('Receiver Operator Characteristic for {} Folds'.format(NUM_FOLDS))
145 |     plt.legend(loc="lower right")
146 |     plt.show()
147 | 
148 |     print "Mean of trials: {}".format(np.mean(scores))
149 |     print "Median of trials: {}".format(np.mean(scores))
150 | 
151 | def plot_confusion_matrix(cm, classes,
152 |                           normalize=False,
153 |                           title='Confusion matrix',
154 |                           cmap=plt.cm.Blues):
155 |     if normalize:
156 |         cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
157 |         print("Normalized confusion matrix")
158 |     else:
159 |         print('Confusion matrix, without normalization')
160 | 
161 |     print(cm)
162 | 
163 |     plt.imshow(cm, interpolation='nearest', cmap=cmap)
164 |     plt.title(title)
165 |     plt.colorbar()
166 |     tick_marks = np.arange(len(classes))
167 |     plt.xticks(tick_marks, classes, rotation=45)
168 |     plt.yticks(tick_marks, classes)
169 | 
170 |     fmt = '.2f' if normalize else 'd'
171 |     thresh = cm.max() / 2.
172 |     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
173 |         plt.text(j, i, format(cm[i, j], fmt),
174 |                  horizontalalignment="center",
175 |                  color="white" if cm[i, j] > thresh else "black")
176 | 
177 |     plt.tight_layout()
178 |     plt.ylabel('True label')
179 |     plt.xlabel('Predicted label')
180 | 
181 | if __name__ == "__main__":
182 |     #main()
183 |     #main_with_auc()
184 |     main_with_cnf()
185 | 


--------------------------------------------------------------------------------
/classifier.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evankozliner/simple-classifier/ec780f434b555d5e2acdfefeac4ecff518828bd9/classifier.pyc


--------------------------------------------------------------------------------
/preprocessor.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import PIL
  4 | import time
  5 | import math
  6 | from PIL import Image,ImageFile
  7 | from sklearn.decomposition import IncrementalPCA
  8 | 
  9 | # Median dimensions for the lesions after segmentation
 10 | # NOTE these are *not* the median dimensions of the ISIC dataset, they are the
 11 | # median dimensions of the ISIC combined with another dataset 
 12 | HEIGHT = 510
 13 | WIDTH = 766
 14 | 
 15 | CHANNELS    = 3
 16 | DATA_DIR = 'bound_box_final_2/'
 17 | N_COMPS = 10
 18 | BATCH_SIZE=100
 19 | OUTPUT_NAME = 'final_isic_thres'
 20 | 
 21 | def timer(f):
 22 |     def wrapper(*args, **kwargs):
 23 |         t1 = time.time()
 24 |         f(*args, **kwargs)
 25 |         t2 = time.time()
 26 |         print "Function took {} seconds. \n".format(str(t2-t1))
 27 |     return wrapper
 28 | 
 29 | def build_dataset(dataset, ignore_set):
 30 |     print "Building dataset: {}".format(dataset)
 31 |     data = pd.read_csv(dataset + '.csv')
 32 | 
 33 |     # Images not reliably complete
 34 |     ImageFile.LOAD_TRUNCATED_IMAGES = True
 35 | 
 36 |     # Flattened image vector is of shape HEIGHT*WIDTH*CHANNELS
 37 |     # Add 1 dimension to vector for class
 38 |     data_matrix = np.zeros((data.shape[0], HEIGHT * WIDTH * CHANNELS + 1))
 39 | 
 40 |     for idx,row in data.iterrows():
 41 |         if row[0] + ".jpg" in ignore_set or not "isic" in row[0]:
 42 |             print "ignoring " + row[0] + ".jpg"
 43 |             continue
 44 |         img = Image.open(DATA_DIR + row[0] + '.jpg')
 45 |         resized_img = img.resize((HEIGHT, WIDTH), PIL.Image.ANTIALIAS)
 46 |         image_vector = np.array(resized_img).flatten()
 47 |         image_vector_with_class = np.concatenate(\
 48 |                 (image_vector, [row['melanoma']]), axis=0)
 49 |         data_matrix[idx] = image_vector_with_class
 50 | 
 51 |         if idx % 50 == 0:
 52 |             print "{} images done...".format(idx)
 53 | 
 54 |     return data_matrix
 55 | 
 56 | def get_next_batch(data, i, BATCH_SIZE):
 57 |     if (i + 1) * BATCH_SIZE > data.shape[0]:
 58 |         return data[i*BATCH_SIZE:data.shape[0],:]
 59 |     return data[i * BATCH_SIZE:(i + 1) * BATCH_SIZE,:]
 60 | 
 61 | def reduce_dimensionality(dataset):
 62 |     print "Reducing dimensionality for dataset: {}".format(dataset)
 63 |     print "Loading {}".format(dataset)
 64 |     data_with_class = np.load(dataset)
 65 |     print "Finished Loading {}".format(dataset)
 66 |     data_no_class, y = extract_features_and_class(data_with_class)
 67 | 
 68 |     # Will Seg-fault with regular PCA due to dataset size
 69 |     # Somewhat arbitrary batch size here. 
 70 |     pca = IncrementalPCA(n_components=N_COMPS)
 71 |     num_batches = int(math.ceil(y.shape[0] / float(BATCH_SIZE)))
 72 | 
 73 |     print "Beggining to fit dataset"
 74 |     for i in xrange(num_batches):
 75 |         batch = get_next_batch(data_no_class, i, BATCH_SIZE) 
 76 |         pca.partial_fit(batch)
 77 |         if i % 10 == 0:
 78 |             print "{}% complete.".format( float(i) / num_batches * 100)
 79 | 
 80 |     print "Beggining to fit transform dataset"
 81 |     reduced_data = None
 82 |     for i in xrange(num_batches):
 83 |         batch = get_next_batch(data_no_class, i, BATCH_SIZE) 
 84 |         transformed_chunk = pca.transform(batch)
 85 |         if reduced_data == None:
 86 |             reduced_data = transformed_chunk
 87 |         else:
 88 |             reduced_data = np.vstack((reduced_data, transformed_chunk))
 89 |         if i % 10 == 0:
 90 |             print "{}% complete.".format(float(i) / num_batches * 100)
 91 | 
 92 |     print "PCA complete for {} components. Explained variance: {}".\
 93 |             format(N_COMPS, np.sum(pca.explained_variance_ratio_))
 94 |     print reduced_data.shape
 95 |     print y.shape
 96 |     reduced_data_with_class = np.hstack((reduced_data,y))
 97 |     return reduced_data_with_class
 98 | 
 99 | def extract_features_and_class(data_with_class):
100 |     y = data_with_class[:,-1]
101 |     # Reshape into column vector instead of row
102 |     y_col = y.reshape(y.size,1)
103 |     n_columns = data_with_class.shape[1] - 1
104 |     data_no_class = data_with_class[:,0:n_columns]
105 |     return data_no_class, y_col
106 | 
107 | def get_ignored_images():
108 |     with open("failed_thres_img.txt") as f: 
109 |         a = map(lambda x: x.split("\n")[0], f.readlines())
110 |     return set(a)
111 | 
112 | @timer
113 | def build_and_write_dataset(dataset):
114 |     dataset_file = dataset + ".npy"
115 | 
116 |     ignore_set = get_ignored_images()
117 | 
118 |     dataset_matrix = build_dataset(dataset, ignore_set)
119 |     print "Saving {}".format(dataset_file)
120 |     np.save(dataset_file, dataset_matrix)
121 | 
122 | @timer
123 | def build_and_write_reduced_dataset(dataset):
124 |     reduced_dataset_file = dataset + "_reduced.npy"
125 |     reduced_matrix = reduce_dimensionality(dataset + '.npy')
126 |     print "Saving {}".format(reduced_dataset_file)
127 |     np.save(reduced_dataset_file, reduced_matrix)
128 | 
129 | def main(dataset):
130 |     build_and_write_dataset(dataset)
131 |     build_and_write_reduced_dataset(dataset)
132 | 
133 | if __name__ == "__main__":
134 |     if BATCH_SIZE < N_COMPS:
135 |         # See https://github.com/scikit-learn/scikit-learn/issues/6452
136 |         raise ValueError("Number of components must be < \
137 |                 batch size.")
138 |     main(OUTPUT_NAME)
139 | 
140 | 


--------------------------------------------------------------------------------
/successful_threshold_seperator.py:
--------------------------------------------------------------------------------
 1 | """ Images that failed thresholding tend to be blank, this just filters them into a new dataset. TODO add failures due to hair here. """
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | from PIL import Image
 6 | from shutil import copyfile 
 7 | 
 8 | THRES_DIR = "thresholded_final/out_final/"
 9 | DATA_DIR = "final/"
10 | OUT_DIR = "success_threshold_final/"
11 | PERCENTAGE_WHITE = 1
12 | 
13 | def main():
14 |     df = pd.read_csv("final.csv")
15 |     failed_img_notes = open("failed_thres_img.txt", 'w+')
16 |     for idx,row in df.iterrows():
17 |         print row[0]
18 |         thres_img = np.array(Image.open(THRES_DIR + row[0] + '_bin.bmp'))
19 |         if (image_is_not_blank(thres_img)):
20 |             copyfile(DATA_DIR + row[0] + ".jpg", OUT_DIR + row[0] + ".jpg")
21 |         else:
22 |             failed_img_notes.write(row[0] + ".jpg\n")
23 | 
24 |     failed_img_notes.close()
25 | 
26 | def image_is_not_blank(thres_img):
27 |     flattened = thres_img.flatten()
28 |     return np.sum(flattened) / float(flattened.size) * 100 > PERCENTAGE_WHITE
29 | 
30 | if __name__ == "__main__":
31 |     main()
32 | 


--------------------------------------------------------------------------------